diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 5e9901bb87c9a454a393a913b6da6e82266ee719..170e0f839719c71d56008abefb79c7814d0f3e76 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -350,6 +350,22 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.__init__ ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.delete ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.download ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)) +paddle.fluid.contrib.HDFSClient.is_dir ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.HDFSClient.is_exist ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.HDFSClient.ls ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.lsr ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)) +paddle.fluid.contrib.HDFSClient.make_local_dirs ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.makedirs ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.HDFSClient.rename ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.contrib.HDFSClient.upload ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)) +paddle.fluid.contrib.multi_download ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)) +paddle.fluid.contrib.multi_upload ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 779a9ed52365e66d8141f7e3a1183ef6d7832e4b..389366a8a98c5753268718c49c62c2dffe99c32f 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -131,9 +131,7 @@ std::shared_ptr BuildStrategy::CreatePassesFromStrategy( std::unique_ptr BuildStrategy::Apply( const ProgramDesc &main_program, const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶m_names, - const std::vector &local_scopes, + const std::string &loss_var_name, const std::vector &local_scopes, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else @@ -149,9 +147,6 @@ std::unique_ptr BuildStrategy::Apply( pass->SetNotOwned>("places", &places); pass->Erase("loss_var_name"); pass->SetNotOwned("loss_var_name", &loss_var_name); - pass->Erase("params"); - pass->SetNotOwned>("params", - ¶m_names); pass->Erase("local_scopes"); pass->SetNotOwned>("local_scopes", &local_scopes); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 29396501dc0efedd31a42b77f915fd66c9943985..11db184cb4efe349a340aceb4b7e1e3f4d4b24a5 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -106,16 +106,15 @@ struct BuildStrategy { // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. - std::unique_ptr Apply( - const ProgramDesc &main_program, - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶m_names, - const std::vector &local_scopes, + std::unique_ptr Apply(const ProgramDesc &main_program, + const std::vector &places, + const std::string &loss_var_name, + const std::vector &local_scopes, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; + const bool use_cuda, + platform::NCCLContextMap *nccl_ctxs) const; #else - const bool use_cuda) const; + const bool use_cuda) const; #endif private: diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 8af1d62dea89343ff2d41dd7c6ac837459df7685..7e320a08942e4a9a27e6b5c892a993b3a90c43a4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -130,7 +130,6 @@ void AddOutputToLeafOps(ir::Graph *graph) { static const char kLossVarName[] = "loss_var_name"; static const char kPlaces[] = "places"; -static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; static const char kNumTrainers[] = "num_trainers"; @@ -147,9 +146,6 @@ void MultiDevSSAGraphBuilder::Init() const { nccl_ctxs_ = &Get("nccl_ctxs"); #endif - for (auto &p : Get>(kParams)) { - grad_names_.insert(GradVarName(p)); - } balance_vars_.resize(places_.size(), 0); if (strategy_.enable_data_balance_ && places_.size() == 1) { LOG(WARNING) << "It is no need to enable data balance when there is only " @@ -359,7 +355,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( BuildStrategy::GradientScaleStrategy::kCustomized) { // TODO(paddle-dev): Why is there no input for this op_handle? auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]); + auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); + CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], + out_dtype); } // This assumes the backward generating code will ensure IsScaleLossOp // is true only for the op that scale the final scalar loss. @@ -662,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node) const { + ir::Node *out_var_node, proto::VarType::Type dtype) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx); + local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -896,7 +894,6 @@ REGISTER_PASS(multi_devices_pass, paddle::framework::details::MultiDevSSAGraphBuilder) .RequirePassAttr(paddle::framework::details::kLossVarName) .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kParams) .RequirePassAttr(paddle::framework::details::kLocalScopes) .RequirePassAttr(paddle::framework::details::kStrategy) .RequirePassAttr(paddle::framework::details::kNumTrainers); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 8e462aec7dc7ce45cad592b89de0b6edde8c9146..5736102ddc13418446013307cf8204b677f960dc 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node) const; + ir::Node *out_var_node, + proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; @@ -102,7 +103,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; - mutable std::unordered_set grad_names_; mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index ef1626599795a553e654fe5d3ed74ef3a3a67d78..e1b8e8fe05f0615d689e78d9c405cc5d76d2abb1 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -22,39 +22,66 @@ namespace details { ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, platform::Place place, - platform::DeviceContext *dev_ctx) + platform::DeviceContext *dev_ctx, + proto::VarType::Type dtype) : OpHandleBase(node), coeff_(static_cast(1.0 / num_dev)), scope_(scope), - place_(place) { + place_(place), + out_dtype_(dtype) { this->SetDeviceContext(place_, dev_ctx); } ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} +struct ScaleLossGradFunctor { + float coeff_; + Tensor *out_; + platform::Place place_; + OpHandleBase *op_handle_; + proto::VarType::Type out_dtype_; + platform::DeviceContext *ctx_; + + ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place, + OpHandleBase *op_handle, proto::VarType::Type dtype, + platform::DeviceContext *ctx) + : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {} + + template + void apply() const { + auto *out_data = out_->mutable_data(place_); + if (platform::is_cpu_place(place_)) { + *out_data = static_cast(coeff_); + } else { +#ifdef PADDLE_WITH_CUDA + OutT cast_coeff = static_cast(coeff_); + auto stream = static_cast(ctx_)->stream(); + memory::Copy(boost::get(place_), out_data, + platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_), + stream); + VLOG(10) << place_ << "RUN Scale loss grad op"; + +#endif + } + } +}; + void ScaleLossGradOpHandle::RunImpl() { // Doesn't wait any event std::string var_name = static_cast(this->outputs_[0])->name_; auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get(); - float *tmp = local_scope.FindVar(var_name) - ->GetMutable() - ->mutable_data(make_ddim({1}), place_); + auto *tensor = local_scope.FindVar(var_name)->GetMutable(); + tensor->Resize(make_ddim({1})); - if (platform::is_cpu_place(place_)) { - *tmp = coeff_; - } else { #ifdef PADDLE_WITH_CUDA - this->RunAndRecordEvent([&] { - auto stream = static_cast( - this->dev_ctxes_.at(place_)) - ->stream(); - memory::Copy(boost::get(place_), tmp, - platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(10) << place_ << "RUN Scale loss grad op"; - }); + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, + this->dev_ctxes_.at(place_)); + this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); +#else + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr); + framework::VisitDataType(out_dtype_, func); #endif - } } std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 523b55724c82d4e2bef0520c10e5708c952a3ecc..8bedd1643eb9c5e591fa3c40995fcba08980b9fa 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -26,8 +26,8 @@ namespace details { struct ScaleLossGradOpHandle : public OpHandleBase { ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, - platform::Place place, - platform::DeviceContext *context); + platform::Place place, platform::DeviceContext *context, + proto::VarType::Type dtype); ~ScaleLossGradOpHandle() final; @@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase { float coeff_; Scope *scope_; platform::Place place_; + proto::VarType::Type out_dtype_; }; } // namespace details diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc index 5376fc163e259e5049955052baf02fd614aa511e..a8029e67e659a269f8492cf6e2f1f09040144283 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -24,35 +24,6 @@ namespace paddle { namespace framework { namespace ir { -// The function keeps the graph consistent by replacing -// a node 'from' in the set of inputs nodes -// of the visited node by a node 'to'. -void CorrectGraphEdges(Graph* graph, Node* from, Node* to) { - for (auto& node : GraphTraits::DFS(*graph)) { - auto from_in_inputs = - std::find(std::begin(node.inputs), std::end(node.inputs), from); - - if (from_in_inputs != std::end(node.inputs)) { - IR_NODE_LINK_TO(to, (&node)); - - auto inputs = node.Op()->Inputs(); - - using input_type = VariableNameMap::value_type; - - std::for_each(std::begin(inputs), std::end(inputs), - [from, to, &node](const input_type& i) -> void { - auto param_names = i.second; - auto pi = std::find(std::begin(param_names), - std::end(param_names), from->Name()); - - if (pi != std::end(param_names)) { - node.Op()->SetInput(i.first, {to->Name()}); - } - }); - } - } -} - bool IsReachable(ir::Graph* graph, Node* from, Node* to) { auto find_node = [](ir::Graph* graph, const Node* node) -> Node* { for (auto n : graph->Nodes()) { @@ -99,25 +70,12 @@ bool IsReachable(ir::Graph* graph, Node* from, Node* to) { return false; } -boost::optional HasBias(const Node& op, const std::string& bias_name) { - auto bias_input_names = op.Op()->Inputs(); - auto bias_it = bias_input_names.find(bias_name); - - if (bias_it != std::end(bias_input_names)) { - bool has_bias = !bias_it->second.empty(); - - if (has_bias) { - auto bias_names = bias_it->second; - auto bias_names_it = - std::find_if(std::begin(op.inputs), std::end(op.inputs), - [&bias_names](Node* n) -> bool { - return n->Name() == bias_names[0]; - }); - return *bias_names_it; - } - } - - return boost::none; +template +boost::optional HasAttribute(const Node& op, const std::string& attr) { + if (op.Op()->HasAttr(attr)) + return boost::get(op.Op()->GetAttr(attr)); + else + return boost::none; } ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( @@ -151,40 +109,18 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - OpDesc op_desc; - op_desc.SetType("conv2d"); - - op_desc.SetInput("Input", {conv_input->Name()}); - op_desc.SetInput("Filter", {conv_filter->Name()}); - op_desc.SetInput("ResidualData", {elementwise_add_identity->Name()}); - op_desc.SetOutput("Output", {conv_output->Name()}); + auto fuse_relu = HasAttribute(*conv_op, "fuse_relu"); + if (fuse_relu && *fuse_relu) return; - auto conv_bias = HasBias(*conv_op, "Bias"); + conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); - if (conv_bias) { - op_desc.SetInput("Bias", {(*conv_bias)->Name()}); - } - - for (const auto& attr : conv_op->Op()->GetAttrMap()) { - op_desc.SetAttr(attr.first, attr.second); - } - - op_desc.SetAttr("fuse_residual_connection", true); + GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op}); - auto fused_conv_op = graph->CreateOpNode(&op_desc); - - IR_NODE_LINK_TO(conv_input, fused_conv_op); - IR_NODE_LINK_TO(conv_filter, fused_conv_op); - IR_NODE_LINK_TO(elementwise_add_identity, fused_conv_op); - IR_NODE_LINK_TO(fused_conv_op, conv_output); - - if (conv_bias) { - IR_NODE_LINK_TO((*conv_bias), fused_conv_op); - } + IR_NODE_LINK_TO(elementwise_add_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); - CorrectGraphEdges(graph, elementwise_add_out, conv_output); - GraphSafeRemoveNodes(graph, - {elementwise_add_out, conv_op, elementwise_add_op}); (*fusion_stats)++; } @@ -229,60 +165,33 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( Node* projection_node; Node* residual_conv_op; - Node* residual_conv_input; - Node* residual_conv_filter; Node* residual_conv_output; if (IsReachable(graph, conv_x_input, conv_y_output)) { projection_node = conv_x_output; residual_conv_op = conv_y_op; - residual_conv_input = conv_y_input; - residual_conv_filter = conv_y_filter; residual_conv_output = conv_y_output; } else if (IsReachable(graph, conv_y_input, conv_x_output)) { projection_node = conv_y_output; residual_conv_op = conv_x_op; - residual_conv_input = conv_x_input; - residual_conv_filter = conv_x_filter; residual_conv_output = conv_x_output; } else { return; } - OpDesc op_desc; - op_desc.SetType("conv2d"); + auto fuse_relu = HasAttribute(*residual_conv_op, "fuse_relu"); + if (fuse_relu && *fuse_relu) return; - op_desc.SetInput("Input", {residual_conv_input->Name()}); - op_desc.SetInput("Filter", {residual_conv_filter->Name()}); - op_desc.SetInput("ResidualData", {projection_node->Name()}); - op_desc.SetOutput("Output", {residual_conv_output->Name()}); + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - auto residual_conv_bias = HasBias(*residual_conv_op, "Bias"); + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - if (residual_conv_bias) { - op_desc.SetInput("Bias", {(*residual_conv_bias)->Name()}); - } - - for (const auto& attr : residual_conv_op->Op()->GetAttrMap()) { - op_desc.SetAttr(attr.first, attr.second); - } - - op_desc.SetAttr("fuse_residual_connection", true); + GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); - auto fused_conv_op = graph->CreateOpNode(&op_desc); - - IR_NODE_LINK_TO(residual_conv_input, fused_conv_op); - IR_NODE_LINK_TO(residual_conv_filter, fused_conv_op); - IR_NODE_LINK_TO(projection_node, fused_conv_op); - IR_NODE_LINK_TO(fused_conv_op, residual_conv_output); - - if (residual_conv_bias) { - IR_NODE_LINK_TO((*residual_conv_bias), fused_conv_op); - } + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); - CorrectGraphEdges(graph, elementwise_add_out, residual_conv_output); - GraphSafeRemoveNodes( - graph, {elementwise_add_out, residual_conv_op, elementwise_add_op}); (*fusion_stats)++; } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index dde642764fa5dfce11edcef51ad1be11be331fbc..2fe1c94ec02e8ff0a4acb81868ba2124ea89e506 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -110,22 +110,125 @@ class CompileTimeInferShapeContext : public InferShapeContext { } } + std::vector GetInputVarPtrs( + const std::string &name) override { + const std::vector arg_names = Inputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { + return block_.FindVarRecursive(name); + }); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string &name) override { + const std::vector arg_names = Outputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { + return block_.FindVarRecursive(name); + }); + return res; + } + + DDim GetInputDim(const std::string &name) const override { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + return this->GetDim(arg_names[0]); + } + + std::vector GetInputsDim(const std::string &name) const override { + const std::vector &arg_names = Inputs(name); + return GetDims(arg_names); + } + bool IsRuntime() const override; + std::vector GetInputsVarType( + const std::string &name) const override { + return GetVarTypes(Inputs(name)); + } + + std::vector GetOutputsVarType( + const std::string &name) const override { + return GetVarTypes(Outputs(name)); + } + + void SetOutputDim(const std::string &name, const DDim &dim) override { + auto &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + SetDim(arg_names[0], dim); + } + + void SetOutputsDim(const std::string &name, + const std::vector &dims) override { + auto &names = Outputs(name); + SetDims(names, dims); + } + protected: - proto::VarType::Type GetVarType(const std::string &name) const override; + std::vector GetVarTypes( + const std::vector &names) const { + std::vector retv; + retv.resize(names.size()); + std::transform( + names.begin(), names.end(), retv.begin(), + std::bind(std::mem_fn(&CompileTimeInferShapeContext::GetVarType), this, + std::placeholders::_1)); + return retv; + } + + proto::VarType::Type GetVarType(const std::string &name) const; + + DDim GetDim(const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + DDim res; + try { + auto shape = var->GetShape(); + res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); + } catch (...) { + VLOG(5) << "GetDim of variable " << name << " error"; + std::rethrow_exception(std::current_exception()); + } + return res; + } - DDim GetDim(const std::string &name) const override; + std::vector GetDims(const std::vector &names) const { + std::vector ret; + ret.reserve(names.size()); + std::transform( + names.begin(), names.end(), std::back_inserter(ret), + [this](const std::string &name) { return this->GetDim(name); }); + return ret; + } + + void SetDim(const std::string &name, const DDim &dim); - void SetDim(const std::string &name, const DDim &dim) override; + void SetDims(const std::vector &names, + const std::vector &dims) { + size_t length = names.size(); + PADDLE_ENFORCE_EQ(length, dims.size()); + for (size_t i = 0; i < length; ++i) { + if (names[i] == framework::kEmptyVarName) { + continue; + } + SetDim(names[i], dims[i]); + } + } std::vector GetRepeatedDims(const std::string &name) const override; void SetRepeatedDims(const std::string &name, const std::vector &dims) override; - InferShapeVarPtr GetVarPtr(const std::string &name) override; - const OpDesc &op_; const BlockDesc &block_; }; @@ -644,20 +747,6 @@ const std::vector &CompileTimeInferShapeContext::Outputs( return op_.Output(name); } -DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { - auto var = block_.FindVarRecursive(name); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); - DDim res; - try { - auto shape = var->GetShape(); - res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); - } catch (...) { - VLOG(5) << "GetDim of variable " << name << " error"; - std::rethrow_exception(std::current_exception()); - } - return res; -} - std::vector CompileTimeInferShapeContext::GetRepeatedDims( const std::string &name) const { auto var = block_.FindVarRecursive(name); @@ -696,10 +785,5 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType( return block_.FindVarRecursive(name)->GetType(); } -InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr( - const std::string &name) { - return block_.FindVarRecursive(name); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index abc0c22b6e090725ac25fa51fb4c523341ec9716..b2d4de5916ba89e6692e10ac7ebef53484b576ca 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -142,12 +142,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, const Scope& scope) { for (auto& var_name_item : innames) { std::vector& input_vars = inputs[var_name_item.first]; + input_vars.reserve(var_name_item.second.size()); for (auto& var_name : var_name_item.second) { input_vars.push_back(scope.FindVar(var_name)); } } for (auto& var_name_item : outnames) { std::vector& output_vars = outputs[var_name_item.first]; + output_vars.reserve(var_name_item.second.size()); for (auto& var_name : var_name_item.second) { output_vars.push_back(scope.FindVar(var_name)); } @@ -556,30 +558,28 @@ class RuntimeInferShapeContext : public InferShapeContext { bool HasOutput(const std::string& name) const override { // has only one output - const auto& outs = op_.Outputs(); + const auto& outs = ctx_.outputs; auto it = outs.find(name); if (it == outs.end()) { return false; } const auto& out = it->second; - if (out.size() == 0 || out[0] == kEmptyVarName) { + if (out.size() == 0) { return false; } PADDLE_ENFORCE_EQ(out.size(), 1UL, "Output %s should not have more than one outputs", name); - return scope_.FindVar(out[0]) != nullptr; + return out[0] != nullptr; } bool HasInputs(const std::string& name) const override { - if (!op_.HasInputs(name)) { - return false; - } - auto inputs = op_.Inputs(name); - if (inputs.empty()) { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { return false; } - for (auto& input : inputs) { - if (scope_.FindVar(input) == nullptr) { + for (auto& input : it->second) { + if (input == nullptr) { return false; } } @@ -587,15 +587,13 @@ class RuntimeInferShapeContext : public InferShapeContext { } bool HasOutputs(const std::string& name) const override { - if (!op_.HasOutputs(name)) { - return false; - } - auto outputs = op_.Outputs(name); - if (outputs.empty()) { + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end() || it->second.empty()) { return false; } - for (auto& output : outputs) { - if (scope_.FindVar(output) == nullptr) { + for (auto& output : it->second) { + if (output == nullptr) { return false; } } @@ -616,16 +614,18 @@ class RuntimeInferShapeContext : public InferShapeContext { void ShareDim(const std::string& in, const std::string& out, size_t i = 0, size_t j = 0) override { - PADDLE_ENFORCE_LT(i, Inputs(in).size()); - PADDLE_ENFORCE_LT(j, Outputs(out).size()); - const std::string& input_n = Inputs(in)[i]; - const std::string& output_n = Outputs(out)[j]; + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, + "Inputs %s should have %llu argument", in, i); + PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, + "Outputs %s should have %llu argument", out, j); + + Variable* in_var = in_it->second[i]; + Variable* out_var = out_it->second[j]; - Variable* in_var = scope_.FindVar(input_n); - Variable* out_var = scope_.FindVar(output_n); PADDLE_ENFORCE(in_var->Type() == out_var->Type(), - "The type of %s and %s is not the same.", output_n, - GetDim(input_n)); + "The type of %s and %s is not the same.", in, out); if (in_var->IsType()) { auto& in_sele_rows = in_var->Get(); @@ -646,13 +646,16 @@ class RuntimeInferShapeContext : public InferShapeContext { void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, size_t j = 0) const override { - const std::vector& inputs = Inputs(in); - const std::vector& outputs = Outputs(out); - PADDLE_ENFORCE_LT(i, inputs.size()); - PADDLE_ENFORCE_LT(j, outputs.size()); - Variable* in_var = scope_.FindVar(inputs.at(i)); + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, + "Inputs %s should have %llu argument", in, i); + PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, + "Outputs %s should have %llu argument", out, j); + + Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; - Variable* out_var = scope_.FindVar(outputs.at(j)); + Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE(out_var->IsType(), "The %d-th output of Output(%s) must be LoDTensor.", j, out); auto in_tensor = in_var->Get(); @@ -687,9 +690,64 @@ class RuntimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override { return true; } + // TODO(paddle-dev): Can this be template? + std::vector GetInputVarPtrs( + const std::string& name) override { + const std::vector& vars = InputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string& name) override { + const std::vector& vars = OutputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + DDim GetInputDim(const std::string& name) const override { + const std::vector& vars = InputVars(name); + PADDLE_ENFORCE_EQ(vars.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, vars.size()); + return this->GetDim(vars[0]); + } + + std::vector GetInputsDim(const std::string& name) const override { + const std::vector& vars = InputVars(name); + return GetDims(vars); + } + + std::vector GetInputsVarType( + const std::string& name) const override { + return GetVarTypes(InputVars(name)); + } + + std::vector GetOutputsVarType( + const std::string& name) const override { + return GetVarTypes(OutputVars(name)); + } + + void SetOutputDim(const std::string& name, const DDim& dim) override { + auto& vars = OutputVars(name); + PADDLE_ENFORCE_EQ(vars.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, vars.size()); + SetDim(vars[0], dim); + } + + void SetOutputsDim(const std::string& name, + const std::vector& dims) override { + auto& vars = OutputVars(name); + SetDims(vars, dims); + } + protected: - DDim GetDim(const std::string& name) const override { - Variable* var = scope_.FindVar(name); + DDim GetDim(Variable* var) const { PADDLE_ENFORCE_NOT_NULL(var); if (var->IsType()) { return var->Get().dims(); @@ -697,25 +755,44 @@ class RuntimeInferShapeContext : public InferShapeContext { return var->Get().GetCompleteDims(); } else { PADDLE_THROW( - "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " + "Only LoDTensor/SelectedRows support 'GetDim', but Variables " "type_id is %s.", - name, ToTypeName(var->Type())); + ToTypeName(var->Type())); } } + std::vector GetDims(const std::vector& vars) const { + std::vector ret; + ret.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + [this](Variable* var) { return this->GetDim(var); }); + return ret; + } + std::vector GetRepeatedDims(const std::string& name) const override { PADDLE_THROW("Only compile time support this method"); } - void SetDim(const std::string& name, const DDim& dim) override { - Variable* var = scope_.FindVar(name); + void SetDim(Variable* var, const DDim& dim) { if (var->IsType()) { var->GetMutable()->Resize(dim); } else if (var->IsType()) { var->GetMutable()->set_height(dim[0]); } else { - PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", - name, ToTypeName(var->Type())); + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + ToTypeName(var->Type())); + } + } + + void SetDims(const std::vector& vars, + const std::vector& dims) { + size_t length = vars.size(); + PADDLE_ENFORCE_EQ(length, dims.size()); + for (size_t i = 0; i < length; ++i) { + if (vars[i] == nullptr) { + continue; + } + SetDim(vars[i], dims[i]); } } @@ -724,16 +801,36 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_THROW("Only compile time support this method"); } - proto::VarType::Type GetVarType(const std::string& name) const override { - auto* var = scope_.FindVar(name); - return ToVarType(var->Type()); + std::vector GetVarTypes( + const std::vector& vars) const { + std::vector retv; + retv.resize(vars.size()); + std::transform(vars.begin(), vars.end(), retv.begin(), + std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), + this, std::placeholders::_1)); + return retv; } - InferShapeVarPtr GetVarPtr(const std::string& name) override { - return scope_.FindVar(name); + proto::VarType::Type GetVarType(Variable* var) const { + return ToVarType(var->Type()); } private: + const std::vector& InputVars(const std::string& name) const { + auto it = ctx_.inputs.find(name); + PADDLE_ENFORCE(it != ctx_.inputs.end(), + "Operator %s does not have the input %s.", op_.Type(), name); + return it->second; + } + + const std::vector& OutputVars(const std::string& name) const { + auto it = ctx_.outputs.find(name); + PADDLE_ENFORCE(it != ctx_.outputs.end(), + "Operator %s does not have the outputs %s.", op_.Type(), + name); + return it->second; + } + const OperatorBase& op_; const Scope& scope_; const RuntimeContext& ctx_; @@ -864,8 +961,7 @@ Scope* OperatorWithKernel::PrepareData( for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto& var_name = var_name_item.second[i]; - auto* var = scope.FindVar(var_name); - input_vars[i] = var; + auto* var = input_vars[i]; // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7e3fe02eaf5560ef03e42c6b82ed338edc30b0ab..a921f469f5e0276884fe194c99b15100a11113dc 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -190,7 +190,6 @@ std::vector &ParallelExecutor::GetLocalScopes() { ParallelExecutor::ParallelExecutor( const std::vector &places, - const std::unordered_set ¶ms, const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, @@ -209,7 +208,7 @@ ParallelExecutor::ParallelExecutor( "the number of places must be greater than 1."); } - // Step 1. Bcast the params to devs. + // Step 1. Bcast the bcast_vars to devs. // Create local scopes if (local_scopes.empty()) { member_->own_local_scope_ = true; @@ -249,12 +248,12 @@ ParallelExecutor::ParallelExecutor( // ncclOp #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr graph = build_strategy.Apply( - main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->use_cuda_, member_->nccl_ctxs_.get()); #else std::unique_ptr graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, - params, member_->local_scopes_, member_->use_cuda_); + member_->local_scopes_, member_->use_cuda_); #endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 1fc17a0d64d50eb70ce66cacd4752a5b96d5e894..5f6c2159aa2d90378ac298a8e56b51a188225d45 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -41,7 +41,6 @@ class ParallelExecutor { public: explicit ParallelExecutor(const std::vector &places, - const std::unordered_set ¶ms, const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc index ddff2c7c261746ac9986e79cff3da7e0a9654adc..4ac872ac3d3bf918678f5294a4c35097c3fb18ab 100644 --- a/paddle/fluid/framework/shape_inference.cc +++ b/paddle/fluid/framework/shape_inference.cc @@ -22,20 +22,6 @@ limitations under the License. */ namespace paddle { namespace framework { -DDim InferShapeContext::GetInputDim(const std::string &name) const { - const std::vector &arg_names = Inputs(name); - PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, - "Input(%s) should hold one element, but now it holds %d", - name, arg_names.size()); - return this->GetDim(arg_names[0]); -} - -std::vector InferShapeContext::GetInputsDim( - const std::string &name) const { - const std::vector &arg_names = Inputs(name); - return GetDims(arg_names); -} - std::vector InferShapeContext::GetReaderDims( const std::string &name) const { const std::vector &arg_names = Inputs(name); @@ -46,26 +32,6 @@ std::vector InferShapeContext::GetReaderDims( return this->GetRepeatedDims(arg_names[0]); } -DDim InferShapeContext::GetInputsElementDim(const std::string &name, - int idx) const { - const std::vector &names = Inputs(name); - return this->GetDim(names[idx]); -} - -void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) { - auto &arg_names = Outputs(name); - PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, - "Output(%s) should hold one element, but now it holds %d", - name, arg_names.size()); - SetDim(arg_names[0], dim); -} - -void InferShapeContext::SetOutputsDim(const std::string &name, - const std::vector &dims) { - auto &names = Outputs(name); - SetDims(names, dims); -} - void InferShapeContext::SetReaderDims(const std::string &name, const std::vector &dims) { const std::vector &arg_names = Outputs(name); @@ -76,69 +42,5 @@ void InferShapeContext::SetReaderDims(const std::string &name, return this->SetRepeatedDims(arg_names[0], dims); } -std::vector InferShapeContext::GetInputVarPtrs( - const std::string &name) { - const std::vector arg_names = Inputs(name); - std::vector res; - res.reserve(arg_names.size()); - std::transform( - arg_names.begin(), arg_names.end(), std::back_inserter(res), - [this](const std::string &name) { return this->GetVarPtr(name); }); - return res; -} - -std::vector InferShapeContext::GetOutputVarPtrs( - const std::string &name) { - const std::vector arg_names = Outputs(name); - std::vector res; - res.reserve(arg_names.size()); - std::transform( - arg_names.begin(), arg_names.end(), std::back_inserter(res), - [this](const std::string &name) { return this->GetVarPtr(name); }); - return res; -} - -std::vector InferShapeContext::GetDims( - const std::vector &names) const { - std::vector ret; - ret.reserve(names.size()); - std::transform( - names.begin(), names.end(), std::back_inserter(ret), - [this](const std::string &name) { return this->GetDim(name); }); - return ret; -} - -void InferShapeContext::SetDims(const std::vector &names, - const std::vector &dims) { - size_t length = names.size(); - PADDLE_ENFORCE_EQ(length, dims.size()); - for (size_t i = 0; i < length; ++i) { - if (names[i] == framework::kEmptyVarName) { - continue; - } - SetDim(names[i], dims[i]); - } -} - -std::vector InferShapeContext::GetInputsVarType( - const std::string &name) const { - return GetVarTypes(Inputs(name)); -} - -std::vector InferShapeContext::GetOutputsVarType( - const std::string &name) const { - return GetVarTypes(Outputs(name)); -} - -std::vector InferShapeContext::GetVarTypes( - const std::vector &names) const { - std::vector retv; - retv.resize(names.size()); - std::transform(names.begin(), names.end(), retv.begin(), - std::bind(std::mem_fn(&InferShapeContext::GetVarType), this, - std::placeholders::_1)); - return retv; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index d73cca121e41e68f9fb6548117ed91c5cc1415ca..824f75b3d3cfa03020182d2ea0b2970bdd6aeeca 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -33,22 +33,23 @@ class InferShapeContext { virtual bool HasInput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0; - std::vector GetInputsVarType( - const std::string &name) const; - std::vector GetOutputsVarType( - const std::string &name) const; + virtual std::vector GetInputsVarType( + const std::string &name) const = 0; + virtual std::vector GetOutputsVarType( + const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0; - DDim GetInputDim(const std::string &name) const; - std::vector GetInputsDim(const std::string &name) const; - std::vector GetReaderDims(const std::string &name) const; - DDim GetInputsElementDim(const std::string &name, int idx) const; + virtual DDim GetInputDim(const std::string &name) const = 0; + virtual std::vector GetInputsDim(const std::string &name) const = 0; + virtual std::vector GetReaderDims(const std::string &name) const; - void SetOutputDim(const std::string &name, const DDim &dim); - void SetOutputsDim(const std::string &name, const std::vector &dims); - void SetReaderDims(const std::string &name, const std::vector &dims); + virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; + virtual void SetOutputsDim(const std::string &name, + const std::vector &dims) = 0; + virtual void SetReaderDims(const std::string &name, + const std::vector &dims); virtual AttrReader Attrs() const = 0; virtual const std::vector &Inputs( @@ -67,27 +68,15 @@ class InferShapeContext { virtual bool IsRuntime() const = 0; - std::vector GetInputVarPtrs(const std::string &name); - std::vector GetOutputVarPtrs(const std::string &name); - virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0; - - // Note: In while op, we need this to be public - void SetDims(const std::vector &names, - const std::vector &dims); + virtual std::vector GetInputVarPtrs( + const std::string &name) = 0; + virtual std::vector GetOutputVarPtrs( + const std::string &name) = 0; protected: - virtual DDim GetDim(const std::string &name) const = 0; - virtual void SetDim(const std::string &name, const DDim &dim) = 0; virtual std::vector GetRepeatedDims(const std::string &name) const = 0; virtual void SetRepeatedDims(const std::string &name, const std::vector &dims) = 0; - - std::vector GetDims(const std::vector &names) const; - - std::vector GetVarTypes( - const std::vector &names) const; - - virtual proto::VarType::Type GetVarType(const std::string &name) const = 0; }; } // namespace framework diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 612503768079472ba233ee3fcd43a47fdba9a0cc..342cb68ab2bf8ceb543317ed8d8f2356ef6b2cde 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -188,11 +188,13 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { std::vector ret; for (size_t i = 0; i < input_vars_->size(); ++i) { bool found = false; + VarBase* origin_var = (*input_vars_)[i]; for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { Variable* var = scope->FindVar(outvar); - VarBase* origin_var = (*input_vars_)[i]; std::string orig_var = grad_to_var_->at(outvar); - PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + if (origin_var->var_desc_->Name() != orig_var) { + continue; + } VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; origin_var->ApplyGrad(scope, var); found = true; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 433d07c0e5aa0986ab1e9fe349ef865d2851c0c0..97772dc110135d9d2533e1574933d49f7c8cd346 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc, class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + explicit Tracer(framework::BlockDesc* root_block, + framework::BlockDesc* startup_block) + : root_block_(root_block), startup_block_(startup_block) { root_scope_ = new framework::Scope(); scopes_[root_block_] = root_scope_; + scopes_[startup_block_] = root_scope_; } virtual ~Tracer() { delete root_scope_; } @@ -80,6 +83,8 @@ class Tracer { } else { op->pre_ops_->push_back(nullptr); } + VLOG(3) << "input vname " << vname << " " + << var->Get().dims().size(); } *op->output_vars_ = outputs; @@ -98,12 +103,19 @@ class Tracer { outputs[i]->pre_op_ = op; outputs[i]->pre_op_out_idx_ = i; } + + VLOG(3) << "tracer running " << op_desc->Type(); op_base->Run(*scope, platform::CPUPlace()); - framework::OpDesc* grad_op_desc; - auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); - op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; + if (block == startup_block_) { + op->grad_op_desc_ = nullptr; + op->grad_to_var_ = nullptr; + } else { + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + } op->block_ = block; } @@ -121,6 +133,7 @@ class Tracer { private: std::map scopes_; framework::BlockDesc* root_block_; + framework::BlockDesc* startup_block_; framework::Scope* root_scope_; }; diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 9b5eda17faecce63964954448211238dfccdc5e6..0360cf5273591946570cac47e2578e43f498b550 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -398,26 +398,41 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { ctx->HasInputs(kOutputs); ctx->HasInputs(framework::GradVarName(kOutputs)); - auto p_names = ctx->Inputs(kX); auto pg_ig_names = ctx->Outputs(kXGRAD); - auto var_types = ctx->GetInputsVarType(kX); - std::vector names_to_set; - std::vector dims_to_set; - for (size_t i = 0; i < p_names.size(); ++i) { + std::vector in_var_ptrs = + ctx->GetInputVarPtrs(kX); + std::vector out_var_ptrs = + ctx->GetOutputVarPtrs(kXGRAD); + PADDLE_ENFORCE(in_var_ptrs.size() == out_var_ptrs.size()); + + for (size_t i = 0; i < in_var_ptrs.size(); ++i) { if (pg_ig_names[i] == framework::kEmptyVarName) { continue; } - auto dims = ctx->GetInputsElementDim(kX, i); - if (var_types[i] == framework::proto::VarType::LOD_TENSOR) { - names_to_set.push_back(pg_ig_names[i]); - dims_to_set.push_back(dims); - } else if (var_types[i] == framework::proto::VarType::LOD_TENSOR_ARRAY) { - // not sure how to set the dim of LOD_TENSOR_ARRAY - names_to_set.push_back(pg_ig_names[i]); - dims_to_set.push_back(dims); + if (ctx->IsRuntime()) { + framework::Variable *in_var = + boost::get(in_var_ptrs[i]); + framework::Variable *out_var = + boost::get(out_var_ptrs[i]); + + auto type = framework::ToVarType(in_var->Type()); + if (type == framework::proto::VarType::LOD_TENSOR) { + out_var->GetMutable()->Resize( + in_var->Get().dims()); + } else if (type == framework::proto::VarType::SELECTED_ROWS) { + out_var->GetMutable()->set_height( + in_var->Get().GetCompleteDims()[0]); + } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { + PADDLE_THROW("WhileGradOp doesn't support type %d", + static_cast(type)); + } + } else { + framework::VarDesc *in_var = + boost::get(in_var_ptrs[i]); + boost::get(out_var_ptrs[i]) + ->SetShape(in_var->GetShape()); } } - ctx->SetDims(names_to_set, dims_to_set); } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 154ff2bb209bb8f932c06caa319223ccf3314767..8c116c4abfe42296b616dc536821e9be55a8be84 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -155,11 +155,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - if (is_conv3d) { - chosen_memory_format = - platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); + weights_format = mkldnn::memory::format::any; + // Check the format for user's special output + if (chosen_memory_format != mkldnn::memory::format::any) { + if (is_conv3d) { + chosen_memory_format = + platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); + } } - weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d); auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -435,11 +438,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - if (is_conv3d) { - chosen_memory_format = - platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); + weights_format = mkldnn::memory::format::any; + // Check the format for user's special output + if (chosen_memory_format != mkldnn::memory::format::any) { + if (is_conv3d) { + chosen_memory_format = + platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); + } } - weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d); auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 78956c9ea4942098002dd30e6f2f471ae49ab8d1..8c54159a41e3361322d0fa7ce36534447680207d 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "glog/logging.h" // For VLOG @@ -420,7 +421,15 @@ void GRPCClient::Proceed() { sync_cond_.notify_all(); } } - VLOG(3) << "GRPCClient Proceed end"; + + // Last log message + // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a + // static Mutex log_mutex is used for synchronization, which might have been + // destructed at this moment. + if (FLAGS_v >= 3) { + std::string msg("GRPCClient Proceed end"); + fwrite(msg.c_str(), msg.length(), 1, stdout); + } } std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 1a149298fd33f132a90ff5de3b35dd5894a4ae68..ae669f5525443abe424109b6a6869e2ddaf52ba0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_div, ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + elementwise_mul, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 95381774606b2d8e74519befc9a6f7a3ac20aa45..e80a703c30c0335124c089ea82ba4f6fe055acde 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( @@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu index b255d2a7c413b4f965f6b874d342dcb93c7b5e66..4682940f7e15bc8af5dcda24ea058ac7351887c6 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ b/paddle/fluid/operators/metrics/accuracy_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/metrics/accuracy_op.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { @@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. // label data is always int64 -REGISTER_OP_CUDA_KERNEL(accuracy, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + accuracy, paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 8a111e6065b102fd177b9e313cd87dcf8c22b669..271428408cb26296ff318bb39414ad0e8ecc0ac8 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -49,7 +49,8 @@ class MulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( y_dims.size(), y_num_col_dims, "The input tensor Y's rank of MulOp should be larger than " - "y_num_col_dims."); + "y_num_col_dims: %ld vs %ld", + y_dims.size(), y_num_col_dims); auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); diff --git a/paddle/fluid/operators/optimizers/momentum_op.cu b/paddle/fluid/operators/optimizers/momentum_op.cu index 8ce739de8dfd74cb43f9521bf39e3127a8a21925..7f9e7246401bc3c765e539ac4395c4feef3c9508 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cu +++ b/paddle/fluid/operators/optimizers/momentum_op.cu @@ -14,8 +14,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/optimizers/momentum_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( momentum, ops::MomentumOpKernel, - ops::MomentumOpKernel); + ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 84955d3f04308538b9a7b09beff35bc06c34bdc3..3ed1bff5ff4993e9c858dea8d56a8cb6124aca89 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -237,7 +237,8 @@ class SparseMomentumFunctor { inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); - T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); // put memory access in register const T p = p_[i]; const T lr = lr_[0]; @@ -282,7 +283,8 @@ class SparseMomentumFunctor { inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); - T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); // put memory access in register const T p = p_[i]; const T lr = lr_[0]; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 0cad224ca8860b0e4bc2e3f2bc1659235aadfe2d..99a4b1b7b0b33aebd9a1a49b0b771fe6fd134bb3 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - (*beam)) { topk[k] = topk[k + *beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-static_cast(INFINITY), -1); } } if (!(*is_empty)) { @@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, } *max = topk[MaxLength - 1]; - if ((*max).v == -1) *is_empty = true; + if ((*max).v == -static_cast(1)) *is_empty = true; *beam = 0; } } @@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - *beam) { topk[k] = topk[k + *beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-static_cast(INFINITY), -1); } } if (!(*is_empty)) { @@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, bool firststep = true; for (int j = 0; j < MaxLength; j++) { - topk[j].set(-INFINITY, -1); + topk[j].set(-static_cast(INFINITY), -1); } while (top_num) { ThreadGetTopK( @@ -362,5 +363,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + top_k, paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index cbb090adefda03717a634dab24132d36d1cfc648..6ce4bf8f13922e2756c3ee8f189bd36123d6964c 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" #define NCCL_ID_VARNAME "NCCLID" @@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclInt; } else if (type == framework::proto::VarType::INT64) { return ncclInt64; + } else if (type == framework::proto::VarType::FP16) { + return ncclFloat16; } else { PADDLE_THROW("Not supported"); } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 34e9c897d9e95feb185083b7c0a6a824d8dc809c..be63fb877869b64035207342e5d4398e481dbc99 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -24,8 +24,9 @@ namespace pybind { void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { - new (&self) imperative::Tracer(root_block); + [](imperative::Tracer &self, framework::BlockDesc *root_block, + framework::BlockDesc *startup_block) { + new (&self) imperative::Tracer(root_block, startup_block); }) .def("trace", &imperative::Tracer::Trace) .def("get_scope", &imperative::Tracer::GetScope, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 737ae2dd9c3451a0c9aabd31a5ec05b908356c98..f8a5c9deb066048ff645191c30b8c3dbf1d7eef5 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -977,7 +977,6 @@ All parameter, weight, gradient are variables in Paddle. cannot be updated after being finalized.)DOC"); pe.def(py::init &, - const std::unordered_set &, const std::unordered_set &, const ProgramDesc &, const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index b2c3e7c989c6e7d947055a1f907ddb439445eac5..6303be003a701e57a8aa1e2f925459f416cdb543 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -489,8 +489,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, grad_to_var = dict() op_desc = _create_op_desc_( - "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, { - "shape": [1], + "fill_constant", + {}, + {"Out": [_append_grad_suffix_(loss.name)]}, + { + "shape": [1], # TODO(panyx0718): This can be loss.shape. "value": 1.0, "dtype": loss.dtype, "force_cpu": False, diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 3bf2fe5db0cb2126295ebfda822eeac8762dbdb7..ece97b661fd7d60f8822439a84ee4403b9e3d81c 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -22,9 +22,12 @@ from . import op_frequence from .op_frequence import * from . import quantize from .quantize import * +from . import utils +from .utils import * __all__ = [] __all__ += decoder.__all__ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ __all__ += quantize.__all__ +__all__ += utils.__all__ diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py index 20b2cc381aaa1b837ce106410246bc8cedb2fc88..1c1c2fb22709189ca03dc543ca551257c8031c1a 100644 --- a/python/paddle/fluid/contrib/utils/__init__.py +++ b/python/paddle/fluid/contrib/utils/__init__.py @@ -13,10 +13,11 @@ # limitations under the License. from __future__ import print_function -#from . import lookup_table_utils -#from .lookup_table_utils import * +from . import lookup_table_utils +from .lookup_table_utils import * from . import hdfs_utils from .hdfs_utils import * -#__all__ = lookup_table_utils.__all__ -__all__ = hdfs_utils.__all__ +__all__ = [] +__all__ += lookup_table_utils.__all__ +__all__ += hdfs_utils.__all__ diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index baea57ccce0e9ca3a8fab244e43a107a89cfe67d..35ddf97ff2361d8abd34b16761be99990fc3880d 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -14,6 +14,7 @@ """HDFS Utils""" import os +import sys import subprocess import multiprocessing from datetime import datetime @@ -24,7 +25,7 @@ import errno import logging -__all__ = ["HDFSClient", "multi_download"] +__all__ = ["HDFSClient", "multi_download", "multi_upload"] logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') _logger = logging.getLogger("hdfs_utils") @@ -93,13 +94,15 @@ class HDFSClient(object): def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): """ - upload the local file to hdfs - Args: - hdfs_path: hdfs path, target path - local_path: local file path, source path - overwrite: will overwrite the original file - retry_times: max times retry to upload - Returns: + upload the local file to hdfs + + Args: + hdfs_path(str): the hdfs file path + local_path(str): the local file path + overwrite(bool|None): will overwrite the file on HDFS or not + retry_times(int|5): retry times + + Returns: True or False """ assert hdfs_path is not None @@ -109,7 +112,7 @@ class HDFSClient(object): _logger.warn( "The Local path: {} is dir and I will support it later, return". format(local_path)) - return + return False base = os.path.basename(local_path) if not self.is_exist(hdfs_path): @@ -141,14 +144,16 @@ class HDFSClient(object): def download(self, hdfs_path, local_path, overwrite=False, unzip=False): """ - download from hdfs - Args: - hdfs_path: hdfs path, target path - local_path: local file path, source path - overwrite: will remove original file and overwrite it. - unzip: ignore this param - Returns - True or False + download file from HDFS + + Args: + hdfs_path(str): the hdfs file path + local_path(str): the local file path + overwrite(bool|None): will overwrite the file on HDFS or not + unzip(bool|False): if the download file is compressed by zip, unzip it or not. + + Returns: + True or False """ _logger.info('Downloading %r to %r.', hdfs_path, local_path) _logger.info('Download of %s to %r complete.', hdfs_path, local_path) @@ -188,13 +193,13 @@ class HDFSClient(object): def is_exist(self, hdfs_path=None): """ - whether the remote hdfs path exists? - Args: - hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) - fs_name: The default values are the same as in the job configuration - fs_ugi: The default values are the same as in the job configuration - Returns: - True or False + whether the remote HDFS path exists + + Args: + hdfs_path(str): the hdfs file path + + Returns: + True or False """ exist_cmd = ['-test', '-e', hdfs_path] returncode, output, errors = self.__run_hdfs_cmd( @@ -211,13 +216,13 @@ class HDFSClient(object): def is_dir(self, hdfs_path=None): """ - whether the remote hdfs path exists? - Args: - remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) - fs_name: The default values are the same as in the job configuration - fs_ugi: The default values are the same as in the job configuration - Returns: - True or False + whether the remote HDFS path is directory + + Args: + hdfs_path(str): the hdfs file path + + Returns: + True or False """ if not self.is_exist(hdfs_path): @@ -237,17 +242,17 @@ class HDFSClient(object): def delete(self, hdfs_path): """ - Remove a file or directory from HDFS. + Remove a file or directory from HDFS. + + whether the remote HDFS path exists Args: - param hdfs_path: HDFS path. - param recursive: Recursively delete files and directories. By default, - this method will raise an :class:`HdfsError` if trying to delete a - non-empty directory. + hdfs_path: HDFS path. + Returns: + True or False This function returns `True` if the deletion was successful and `False` if no file or directory previously existed at `hdfs_path`. - """ _logger.info('Deleting %r.', hdfs_path) @@ -273,16 +278,14 @@ class HDFSClient(object): def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False): """ - Rename a file or folder. - Args: - :param hdfs_src_path: Source path. - :param hdfs_dst_path: Destination path. If the path already exists and is - a directory, the source will be moved into it. If the path exists and is - a file, or if a parent destination directory is missing, this method will - raise an :class:`HdfsError`. + Move a file or folder on HDFS. + + Args: + hdfs_path(str): HDFS path. + overwrite(bool|False): If the path already exists and overwrite is False, will return False. + Returns: - This function returns `True` if the rename was successful and `False` if - rename was faild. + True or False """ assert hdfs_src_path is not None assert hdfs_dst_path is not None @@ -320,17 +323,20 @@ class HDFSClient(object): raise def makedirs(self, hdfs_path): - """Create a remote directory, recursively if necessary. + """ + Create a remote directory, recursively if necessary. + Args: - :param hdfs_path: Remote path. Intermediate directories will be created - appropriately. + hdfs_path(str): Remote path. Intermediate directories will be created appropriately. + Returns: - True if make a directories was successful, False when make a directiries was failed. + True or False """ _logger.info('Creating directories to %r.', hdfs_path) assert hdfs_path is not None if self.is_exist(hdfs_path): + _logger.error("HDFS path is exist: {}".format(hdfs_path)) return mkdirs_commands = ['-mkdir', hdfs_path] @@ -346,11 +352,13 @@ class HDFSClient(object): def ls(self, hdfs_path): """ - ls a hdfs_path. - Args: - :param hdfs_path: hdfs_path will be ls. + ls directory contents about HDFS hdfs_path + + Args: + hdfs_path(str): Remote HDFS path will be ls. + Returns: - This function returns a `list` that contaion all files in the hdfs_path. + List: a contents list about hdfs_path. """ assert hdfs_path is not None @@ -378,11 +386,15 @@ class HDFSClient(object): def lsr(self, hdfs_path, only_file=True, sort=True): """ - ls a hdfs_path sort by time. - Args: - :param hdfs_path: hdfs_path will be ls. + list directory contents about HDFS hdfs_path recursively + + Args: + hdfs_path(str): Remote HDFS path. + only_file(bool|True): will discard folders. + sort(bool|True): will be sorted by create time. + Returns: - This function returns a `list` that contaion all files sorted by time in the hdfs_path. + List: a contents list about hdfs_path. """ def sort_by_time(v1, v2): @@ -422,21 +434,106 @@ class HDFSClient(object): return ret_lines +def multi_download(client, + hdfs_path, + local_path, + trainer_id, + trainers, + multi_processes=5): + """ + Download files from HDFS using multi process. + + Args: + client(HDFSClient): instance of HDFSClient + hdfs_path(str): path on hdfs + local_path(str): path on local + trainer_id(int): current trainer id + trainers(int): all trainers number + multi_processes(int|5): the download data process at the same time, default=5 + + Returns: + List: + Download files in local folder. + """ + + def __subprocess_download(datas): + for data in datas: + re_path = os.path.relpath(os.path.dirname(data), hdfs_path) + if re_path == os.curdir: + sub_local_re_path = local_path + else: + sub_local_re_path = os.path.join(local_path, re_path) + client.download(data, sub_local_re_path) + + assert isinstance(client, HDFSClient) + + client.make_local_dirs(local_path) + _logger.info("Make local dir {} successfully".format(local_path)) + + all_need_download = client.lsr(hdfs_path, sort=True) + need_download = all_need_download[trainer_id::trainers] + _logger.info("Get {} files From all {} files need to be download from {}". + format(len(need_download), len(all_need_download), hdfs_path)) + + _logger.info("Start {} multi process to download datas".format( + multi_processes)) + procs = [] + for i in range(multi_processes): + process_datas = need_download[i::multi_processes] + p = multiprocessing.Process( + target=__subprocess_download, args=(process_datas, )) + procs.append(p) + p.start() + + # complete the processes + for proc in procs: + proc.join() + + _logger.info("Finish {} multi process to download datas".format( + multi_processes)) + + local_downloads = [] + for data in need_download: + data_name = os.path.basename(data) + re_path = os.path.relpath(os.path.dirname(data), hdfs_path) + if re_path == os.curdir: + local_re_path = os.path.join(local_path, data_name) + else: + local_re_path = os.path.join(local_path, re_path, data_name) + local_downloads.append(local_re_path) + + return local_downloads + + +def getfilelist(path): + rlist = [] + for dir, folder, file in os.walk(path): + for i in file: + t = os.path.join(dir, i) + rlist.append(t) + for r in rlist: + print(r) + + def multi_upload(client, hdfs_path, local_path, multi_processes=5, - overwrite=False): + overwrite=False, + sync=True): """ - Upload file to hdfs. + Upload files to HDFS using multi process. + Args: - :param overwrite: will overwrite hdfs file or not - :param multi_processes: the upload data process at the same time, default=5 - :param client: instance of HDFSClient - :param hdfs_path: path on hdfs - :param local_path: path on local + client(HDFSClient): instance of HDFSClient + hdfs_path(str): path on hdfs + local_path(str): path on local + multi_processes(int|5): the upload data process at the same time, default=5 + overwrite(bool|False): will overwrite file on HDFS or not + sync(bool|True): upload files sync or not. + Returns: - + None """ def __subprocess_upload(datas): @@ -446,13 +543,6 @@ def multi_upload(client, client.upload(hdfs_re_path, data, overwrite, retry_times=5) def get_local_files(path): - """ - Get all local files - Args: - path: local file path - Returns: - A list that contation all files in the path. - """ rlist = [] if not os.path.isdir(path): @@ -488,71 +578,6 @@ def multi_upload(client, multi_processes)) -def multi_download(client, - hdfs_path, - local_path, - trainer_id, - trainers, - file_cnt, - multi_processes=5): - """ - multi_download - Args: - :param client: instance of HDFSClient - :param hdfs_path: path on hdfs - :param local_path: path on local - :param trainer_id: current trainer id - :param trainers: all trainers number - :param file_cnt: all file number - :param multi_processes: the download data process at the same time, default=5 - :return: None - Returns: - A list that be downloaded. - """ - - def __subprocess_download(datas): - for data in datas: - re_path = os.path.relpath(os.path.dirname(data), hdfs_path) - local_re_path = os.path.join(local_path, re_path) - client.download(data, local_re_path) - - assert isinstance(client, HDFSClient) - - client.make_local_dirs(local_path) - _logger.info("Make local dir {} successfully".format(local_path)) - - all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt] - need_download = all_need_download[trainer_id::trainers] - _logger.info("Get {} files From all {} files need to be download from {}". - format(len(need_download), len(all_need_download), hdfs_path)) - - _logger.info("Start {} multi process to download datas".format( - multi_processes)) - procs = [] - for i in range(multi_processes): - process_datas = need_download[i::multi_processes] - p = multiprocessing.Process( - target=__subprocess_download, args=(process_datas, )) - procs.append(p) - p.start() - - # complete the processes - for proc in procs: - proc.join() - - _logger.info("Finish {} multi process to download datas".format( - multi_processes)) - - local_downloads = [] - for data in need_download: - data_name = os.path.basename(data) - re_path = os.path.relpath(os.path.dirname(data), hdfs_path) - local_re_path = os.path.join(local_path, re_path, data_name) - local_downloads.append(local_re_path) - - return local_downloads - - if __name__ == "__main__": hadoop_home = "/home/client/hadoop-client/hadoop/" diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py index cc2418238f98d8e2b9af0cf4290f6088c11e1b92..20e6328d81cc727340ea4a16012f6ee9967ea1e6 100644 --- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py +++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py @@ -18,14 +18,12 @@ import os import time import logging -import paddle -import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid import io from paddle.fluid import Program __all__ = [ - "load_inference_model", "load_persistable_vars", + "load_persistables_for_increment", "load_persistables_for_inference", "convert_dist_to_sparse_program" ] @@ -80,19 +78,28 @@ def __get_prefetch_op_tuples(main_program): return prefetch_op_tuples -def convert_dist_to_sparse_program(main_program): - if not main_program._distributed_lookup_table: +def convert_dist_to_sparse_program(program): + """ + WARNING: this function will only be used for distributed training with distributed lookup table. + when we train model with distributed lookup table but want to do the local inference, we can use + this function to convert the train program with distributed lookup table to sparse lookup table. + + :param program(Program): the program must be the trainer program, which will be get by the distribute transpiler. + :return: + program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table. + """ + if not program._distributed_lookup_table: _logger.warn( "There are no distributed lookup tables need to be converted") return # create table param and grad var in pserver program - origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table) - emb_var = main_program._distributed_lookup_table - main_program.global_block()._rename_var(emb_var, origin_emb_var) - origin_param_var = main_program.global_block().vars[origin_emb_var] + origin_emb_var = "{}.origin".format(program._distributed_lookup_table) + emb_var = program._distributed_lookup_table + program.global_block()._rename_var(emb_var, origin_emb_var) + origin_param_var = program.global_block().vars[origin_emb_var] - param_var = main_program.global_block().create_var( + param_var = program.global_block().create_var( name=emb_var, shape=origin_param_var.shape, dtype=origin_param_var.dtype, @@ -100,28 +107,28 @@ def convert_dist_to_sparse_program(main_program): persistable=True) # parameter must be selected rows param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) - main_program._sync_with_cpp() + program._sync_with_cpp() - prefetch_op_tuples = __get_prefetch_op_tuples(main_program) + prefetch_op_tuples = __get_prefetch_op_tuples(program) split_ids_id = prefetch_op_tuples[0] for idx in range(split_ids_id + 2, split_ids_id - 1, -1): - main_program.global_block()._remove_op(idx) - main_program.desc.flush() + program.global_block()._remove_op(idx) + program.desc.flush() in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2]) for in_out_pair in in_out_pairs: idx = split_ids_id - ids = main_program.global_block().vars[in_out_pair[0]] - out = main_program.global_block().vars[in_out_pair[1]] - __insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out) - main_program.desc.flush() - return main_program + ids = program.global_block().vars[in_out_pair[0]] + out = program.global_block().vars[in_out_pair[1]] + __insert_lookup_sparse_table_op(program, idx, ids, param_var, out) + program.desc.flush() + return program -def load_persistable_vars(executor, dirname, program, lookup_table_var): +def _load_persistable_vars(executor, dirname, program, lookup_table_vars): def _is_checkpoint_var(exclude_fluid_vars=None): """ the checkpoint will not save or load all the variables. @@ -159,8 +166,82 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var): return is_valid - def _load_lookup_table_vars(executor, dirname, main_program, - lookup_table_vars): + io.load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var(lookup_table_vars), + filename=None) + + +def load_persistables_for_increment(dirname, executor, program, + lookup_table_var, lookup_table_var_path): + """ + WARNING: this function will only be used for distributed training with distributed lookup table. + for increment trainning, the pserver will not only load dense variables, + but also load the suitable lookup table var. Because of slice lookup table + var with HASH, we must load the correct slice var. + + + :param dirname(str): The directory path + :param executor(Executor): The executor to run for loading inference model. + :param program(Program): The parameter server program, which will run on Pserver. + :param lookup_table_var: the distributed lookup tables var name. + :param lookup_table_var_path: the the distributed lookup tables var location. + :return: None + """ + + def __load_lookup_table_vars(executor, main_program, lookup_table_var, + lookup_table_var_path): + emb_var = main_program.global_block().var(lookup_table_var) + + load_program = Program() + load_block = load_program.global_block() + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [emb_var]}, + attrs={'file_path': lookup_table_var_path}) + executor.run(load_program) + + if not os.path.isdir(dirname): + raise ValueError("There is no directory named '%s'", dirname) + + if not os.path.exists(lookup_table_var_path): + raise ValueError("There is no file named '%s'", lookup_table_var_path) + + if not isinstance(program, Program): + raise ValueError("program must be an instance of fluid.Program") + + _logger.info("Start Load Sparse Program With " + "Distributed Lookup Table Vars from {}, time = {}".format( + dirname, time.ctime())) + + _load_persistable_vars(executor, dirname, program, [lookup_table_var]) + __load_lookup_table_vars(executor, program, lookup_table_var, + lookup_table_var_path) + + _logger.info("Finish Load Sparse Program With " + "Distributed Lookup Table Vars from {}, time = {}".format( + dirname, time.ctime())) + + +def load_persistables_for_inference(dirname, executor, program, + lookup_table_var_name): + """ + WARNING: this function will only be used for inference with distributed lookup table. + Inference with distributed lookup table is a little funky, this function will load distributed + lookup table vars into sparse var, can be used in local inference mode. + + :param dirname(str): The directory path + :param executor(Executor): The executor to run for loading inference model. + :param program(Program): The parameter server program, which will run on Pserver. + :param lookup_table_var_name: the distributed lookup tables var name. + :return: None + """ + + def __load_lookup_table_vars(executor, dirname, main_program, + lookup_table_vars): if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) @@ -209,48 +290,34 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var): global_block.append_op(type='delete_var', inputs={'X': sums}) executor.run(convert_program) - _logger.info("Start Load Sparse Program With " - "Distributed Lookup Table Vars from {}, time = {}".format( - dirname, time.ctime())) - - lookup_table_vars = [lookup_table_var] - - io.load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var(lookup_table_vars), - filename=None) - - _load_lookup_table_vars(executor, dirname, program, lookup_table_vars) - - _logger.info("Finish Load Sparse Program With " - "Distributed Lookup Table Vars from {}, time = {}".format( - dirname, time.ctime())) - - -def load_inference_model(dirname, executor, lookup_table_var_name): if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) - local_model = os.path.join(dirname, model_filename) + if program: + if not isinstance(program, Program): + raise ValueError("program must be an instance of fluid.Program") + else: + local_model = os.path.join(dirname, model_filename) - with open(local_model, "rb") as f: - program_desc_str = f.read() + with open(local_model, "rb") as f: + program_desc_str = f.read() - program = Program.parse_from_string(program_desc_str) + program = Program.parse_from_string(program_desc_str) - if not core._is_program_version_supported(program._version()): - raise ValueError("Unsupported program version: %d\n" % - program._version()) + if not core._is_program_version_supported(program._version()): + raise ValueError("Unsupported program version: %d\n" % + program._version()) - # Binary data also need version. - load_persistable_vars(executor, dirname, program, lookup_table_var_name) + _logger.info("Start Load Sparse Program With " + "Distributed Lookup Table Vars from {}, time = {}".format( + dirname, time.ctime())) + + _load_persistable_vars(executor, dirname, program, [lookup_table_var_name]) + __load_lookup_table_vars(executor, dirname, program, + [lookup_table_var_name]) - feed_target_names = program.desc.get_feed_target_names() - fetch_target_names = program.desc.get_fetch_target_names() - fetch_targets = [ - program.global_block().var(name) for name in fetch_target_names - ] + _logger.info("Finish Load Sparse Program With " + "Distributed Lookup Table Vars from {}, time = {}".format( + dirname, time.ctime())) - return [program, feed_target_names, fetch_targets] + return program diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 13d2893fd146b5a3d9100ee1ba6c2243cb9c411b..af02721eb72c1d0f8aa3d7ab8db504c4c33b64d5 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object): self.dtype = 'int64' elif dtype == core.VarDesc.VarType.FP64: self.dtype = 'float64' + elif dtype == core.VarDesc.VarType.FP16: + self.dtype = 'float16' elif dtype == core.VarDesc.VarType.INT32: self.dtype = 'int32' elif dtype == core.VarDesc.VarType.UINT8: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b5d603d4781a7fa0b964b635e8f94f2557aeccd8..de30ed2fc5858187d2ecede299832701304e4198 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1324,6 +1324,9 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) + if _in_imperative_mode(): + _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], + [v._ivar for v in op.outputs], self.desc) self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 15d38ddb56c71ef7de67f79cf52cd26070f470cb..aa48ef71aa61086764019ac29abd9cb4c53325fa 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -28,7 +28,8 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() - tracer = core.Tracer(train.current_block().desc) + tracer = core.Tracer(train.current_block().desc, + startup.current_block().desc) with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 1a28f7f4ae35295394b560d79e3dc0cdd5f2beab..044717c31975d671818cae17cd989774c96ed9fa 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -25,11 +25,9 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): def __init__(self): - pass + self._built = False def __call__(self, inputs): - # TODO(panyx0718): Support declarative mode as well. - assert base.enabled() if not isinstance(inputs, list) and not isinstance(inputs, tuple): inputs = [inputs] @@ -37,8 +35,15 @@ class PyLayer(core.Layer): for x in inputs: py_var = base.to_variable(x) var_inputs.append(py_var) + if not self._built: + self._build_once(inputs) + self._built = True + outputs = self.forward(var_inputs) return outputs + def _build_once(self, inputs): + pass + def forward(self, inputs): return [] diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b37ebbe5179ba6e36be70ff936cb8a3ca0d89d13..26d1f8f4d2bd67a35c4ec96a025ee273cec4dbd1 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -18,6 +18,7 @@ from . import framework import numpy as np import contextlib from .core import VarDesc +from . import unique_name __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', @@ -207,16 +208,39 @@ class UniformInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="uniform_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "min": self._low, "max": self._high, "seed": self._seed }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + var.op = op return op @@ -261,17 +285,39 @@ class NormalInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed, "use_mkldnn": False }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op return op diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index dde05189722fef77e03a1c2d8f3cbae44a3e8245..06039b206b4ddb02e38035134e50b353b987074e 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -63,14 +63,18 @@ def noam_decay(d_model, warmup_steps): Returns: The decayed learning rate. """ - with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter(1) - a = global_step**-0.5 - b = (warmup_steps**-1.5) * global_step - lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) + def _lr_schedule(dtype): + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter(1) - return lr_value + a = global_step**-0.5 + b = (warmup_steps**-1.5) * global_step + lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) + + return lr_value + + return _lr_schedule def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -109,15 +113,19 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): sgd_optimizer.minimize(avg_cost) """ - with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * (decay_rate**div_res) + def _lr_schedule(dtype): + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter() + + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * (decay_rate**div_res) - return decayed_lr + return decayed_lr + + return _lr_schedule def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -138,15 +146,19 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): Returns: The decayed learning rate """ - with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) + def _lr_schedule(dtype): + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter() + + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) + + return decayed_lr - return decayed_lr + return _lr_schedule def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -184,16 +196,20 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): staircase=True)) sgd_optimizer.minimize(avg_cost) """ - with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) + def _lr_schedule(dtype): + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter() - decayed_lr = learning_rate / (1 + decay_rate * div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) - return decayed_lr + decayed_lr = learning_rate / (1 + decay_rate * div_res) + + return decayed_lr + + return _lr_schedule def polynomial_decay(learning_rate, @@ -224,28 +240,33 @@ def polynomial_decay(learning_rate, Returns: Variable: The decayed learning rate """ - with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() - if cycle: - div_res = ops.ceil(global_step / decay_steps) - zero_var = tensor.fill_constant( - shape=[1], dtype='float32', value=0.0) - one_var = tensor.fill_constant( - shape=[1], dtype='float32', value=1.0) + def _lr_schedule(dtype, decay_steps=decay_steps): + with default_main_program()._lr_schedule_guard(): + global_step = _decay_step_counter() + + if cycle: + div_res = ops.ceil(global_step / decay_steps) + zero_var = tensor.fill_constant( + shape=[1], dtype=dtype, value=0.0) + one_var = tensor.fill_constant( + shape=[1], dtype=dtype, value=1.0) + + with control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + tensor.assign(input=one_var, output=div_res) + decay_steps = decay_steps * div_res + else: + decay_steps_var = tensor.fill_constant( + shape=[1], dtype=dtype, value=float(decay_steps)) + global_step = nn.elementwise_min( + x=global_step, y=decay_steps_var) - with control_flow.Switch() as switch: - with switch.case(global_step == zero_var): - tensor.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res - else: - decay_steps_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = nn.elementwise_min(x=global_step, y=decay_steps_var) + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate + return decayed_lr - decayed_lr = (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate - return decayed_lr + return _lr_schedule def piecewise_decay(boundaries, values): @@ -273,38 +294,42 @@ def piecewise_decay(boundaries, values): """ - with default_main_program()._lr_schedule_guard(): - if len(values) - len(boundaries) != 1: - raise ValueError("len(values) - len(boundaries) should be 1") - - global_step = _decay_step_counter() - - lr = tensor.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") - - with control_flow.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = tensor.fill_constant( - shape=[1], - dtype='float32', - value=float(boundaries[i]), - force_cpu=True) - value_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(global_step < boundary_val): - tensor.assign(value_var, lr) - last_value_var = tensor.fill_constant( + + def _lr_schedule(dtype): + with default_main_program()._lr_schedule_guard(): + if len(values) - len(boundaries) != 1: + raise ValueError("len(values) - len(boundaries) should be 1") + + global_step = _decay_step_counter() + + lr = tensor.create_global_var( shape=[1], + value=0.0, dtype='float32', - value=float(values[len(values) - 1])) - with switch.default(): - tensor.assign(last_value_var, lr) + persistable=True, + name="learning_rate") + + with control_flow.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = tensor.fill_constant( + shape=[1], + dtype='float32', + value=float(boundaries[i]), + force_cpu=True) + value_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(global_step < boundary_val): + tensor.assign(value_var, lr) + last_value_var = tensor.fill_constant( + shape=[1], + dtype='float32', + value=float(values[len(values) - 1])) + with switch.default(): + tensor.assign(last_value_var, lr) + + return lr - return lr + return _lr_schedule def append_LARS(params_grads, learning_rate, weight_decay): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4d8311a0d3ada78e4f6cc54f8990e2a2e2cadc4d..4d44ce50a310cc6c95318a159b15544d8628e0bf 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -29,6 +29,7 @@ from . import utils from .. import unique_name from functools import reduce from .. import core +from ..imperative import layers __all__ = [ 'fc', @@ -2797,6 +2798,10 @@ def batch_norm(input, helper = LayerHelper('batch_norm', **locals()) dtype = helper.input_dtype() + # use fp32 for bn parameter + if dtype == core.VarDesc.VarType.FP16: + dtype = core.VarDesc.VarType.FP32 + input_shape = input.shape if data_layout == 'NCHW': channel_num = input_shape[1] @@ -2831,7 +2836,7 @@ def batch_norm(input, trainable=False, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, - dtype=input.dtype) + dtype=dtype) mean.stop_gradient = True variance = helper.create_parameter( @@ -2841,7 +2846,7 @@ def batch_norm(input, trainable=False, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, - dtype=input.dtype) + dtype=dtype) variance.stop_gradient = True # create output @@ -9426,3 +9431,47 @@ def huber_loss(input, label, delta): 'Residual': residual}, attrs={'delta': delta}) return out + + +class FC(layers.PyLayer): + def __init__(self, + size, + param_attr=None, + num_flatten_dims=1, + dtype=core.VarDesc.VarType.FP32): + super(FC, self).__init__() + self._size = size + self._num_flatten_dims = num_flatten_dims + self._dtype = dtype + self._helper = LayerHelper('FC', param_attr=param_attr) + + def _build_once(self, inputs): + input_shape = inputs[0].shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) + ] + [self._size] + self._w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, inputs): + tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": inputs[0], + "Y": self._w}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="sum", + inputs={"X": [tmp]}, + outputs={"Out": out}, + attrs={"use_mkldnn": False}) + return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 59c22d4e498814d468c78b10265b7afe35461dfb..58cfc498c9edd77163b2bd4cad2cb991b6f2b20c 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -50,17 +50,21 @@ class Optimizer(object): def __init__(self, learning_rate, regularization=None, name=None): if not isinstance(learning_rate, float) and \ - not isinstance(learning_rate, framework.Variable): - raise TypeError("learning rate should be float or Variable") + not isinstance(learning_rate, framework.Variable) and \ + not callable(learning_rate): + raise TypeError( + "learning rate should be float or Variable or callable(dtype)") self._name = name self.regularization = regularization self._learning_rate = learning_rate # the learning rate type should be inferenced from loss self._dtype = None # each program should have a independent learning rate - # program -> Variable(learning_rate) + # program -> Variable(learning_rate) or: + # program -> callable(return learning_rate Variable) self._learning_rate_map = dict() - if isinstance(self._learning_rate, framework.Variable): + if isinstance(self._learning_rate, framework.Variable) or \ + callable(self._learning_rate): self._learning_rate_map[framework.default_main_program( )] = self._learning_rate # Dictionary of accumulators. Some optimizer subclasses need to @@ -75,6 +79,11 @@ class Optimizer(object): if isinstance(lr, framework.Variable): return + elif callable(lr): + dtype = 'float32' if self._dtype is None else self._dtype + self._learning_rate_map[framework.default_main_program()] = lr( + dtype) + return else: if not isinstance(self._learning_rate, float): raise TypeError( diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index c54c3963a152851f5396c2ba71c28cc09c1cd523..74cf76da951a4cea884c4fdb8591b3d4fb010300 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -92,35 +92,27 @@ class ParallelExecutor(object): num_trainers=1, trainer_id=0, scope=None): + # step1: get places, the places are used in run too. self._places = [] - self._act_places = [] if use_cuda: - gpus = [] gpus_env = os.getenv("FLAGS_selected_gpus") if gpus_env: gpus = [int(s) for s in gpus_env.split(",")] else: - for i in six.moves.range(core.get_cuda_device_count()): - gpus.append(i) - for i in gpus: - p = core.Place() - self._act_places.append(core.CUDAPlace(i)) - p.set_place(self._act_places[-1]) - self._places.append(p) + gpus = [ + i for i in six.moves.range(core.get_cuda_device_count()) + ] + self._places = [core.CUDAPlace(i) for i in gpus] else: cpu_num = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - for i in six.moves.range(cpu_num): - p = core.Place() - self._act_places.append(core.CPUPlace()) - p.set_place(self._act_places[-1]) - self._places.append(p) + self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)] assert self._places, "no place for execution" + # step2: init exec_strategy if exec_strategy is None: exec_strategy = ExecutionStrategy() exec_strategy.use_cuda = use_cuda - if exec_strategy.num_threads == 0: if use_cuda: # Experiments on se-resnext shows that too many threads hurt @@ -131,49 +123,54 @@ class ParallelExecutor(object): os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exec_strategy.num_threads = cpu_num * 2 + # step3: init build_strategy if build_strategy is None: build_strategy = BuildStrategy() - build_strategy.num_trainers = num_trainers build_strategy.trainer_id = trainer_id - main = main_program - main = main if main else framework.default_main_program() + # step4: get main_program, scope, local_scopes + main = main_program if main_program \ + else framework.default_main_program() + scope = scope if scope is not None else executor.global_scope() + + if share_vars_from and not isinstance(share_vars_from, + ParallelExecutor): + raise TypeError("share_vars_from must be ParallelExecutor.") + + local_scopes = share_vars_from.executor.local_scopes()\ + if share_vars_from else [] + # step5: check trainers_endpoints, it is used for distribution. trainers_endpoints = main._trainers_endpoints if num_trainers > 1 and trainers_endpoints: assert num_trainers == len( trainers_endpoints), "num_trainers == len(end_points)" build_strategy.trainers_endpoints = trainers_endpoints - if scope == None: - scope = executor.global_scope() - - if share_vars_from and not isinstance(share_vars_from, - ParallelExecutor): - raise TypeError("share_vars_from must be ParallelExecutor.") - - local_scopes = share_vars_from.executor.local_scopes( - ) if share_vars_from else [] - - self.persistable_vars = [ - v.name for v in [ + # step5: get persistable_vars, parameter_vars, places. persistable_vars + # need be broadcast to other local_scope. + persistable_vars = set([ + cpt.to_text(v.name) for v in [ var for var in main.list_vars() if var.persistable and var.type != core.VarDesc.VarType.RAW ] - ] + ]) + + def place_obj(place): + p = core.Place() + p.set_place(place) + return p + places = list(map(place_obj, self._places)) + + # step6: init ParallelExecutor self.executor = core.ParallelExecutor( - self._places, - set([ - cpt.to_text(p.name) - for p in main.global_block().iter_parameters() - if not p.stop_gradient - ]), - set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + places, persistable_vars, main.desc, cpt.to_text(loss_name) if loss_name else six.u(''), scope, local_scopes, exec_strategy, build_strategy, num_trainers, trainer_id) + self.scope = scope def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): @@ -261,7 +258,7 @@ class ParallelExecutor(object): self.executor.feed_and_split_tensor_into_local_scopes( feed_tensor_dict) elif isinstance(feed, list) or isinstance(feed, tuple): - if len(feed) != len(self._act_places): + if len(feed) != len(self._places): raise ValueError( "Feed a list of tensor, the list should be the same size as places" ) @@ -277,7 +274,7 @@ class ParallelExecutor(object): tensor = each[feed_name] if not isinstance(tensor, core.LoDTensor): tmp = core.LoDTensor() - tmp.set(tensor, self._act_places[i]) + tmp.set(tensor, self._places[i]) tensor = tmp res_dict[feed_name] = tensor res.append(res_dict) @@ -294,4 +291,4 @@ class ParallelExecutor(object): @property def device_count(self): - return len(self._act_places) + return len(self._places) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 76a707efdc0804be0316ab12c347ffed6199529a..0fe836683b029698b670bbb9f9bb258c2f3b68a0 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -368,6 +368,8 @@ class OpTest(unittest.TestCase): place = core.CUDAPlace(0) if core.is_float16_supported(place): return [place] + else: + return [] else: return [] places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 1b2b53f2d4ce91ae7b5b191ed770b5338f0948c8..5257b0be6f61bc90a6492c44044c122485f4742c 100644 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -22,8 +22,10 @@ from op_test import OpTest class TestAccuracyOp(OpTest): def setUp(self): self.op_type = "accuracy" + self.dtype = np.float32 + self.init_dtype() n = 8192 - infer = np.random.random((n, 1)).astype("float32") + infer = np.random.random((n, 1)).astype(self.dtype) indices = np.random.randint(0, 2, (n, 1)) label = np.random.randint(0, 2, (n, 1)) self.inputs = {'Out': infer, 'Indices': indices, "Label": label} @@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest): num_correct += 1 break self.outputs = { - 'Accuracy': np.array([num_correct / float(n)]).astype("float32"), + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), 'Correct': np.array([num_correct]).astype("int32"), 'Total': np.array([n]).astype("int32") } + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestAccuracyOpFp16(TestAccuracyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py index 1902a9869807ba7ce3f9828c124256cc6752857e..438d45b84033b697c3210acc44392b93bf436df0 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest -from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride +from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 class TestMKLDNN(TestConv2dOp): @@ -37,5 +37,23 @@ class TestMKLDNNWithStride(TestWithStride): self.data_format = "NCHW" +class TestMKLDNNWithGroup(TestWithGroup): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNWith1x1(TestWith1x1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index cadaf1df53af0af56afa8c3631b0f5ce390f318c..15d4db590edc9012604361751e9860ba63239bba 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -21,14 +21,16 @@ from op_test import OpTest class ElementwiseDivOp(OpTest): def setUp(self): self.op_type = "elementwise_div" + self.dtype = np.float32 + self.init_dtype() """ Warning CPU gradient check error! 'X': np.random.random((32,84)).astype("float32"), 'Y': np.random.random((32,84)).astype("float32") """ self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + 'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) } self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest): self.check_grad( ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) + def init_dtype(self): + pass + class TestElementwiseDivOp_scalar(ElementwiseDivOp): def setUp(self): @@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): } +class TestElementwiseDivOpFp16(ElementwiseDivOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y')) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 57ba34f833f824d13e0b82caea789f7f57622bc9..04840991883229614c1ca4890e5cec2e7ae21084 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): } +class TestElementwiseMulOpFp16(ElementwiseMulOp): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py index eec73d0beb39c49f535a03532e536092001c8445..20f1a110c35d689064c49efba246f078c3badd33 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py @@ -22,12 +22,22 @@ from op_test import OpTest class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" - self.inputs = {'X': np.random.random((219, 232)).astype("float32")} + self.dtype = np.float32 + self.init_dtype() + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} self.outputs = {'Out': np.zeros_like(self.inputs["X"])} + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestFillZerosLikeOpFp16(TestFillZerosLikeOp): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index b5b6305155d1ef3dcf6ce590c221664754c5bdc8..0fe69d1bd4b1b10c09879871c8cf1fc197d1106b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -12,12 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import unittest -import sys import numpy as np import paddle.fluid as fluid from paddle.fluid import core +from paddle.fluid.layers.nn import FC + + +@contextlib.contextmanager +def new_program_scope(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield class MyLayer(fluid.imperative.PyLayer): @@ -30,6 +41,23 @@ class MyLayer(fluid.imperative.PyLayer): return [fluid.layers.elementwise_mul(x, x)] +class MLP(fluid.imperative.PyLayer): + def __init__(self): + super(MLP, self).__init__() + self._fc1 = FC(3, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + self._fc2 = FC(4, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + x = self._fc1(inputs[0]) + x = self._fc2(x) + x = fluid.layers.reduce_sum(x) + return x + + class TestImperative(unittest.TestCase): def test_layer(self): with fluid.imperative.guard(): @@ -39,13 +67,56 @@ class TestImperative(unittest.TestCase): l.forward([]) def test_layer_in_out(self): + np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): l = MyLayer() - x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + x = l(np_inp)[0] self.assertIsNotNone(x) - sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + dy_out = x._numpy() x._backward() - sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) + dy_grad = l._x_for_debug._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[3], append_batch_size=False) + l = MyLayer() + x = l(inp)[0] + param_grads = fluid.backward.append_backward( + x, parameter_list=[l._x_for_debug.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[x.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) + + def test_mlp(self): + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + with fluid.imperative.guard(): + mlp = MLP() + out = mlp(np_inp) + dy_out = out._numpy() + out._backward() + dy_grad = mlp._fc1._w._gradient() + + with new_program_scope(): + inp = fluid.layers.data( + name="inp", shape=[2, 2], append_batch_size=False) + mlp = MLP() + out = mlp(inp) + param_grads = fluid.backward.append_backward( + out, parameter_list=[mlp._fc1._w.name])[0] + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + + static_out, static_grad = exe.run( + feed={inp.name: np_inp}, + fetch_list=[out.name, param_grads[1].name]) + + self.assertTrue(np.allclose(dy_out, static_out)) + self.assertTrue(np.allclose(dy_grad, static_grad)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py index 0d3e6d73e0149fe633b8f1de9041068c2e3bb293..e34a712d844c2d45f442d04f9350fbd7bc911a2a 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py @@ -97,7 +97,7 @@ class TestLearningRateDecay(unittest.TestCase): startup_prog = fluid.Program() with fluid.program_guard(main_prog, startup_prog): - decayed_lr = fluid_decay_fn(**kwargs) + decayed_lr = fluid_decay_fn(**kwargs)("float32") place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index cf4346cf2e7a099334ec273546901a91d0ad925d..77ec6f9b6bcda7568325698634fd4f86557cd1be 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -24,11 +24,13 @@ from op_test import OpTest class TestMomentumOp1(OpTest): def setUp(self): self.op_type = "momentum" + self.dtype = np.float32 + self.init_dtype() - param = np.random.random((123, 321)).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - velocity = np.zeros((123, 321)).astype("float32") - learning_rate = np.array([0.001]).astype("float32") + param = np.random.random((123, 321)).astype(self.dtype) + grad = np.random.random((123, 321)).astype(self.dtype) + velocity = np.zeros((123, 321)).astype(self.dtype) + learning_rate = np.array([0.001]).astype(self.dtype) mu = 0.0001 use_nesterov = False @@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest): self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestMomentumOpFp16(TestMomentumOp1): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + class TestMomentumOp2(OpTest): '''Test Momentum with default values for attributes ''' diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index 69b29db83a43d18c0825b610642009a0377b9901..21b5a62baf96bfb2d76a8c59133e8f5d1cb35aea 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -23,8 +23,11 @@ class TestTopkOp(OpTest): def setUp(self): self.set_args() self.op_type = "top_k" + self.dtype = np.float32 + self.init_dtype() + k = self.top_k - input = np.random.random((self.row, k)).astype("float32") + input = np.random.random((self.row, k)).astype(self.dtype) output = np.ndarray((self.row, k)) indices = np.ndarray((self.row, k)).astype("int64") @@ -38,6 +41,9 @@ class TestTopkOp(OpTest): self.outputs = {'Out': output, 'Indices': indices} + def init_dtype(self): + pass + def set_args(self): self.row = 32 self.top_k = 1 @@ -46,6 +52,11 @@ class TestTopkOp(OpTest): self.check_output() +class TestTopkOpFp16(TestTopkOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestTopkOp3d(OpTest): def setUp(self): self.op_type = "top_k" diff --git a/python/setup.py.in b/python/setup.py.in index 22b9537a90e79c2571f61ec0dc156b602df784d6..5d5f2dd0f18cd3e707dca8b9f337f2f2a07d47aa 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -107,9 +107,9 @@ packages=['paddle', 'paddle.fluid.distributed', 'paddle.fluid.layers', 'paddle.fluid.contrib', - 'paddle.fluid.contrib.utils', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.utils', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details']