diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 855e2cae20cc47387cd7dde1ec74347824fbe82d..2118c9d5d4e1a0491ae11f57c08ed04980ced2e4 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -30,6 +30,7 @@ static void ForEachVarName(Map& names, T callback) {
   }
 }
 
+// return whether all the names + suffixes are in the set
 static bool AllInSet(
     const std::map<std::string, std::vector<std::string>>& names,
     const std::string& suffix, const std::unordered_set<std::string>& set) {
@@ -48,7 +49,7 @@ static std::shared_ptr<OperatorBase> NOP() {
   return net_op;
 }
 
-// Get backward operator from a forward operator, recursively implementation.
+// Get backward operator from a forward operator, a recursive implementation.
 //
 // no_grad_names the gradient variable names without gradient calculating.
 //
@@ -56,27 +57,30 @@ static std::shared_ptr<OperatorBase> NOP() {
 // BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
 // pass `uniq_id` through recursive calling.
 //
-// returns The backward operator. For simple situation, it is a simple
-// operator. For complex situation, it is a NetOp.
+// returns The backward operator. In a simple situation, it may be a simple
+// operator; in a complex situation, it may be a NetOp.
 //
 // See Backward.h for details
 static std::shared_ptr<OperatorBase> BackwardRecursive(
     const OperatorBase& forwardOp,
     std::unordered_set<std::string>& no_grad_names, size_t& uniq_id);
+
 std::shared_ptr<OperatorBase> BackwardRecursive(
     const OperatorBase& forwardOp,
     std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
   // If all input gradients of forwarding operator do not need to calculate,
   // just return an NOP. Not return null ptr because NOP does not take
-  // too much time for calculation, but it is useful for simplifying logic.
-  if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) {
+  // much time for calculation, but it is useful for simplifying logic.
+  if (AllInSet(forwardOp.inputs_ /*names*/, kGradVarSuffix /*suffix*/,
+               no_grad_names /*set*/)) {
     return NOP();
   }
   // All output gradients of forwarding operator do not need to calculate.
   // Then all input gradients cannot be computed at all, and we put them into
   // `no_grad_names` set. Return an NOP.
-  if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) {
+  if (AllInSet(forwardOp.outputs_ /*names*/, kGradVarSuffix /*suffix*/,
+               no_grad_names /*set*/)) {
     ForEachVarName(forwardOp.inputs_,
                    [&no_grad_names](const std::string& name) -> bool {
                      no_grad_names.insert(GradVarName(name));
@@ -93,11 +97,11 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
     auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
 
     // Map from output gradient variable name to operator's indices in
-    // backward net. That operator generates that variable.
+    // backward net's ops_. That operator generates that variable.
     std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
 
     size_t local_op_id = 0;
-    // reversely travel forwardNet
+    // traverse forwardNet in reverse order and collect all duplicate outputs.
     for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
          ++it, ++local_op_id) {
       auto fwd = *it;
@@ -112,25 +116,35 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
     // Get unique ID for this method.
     auto uid = uniq_id++;
     // TODO(dzh): more comment
+    // multiple operators that share the same output (y, for example) may
+    // overwrite the same y variable during backward, so special steps are
+    // taken to handle this case: rename each duplicate output to an alias
+    // (the original name plus an offset), then insert an `add` op that sums
+    // all the alias variables back into the final output variable y.
     using Pos = std::pair<size_t, std::shared_ptr<OperatorBase>>;
     std::list<Pos> insert_position;
     for (auto& dup_output_op : dup_output_ops) {
       const std::string& name = dup_output_op.first;
       auto& dup_op = dup_output_op.second;
+      // no duplicate output, nothing to do
       if (dup_op.size() == 1) continue;
-      std::vector<std::string> dup_outputs;
+      // process the duplicate outputs
+      std::vector<std::string> dup_outputs;
       for (size_t i = 0; i < dup_op.size(); ++i) {
+        // rename each duplicate output to an alias
        auto op_offset = dup_op[i];
         dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
                               std::to_string(i));
         net->ops_[op_offset]->Rename(name, dup_outputs.back());
       }
+      // record the offset at which to insert the `add` op summing the aliases
       insert_position.push_back(
           {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
                                                {{"Out", {name}}}, {})});
     }
+    // make sure the inserted `add` ops follow the BFS order.
     insert_position.sort(
         [](const Pos& l, const Pos& r) { return l.first > r.first; });
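
The rename-and-sum scheme described in the new comments can be illustrated with a minimal standalone sketch. This is not the Paddle implementation: `ToyOp` and `DeduplicateOutput` are hypothetical stand-ins, and only the alias naming convention (`name + "@RENAME@" + uid + "@" + i`) and the idea of inserting a single `add` op after the last writer mirror the diff above.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for an operator that writes a single output variable.
// All names here are hypothetical; see the lead-in above.
struct ToyOp {
  std::string type;
  std::vector<std::string> inputs;
  std::string output;
};

// Rename every writer of `name` to a unique alias, then insert one `add`
// op after the last writer that sums the aliases back into `name`.
void DeduplicateOutput(std::vector<ToyOp>& ops, const std::string& name,
                       size_t uid) {
  // find every op that writes `name`
  std::vector<size_t> writers;
  for (size_t i = 0; i < ops.size(); ++i) {
    if (ops[i].output == name) writers.push_back(i);
  }
  // no duplicate output, nothing to do
  if (writers.size() < 2) return;

  // redirect each writer to its own alias: name@RENAME@<uid>@<i>
  std::vector<std::string> aliases;
  for (size_t i = 0; i < writers.size(); ++i) {
    aliases.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
                      std::to_string(i));
    ops[writers[i]].output = aliases.back();
  }

  // insert the summing op right after the last writer, so every alias has
  // already been produced when it runs
  ops.insert(ops.begin() + writers.back() + 1, ToyOp{"add", aliases, name});
}

int main() {
  std::vector<ToyOp> backward_ops = {{"mul_grad", {"a"}, "y"},
                                     {"fc_grad", {"b"}, "y"}};
  DeduplicateOutput(backward_ops, "y", /*uid=*/0);
  for (const auto& op : backward_ops) {
    std::cout << op.type << " writes " << op.output << "\n";
  }
  // prints:
  //   mul_grad writes y@RENAME@0@0
  //   fc_grad writes y@RENAME@0@1
  //   add writes y
  return 0;
}

The descending sort in the diff (`l.first > r.first`) presumably lets the `add` ops be spliced into `ops_` from back to front so that earlier recorded offsets stay valid; the sketch sidesteps that concern by handling a single name at a time.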