Commit f79a3a83 authored by: S sneaxiy

merge release/1.3

Parent: d2d3f2b5
@@ -231,7 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   )
 ENDFUNCTION()
-SET(PROTOBUF_VERSION 3.1)
+SET(PROTOBUF_VERSION 3.1.0)
 IF(NOT PROTOBUF_FOUND)
   build_protobuf(extern_protobuf FALSE)
......
@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag)
         if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
             message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
         endif()
     endif()
 endif()
 endfunction()
@@ -147,6 +147,7 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds  # Warnings in Eigen::array
 )
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
 endif(NOT WIN32)
 if (APPLE)
......
@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
 paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
-paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
+paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
......
@@ -50,12 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
+if(WITH_GPU)
+  cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
+else()
+  cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
+endif()
 cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
 cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
-           all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
 cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
@@ -67,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
            scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass)
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
 if (WITH_GPU)
   list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()
-cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph)
-cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
+cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......
@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
         new std::vector<OpDesc *>(main_program.Block(0).AllOps());
     graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
                                             all_op_descs);  // take ownership
-    graph->Set<GraphNodePool>(kGraphNodePool,
-                              new GraphNodePool);  // take ownership
     pass->Erase(kAllOpDescs);
     pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
@@ -242,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
         continue;
       }
     }
+    VLOG(3) << "Start Apply Pass " << pass->Type();
     graph = pass->Apply(std::move(graph));
+    VLOG(3) << "Finish Apply Pass " << pass->Type();
   }
   return graph;
 }
......
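Reviewer note: the paired VLOG(3) lines bracket every pass application, so a hang or crash during ParallelExecutor construction can be attributed to a specific pass. They only print when glog verbosity is raised, e.g. by running with GLOG_v=3 (or GLOG_vmodule=build_strategy=3) in the environment.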
@@ -49,7 +49,7 @@ DEFINE_bool(
     "If this option turns on, only these op in whitelist can be inplaced."
     "If it turns off, all of the running op can be candidate of inplaced op."
     "Such as scale, elementwise_add"
-    "By default, it's turned on");
+    "By default, it's turned off");
 DECLARE_string(memory_optimize_debug);
@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
   }
 }
-const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
-                                                   const std::string& cache_var,
-                                                   const size_t& idx,
-                                                   ir::Graph* graph) const {
+const NodeSwapQueue InplacePass::TryInplaceModifyVar(
+    const std::string& var, const std::string& cache_var, const size_t& idx,
+    ir::Graph* graph) const {
   PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
                  var_nodes_[var].at(0)->Var() != nullptr);
   std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
   var_desc->SetName(cache_var);
-  SSANodePair swap_nodes;
+  NodeSwapQueue swap_nodes;
   for (size_t i = idx; i < view_.AllOps().size(); ++i) {
     auto* op = view_.AllOps()[i];
@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
   return swap_nodes;
 }
-void InplacePass::CommitModify(const SSANodePair& swap_nodes,
+void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
                                ir::Graph* graph) const {
   for (auto& pair : swap_nodes) {
     auto *node = pair.first, *cache_node = pair.second;
@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
   }
 }
-void InplacePass::WithdrawModify(const SSANodePair& nodes,
+void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
                                  ir::Graph* graph) const {
   for (auto& pair : nodes) {
     auto *node = pair.first, *cache_node = pair.second;
@@ -403,18 +402,20 @@ void GraphView::Build(ir::Graph* g) {
   // 2. track the nodes which used by parameter server.
   // these node can not be inplaced, otherwise trainer
   // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
-    }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
-        dup_nodes_.emplace(out->Name());
-      }
-    }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
+    }
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
+        dup_nodes_.emplace(out->Name());
+    }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
   }
 }
......
@@ -56,7 +56,8 @@ class GraphView {
   std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
 };
-typedef std::vector<std::pair<ir::Node*, ir::Node*>> SSANodePair;
+// swap pairs in sequence
+typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;
 class InplacePass : public ir::Pass {
  public:
   InplacePass();
@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
   void InitSSAGraphNodes() const;
 private:
-  const SSANodePair TryInplaceModifyVar(const std::string& var,
-                                        const std::string& cache_var,
-                                        const size_t& idx,
-                                        ir::Graph* graph) const;
+  const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
+                                          const std::string& cache_var,
+                                          const size_t& idx,
+                                          ir::Graph* graph) const;
-  void CommitModify(const SSANodePair&, ir::Graph* graph) const;
+  void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;
-  void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const;
+  void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;
   void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
                          const size_t& idx) const;
......
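Reviewer note: the SSANodePair -> NodeSwapQueue rename makes the three-phase protocol of this pass easier to follow. A minimal sketch of the intended call pattern (the driver logic and the CanBeInplaced check are illustrative, not code from this commit):

    // Illustrative only: how the three methods above are meant to compose.
    NodeSwapQueue swaps = TryInplaceModifyVar(var, cache_var, idx, graph);
    if (CanBeInplaced(swaps)) {      // hypothetical validity check
      CommitModify(swaps, graph);    // splice the cache nodes into the SSA graph
    } else {
      WithdrawModify(swaps, graph);  // drop the speculative nodes, keep originals
    }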
@@ -13,17 +13,114 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include <algorithm>
+#include <deque>
 #include <functional>
-#include <iostream>
+#include <iterator>
 #include <numeric>
 #include <sstream>
 #include <string>
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif  // PADDLE_WITH_CUDA
 namespace paddle {
 namespace framework {
 namespace details {
+using paddle::framework::VarDesc;
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
+  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
+                 "Graph has no attribute of kAllOpDescs.");
+  // 1. get op desc order
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  // 2. topology sort order
+  auto nodes = graph.Nodes();
+  std::deque<ir::Node*> ops;
+  FilterVariables(nodes, [&](ir::Node* op) {
+    if (op->IsOp() && op->Op() != nullptr) {
+      ops.emplace_back(op);
+    }
+  });
+  std::unordered_map<ir::Node*, size_t> op_deps;
+  std::list<ir::Node*> ready_ops;
+  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
+  for (auto* op : ops) {
+    std::unordered_set<ir::Node*> preceding_op;
+    for (auto* in : op->inputs) {
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
+      preceding_op.emplace(in->inputs[0]);
+      pending_ops[in->inputs[0]].emplace(op);
+    }
+    op_deps[op] = preceding_op.size();
+    if (preceding_op.empty()) {
+      ready_ops.emplace_back(op);
+    }
+  }
+  // 3. generated op list based desc order and the topology order
+  std::vector<ir::Node*> ret;
+  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
+  auto update_by_found_node = [&](ir::Node* found_node) {
+    for (auto* pending_op : pending_ops[found_node]) {
+      if (--op_deps[pending_op] == 0) {
+        ready_ops.emplace_back(pending_op);
+      }
+    }
+    ready_ops.remove(found_node);
+    ret.emplace_back(found_node);
+  };
+  while (!ready_ops.empty()) {
+    bool all_of_ready_op_unmatched = true;
+    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
+      auto op_desc = *it;
+      ir::Node* found_node = nullptr;
+      for (auto* op : ready_ops) {
+        if (IsSameDesc(op->Op(), op_desc)) {
+          found_node = op;
+          break;
+        }
+      }
+      // 3.1 op desc deleted by other pass
+      if (found_node == nullptr) {
+        ++it;
+        continue;
+      } else {
+        all_of_ready_op_unmatched = false;
+        it = op_descs_list.erase(it);
+      }
+      update_by_found_node(found_node);
+    }
+    // 3.2 op descs are added by other pass
+    // preceding op non empty means some new op descs are
+    // created, but not contained in return node list.
+    // these new op desc may depend on each other.
+    std::list<ir::Node*> prev_ready_ops(ready_ops);
+    if (all_of_ready_op_unmatched) {
+      for (auto op : prev_ready_ops) {
+        update_by_found_node(op);
+      }
+    }
+  }
+  PADDLE_ENFORCE(std::all_of(
+      op_deps.begin(), op_deps.end(),
+      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
+  return ret;
+}
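Reviewer note: SortOpLikeDescOrder re-sequences the graph's op nodes to follow the recorded OpDesc order while still respecting topological dependencies; steps 3.1/3.2 make it tolerant of descs that other passes deleted or inserted. The final PADDLE_ENFORCE checks that every op's dependency count reached zero, i.e. no ready op was left out of the result.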
-size_t NodeSizeInBytes(const VarDesc& node) {
+size_t NodeSize(const VarDesc& node) {
   auto shape = node.GetShape();
   int size =
       std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
@@ -31,9 +128,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
   return type_size * std::abs(size);
 }
-size_t NodeSizeInBytes(ir::Node* n) {
+size_t NodeSize(ir::Node* n) {
   auto* desc = FindVarDescInBlock(n);
-  return NodeSizeInBytes(*desc);
+  return NodeSize(*desc);
 }
 std::string DebugStringImpl(VarDesc* var) {
@@ -59,7 +156,6 @@ std::string DebugStringImpl(VarDesc* var) {
 std::string DebugString(ir::Node* var) {
   return DebugStringImpl(FindVarDescInBlock(var));
 }
-// return DebugString(var->Var()); }
 // NOTE(dzh): based ir node, if a large node has been reused
 // by a small size node, then next time it appear in pool, it will
@@ -76,22 +172,26 @@ struct NodeComparator {
   bool operator()(ir::Node* lhs, ir::Node* rhs) const {
     auto* lhs_desc = FindVarDescInBlock(lhs);
     auto* rhs_desc = FindVarDescInBlock(rhs);
+    // match data type
+    if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
+      return false;
+    }
+    // match shape
     auto lhs_shape = lhs_desc->GetShape();
     auto rhs_shape = rhs_desc->GetShape();
     if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
         (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
-      return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
+      return NodeSize(lhs) <= NodeSize(rhs);
     } else {
       return false;
     }
   }
 };
-void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
+void OrderedSet::Insert(ir::Node* var) {
   PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
-  PADDLE_ENFORCE(op->IsOp());
   if (mark_table_.count(var->Name()) != 0) {
-    mark_table_[var->Name()]->second.insert(op);
+    mark_table_[var->Name()]->emplace_back(var);
     return;
   }
@@ -99,14 +199,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
   auto var_shape = var_desc->GetShape();
   int batch_size = static_cast<int>(var_shape[0]);
-  NodeComparator compare_node;
+  NodeComparator functor;
   Iter it = nodes_.begin();
   while (it != nodes_.end()) {
-    auto* cache_desc = FindVarDescInBlock(it->first);
+    auto& prev = it->front();
+    auto* cache_desc = FindVarDescInBlock(prev);
     int cache_batch_size = cache_desc->GetShape()[0];
     if ((cache_batch_size == -1 && batch_size == -1) ||
         (cache_batch_size != -1 && batch_size != -1)) {
-      if (compare_node(it->first, var)) {
+      if (functor(prev, var)) {
         ++it;
       } else {
         break;
@@ -118,62 +219,127 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
     }
   }
-  it =
-      nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
+  it = nodes_.insert(it, {var});
   mark_table_[var->Name()] = it;
 }
-int OrderedNodeList::GetIndex(ir::Node* var) {
+int OrderedSet::GetNodeIndexInPool(ir::Node* var) {
   return std::distance(nodes_.begin(), mark_table_[var->Name()]);
 }
-ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const {
+ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
   ir::Node* found_node = nullptr;
-  NodeComparator compare_node;
+  NodeComparator functor;
   for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
-    if (compare_node(var, it->first)) {
-      found_node = it->first;
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
       break;
     }
   }
   return found_node;
 }
-void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); }
+ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
+  ir::Node* found_node = nullptr;
+  NodeComparator functor;
+  auto it =
+      std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
+        if (v.front() == prev)
+          return true;
+        else
+          return false;
+      });
+  PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
+  for (it = std::next(it); it != nodes_.end(); ++it) {
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
+      break;
+    }
+  }
+  return found_node;
+}
+bool OrderedSet::Has(ir::Node* var) const {
+  if (mark_table_.count(var->Name())) {
+    auto& node_in_samename = mark_table_.at(var->Name());
+    auto iter =
+        std::find_if(node_in_samename->begin(), node_in_samename->end(),
+                     [&](ir::Node* n) { return n->Name() == var->Name(); });
+    return iter != node_in_samename->end();
+  }
+  return false;
+}
-void OrderedNodeList::Erase(const std::string& var) {
+void OrderedSet::Erase(const std::string& var) {
   PADDLE_ENFORCE(mark_table_.count(var));
   nodes_.erase(mark_table_[var]);
   mark_table_.erase(var);
 }
-std::string OrderedNodeList::ToString() const {
+void OrderedSet::Erase(ir::Node* var) {
+  PADDLE_ENFORCE(var != nullptr);
+  Erase(var->Name());
+}
+std::string OrderedSet::ToString() const {
   std::stringstream ss;
   for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
-    ss << DebugString(it->first) << " ";
+    for (auto& node : *it) {
+      ss << DebugString(node) << " ";
+    }
   }
   return ss.str();
 }
 bool NodeCanReused(ir::Node* node) {
+  // valid the node is a var node
   if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
-  // auto* desc = node->Var();
-  bool flag = NodeCanReused(*node->Var());
+  bool flag = true;
+  // op output force generated in cpu, can not be reused.
   for (auto* op : node->inputs) {
     if (op->Op()->HasAttr("force_cpu")) {
-      // op output force generated in cpu, can not be reused.
       flag &= framework::AttrReader(op->Op()->GetAttrMap())
                   .Get<bool>("force_cpu") == 0;
     }
   }
+  // var desc validation.
+  flag &= NodeCanReused(*node->Var());
   return flag;
 }
+int MinChunkSize() {
+  int size{0};
+#ifdef PADDLE_WITH_CUDA
+  size = platform::GpuMinChunkSize();
+#else
+  size = platform::CpuMinChunkSize();
+#endif  // PADDLE_WITH_CUDA
+  return size;
+}
 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
-  if (node.Persistable() || type != proto::VarType::LOD_TENSOR ||
-      node.GetShape().empty()) {
+  // only these types holds bulk of gpu memory
+  if (!(type == proto::VarType::LOD_TENSOR ||
+        type == proto::VarType::SELECTED_ROWS ||
+        type == proto::VarType::LOD_TENSOR_ARRAY)) {
+    return false;
+  }
+  // persistable variable is parameter
+  if (node.Persistable()) {
+    return false;
+  }
+  // shape < min_chunk_size is meaningless.
+  // further more, fetched loss always has size = 1
+  // which should not be reused.
+  auto shape = node.GetShape();
+  int size = std::abs(
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
+  if (shape.empty() || size < MinChunkSize()) {
     return false;
   }
   // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
@@ -193,6 +359,176 @@ bool OpHasSubBlock(OpDesc* desc) {
   return false;
 }
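Reviewer note: the new MinChunkSize() gate reflects that the allocator never hands out blocks smaller than the platform's minimum chunk, so reusing a variable below that size saves no memory; in particular a fetched loss of shape [1] (4 bytes as float32) is now excluded, which also keeps it from being overwritten through reuse.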
+ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
+  ops_ = SortOpLikeDescOrder(graph);
+  ConnectNodes();
+}
+void ControlFlowGraph::BuildCFGGraph() {
+  // FIXME(dzh): same effect with ConnectNodes, but use the control
+  // link to build dependency graph, it goes wrong in transformer.
+  for (ir::Node* op : ops_) {
+    for (auto& input_var : op->inputs) {
+      if (!input_var->inputs.empty()) {
+        PADDLE_ENFORCE(
+            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
+            "Preceding Op Node of Var Node must be unique");
+        auto* pred_op = input_var->inputs[0];
+        if (pred_op->Op() != nullptr) {
+          predecessors_[op].insert(pred_op);
+          successors_[pred_op].insert(op);
+        }
+      }
+      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
+        uses_[op].insert(input_var->Name());
+      }
+    }
+    for (auto& output_var : op->outputs) {
+      // output var may be used by many op
+      for (auto* succ_op : output_var->outputs) {
+        if (succ_op->Op() != nullptr) {
+          successors_[op].insert(succ_op);
+          predecessors_[succ_op].insert(op);
+        }
+      }
+      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
+        defs_[op].insert(output_var->Name());
+      }
+    }
+  }
+}
+void ControlFlowGraph::ConnectNodes() {
+  for (size_t i = 0; i < ops_.size(); ++i) {
+    auto& op = ops_[i];
+    try {
+      auto& next_op = ops_.at(i + 1);
+      successors_[op].insert(next_op);
+      predecessors_[next_op].insert(op);
+    } catch (...) {
+      // do nothing
+    }
+    FilterVariables(op->inputs,
+                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
+    FilterVariables(op->outputs,
+                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
+  }
+}
+void ControlFlowGraph::LiveVariableAnalysis() {
+  // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
+  // compute the liveness of for each variable though reversed_ops algorithm.
+  // It iterates the operators from end to begin, compute the live in/live out
+  // variable set for each op, then the diff between in/out will be used for
+  // the variable reuse. For detail refer to
+  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
+  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
+  while (!work_list.empty()) {
+    ir::Node* op = work_list.front();
+    work_list.pop_front();
+    // get the live_in calculated before. Empty if first.
+    auto prev_live_in = std::move(live_in_[op]);
+    for (auto& s : successors_[op]) {
+      for (auto& var : live_in_[s]) {
+        live_out_[op].insert(var);
+      }
+    }
+    for (auto& var : uses_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : live_out_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : defs_[op]) {
+      live_in_[op].erase(var);
+    }
+    // If the live_in is not changed, then the liveness analysis of
+    // predecessors is completed.
+    //
+    // Otherwise, recalculate the predecessors liveness
+    if (live_in_[op] != prev_live_in) {
+      for (auto& pre : predecessors_[op]) {
+        work_list.push_back(pre);
+      }
+    }
+  }
+}
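Reviewer note: for readers without the linked lecture notes, the fixed point this worklist loop computes is the standard backward dataflow system:

    \mathrm{liveout}(n) = \bigcup_{s \in \mathrm{succ}(n)} \mathrm{livein}(s)
    \mathrm{livein}(n) = \mathrm{use}(n) \cup \bigl(\mathrm{liveout}(n) \setminus \mathrm{def}(n)\bigr)

iterated from the last op backwards until no live_in set changes.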
+void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
+                                           const std::string& new_node,
+                                           int begin_idx) {
+  // update graph from begin idx to the end
+  for (size_t i = begin_idx; i != ops_.size(); ++i) {
+    auto* op = ops_[i];
+    if (uses_[op].find(old_node) != uses_[op].end()) {
+      uses_[op].erase(old_node);
+      uses_[op].insert(new_node);
+    }
+    if (defs_[op].find(old_node) != defs_[op].end()) {
+      defs_[op].erase(old_node);
+      defs_[op].insert(new_node);
+    }
+    if (live_in_[op].find(old_node) != live_in_[op].end()) {
+      live_in_[op].erase(old_node);
+      live_in_[op].insert(new_node);
+    }
+    if (live_out_[op].find(old_node) != live_out_[op].end()) {
+      live_out_[op].erase(old_node);
+      live_out_[op].insert(new_node);
+    }
+  }
+}
+const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+  auto it = live_in_.find(op);
+  PADDLE_ENFORCE(
+      it != live_in_.end(),
+      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
+  return it->second;
+}
+const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+  auto it = live_out_.find(op);
+  PADDLE_ENFORCE(
+      it != live_out_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+  auto it = uses_.find(op);
+  PADDLE_ENFORCE(
+      it != uses_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
+ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
+                                          ir::Node* op) const {
+  // in ssa-graph, different version nodes have same name,
+  // this function get the latest version var before target op
+  // It may return nullptr, such as data node.
+  ir::Node* found_node = nullptr;
+  for (auto* node : ops_) {
+    if (node == op) break;
+    for (auto& output : node->outputs) {
+      PADDLE_ENFORCE((output != nullptr && output->IsVar()),
+                     "Output is empty!");
+      if (output->Var() && output->Name() == name) {
+        found_node = output;
+      }
+    }
+  }
+  return found_node;
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
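Reviewer note: a minimal sketch of how a pass drives the new ControlFlowGraph, mirroring what the added unit tests and the memory_optimize_pass.cc changes below do; it assumes the graph already carries the kAllOpDescs attribute:

    // Illustrative usage, not new code in this commit.
    ControlFlowGraph cfg(*graph);  // orders ops via SortOpLikeDescOrder
    cfg.LiveVariableAnalysis();    // backward worklist fills live_in_/live_out_
    for (ir::Node* op : cfg.Ops()) {
      for (const std::string& var : cfg.LiveIn(op)) {
        // live before `op` but dead after it: a candidate for the reuse pool
        if (cfg.LiveOut(op).count(var) == 0) { /* pool.Insert(...) */ }
      }
    }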
@@ -17,6 +17,8 @@
 #include <iostream>
 #include <iterator>
 #include <list>
+#include <map>
+#include <set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -27,41 +29,43 @@
 namespace paddle {
 namespace framework {
 namespace details {
-constexpr char kFetchedVars[] = "fetched_vars";
-constexpr char kGraphNodePool[] = "graph_node_pool";
+constexpr char kAllOpDescs[] = "all_op_descs";
-// NOTE(dzh): Variable and the operators use the var.
-// for early delete pass.
-// Because analysis var pass build base on ir::Node, which maybe released
-// or modified between passes, so we use OpDesc* to mark ops.
-using GraphNodePool = std::vector<
-    std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
-// NOTE(dzh): by default, it sort node in ascend order(by node bytes size).
-// in fluid, -1 means the batch_size is determined in runtime.
-// the node batch_size equal -1 always ranking in the front than the node not.
+// NOTE(dzh): A ordered set for node reuse in memory optimize.
+// the orderedset sort node in ascend order(by node bytes size).
+// in fluid, -1 means the batch_size, which is determined in runtime.
+// So the reuse happens between nodes who's batch_size both are -1
+// simultaneously or not.
+//
+// sort rule:
+// rule 0 : smaller node ranking in front.
+// rule 1 : batch_size equal -1 ranking in the front than the node not.
+//
 // For example,
 // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
-// O(1) insert, delete
-class OrderedNodeList {
- public:
-  using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
-  using Iter = typename std::list<NodePair>::iterator;
-  using ConstIter = typename std::list<NodePair>::const_iterator;
-  void Insert(ir::Node* var, ir::Node* op);
+class OrderedSet {
+ public:
+  // nodes with same name exists in pool.
+  using NodeVector = std::vector<ir::Node*>;
+  using Iter = typename std::list<NodeVector>::iterator;
+  using ConstIter = typename std::list<NodeVector>::const_iterator;
+  void Insert(ir::Node* var);
   void Erase(ir::Node* var);
   void Erase(const std::string& var);
-  bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
-  bool Has(const std::string& var) { return mark_table_.count(var); }
-  ir::Node* NodeMatch(ir::Node* var) const;
+  bool Has(ir::Node* var) const;
+  void Clear() {
+    mark_table_.clear();
+    nodes_.clear();
+  }
+  // find the bestfit shape node block with var.
+  ir::Node* FindBestFitNode(ir::Node* var) const;
+  ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
   // map store non-const iterator, can not promise const
-  int GetIndex(ir::Node* var);
+  int GetNodeIndexInPool(ir::Node* var);
   // pool all node to string
   std::string ToString() const;
@@ -69,18 +73,54 @@ class OrderedNodeList {
   Iter end() { return nodes_.end(); }
   ConstIter begin() const { return nodes_.begin(); }
   ConstIter end() const { return nodes_.end(); }
   size_t size() const { return nodes_.size(); }
-  void Clear() {
-    mark_table_.clear();
-    nodes_.clear();
-  }
 private:
   // for searching.
   std::unordered_map<std::string, Iter> mark_table_;
-  // node swap pairs. var -> ops dep var
-  std::list<NodePair> nodes_;
+  // node pool
+  std::list<NodeVector> nodes_;
 };
+class ControlFlowGraph {
+ public:
+  ControlFlowGraph() = default;
+  // IR Graph
+  explicit ControlFlowGraph(const ir::Graph& graph);
+  void LiveVariableAnalysis();
+  void RenameVarInCFGGraph(const std::string& old_node,
+                           const std::string& new_node, int begin_idx);
+  const std::set<std::string> LiveIn(ir::Node* op) const;
+  const std::set<std::string> LiveOut(ir::Node* op) const;
+  const std::set<std::string> Use(ir::Node* op) const;
+  const std::vector<ir::Node*> Ops() const;
+  std::vector<ir::Node*>& Ops();
+  // for ssa-graph nodes
+  ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
+ private:
+  void BuildCFGGraph();
+  void ConnectNodes();
+  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
+  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
+  // successors ops use the output variables.
+  NodeListMap successors_;
+  // predecessors ops generated input variables.
+  NodeListMap predecessors_;
+  // variables lived before run current op.
+  VarSetMap live_in_;
+  // variables lived after run current op.
+  VarSetMap live_out_;
+  VarSetMap uses_;  // op inputs
+  VarSetMap defs_;  // op outputs
+  std::vector<ir::Node*> ops_;  // op sequence by topology sort
+};
 // valid a tensor can be reuse or not
@@ -93,15 +133,24 @@ bool NodeCanReused(const VarDesc& node);
 bool OpHasSubBlock(OpDesc* desc);
 // node memory size in bytes
-size_t NodeSizeInBytes(ir::Node* n);
+size_t NodeSize(ir::Node* n);
 // node memory size in bytes
-size_t NodeSizeInBytes(const VarDesc&);
+size_t NodeSize(const VarDesc&);
 std::string DebugString(ir::Node* var);
+// NOTE(dzhwinter)
+// after node reuse, the replaced node shape is
+// different with its VarDesc. So need to find the
+// correct VarDesc in Block.
 VarDesc* FindVarDescInBlock(ir::Node* n);
+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
 template <typename Container, typename Callback>
 class FilterVariableImpl {
  public:
......
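Reviewer note: a short sketch of the new OrderedSet surface that replaces OrderedNodeList as the reuse pool (variable setup omitted; names per the header above):

    OrderedSet pool;
    pool.Insert(var_node);  // keeps the ascending-size, batch-dim-aware order
    ir::Node* cache = pool.FindBestFitNode(candidate);
    while (cache != nullptr && cache->Name() == candidate->Name()) {
      cache = pool.FindNextBestFitNode(candidate, cache);  // avoid self-reuse
    }
    if (cache != nullptr) pool.Erase(cache);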
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include <algorithm>
 #include <iostream>
+#include <iterator>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -22,13 +23,19 @@
 #include <vector>
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/details/graph_test_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
 namespace paddle {
 namespace framework {
 namespace details {
-TEST(OrderedNodeList, Normal) {
-  OrderedNodeList pool;
+TEST(OrderedSet, Normal) {
+  OrderedSet pool;
   std::vector<std::unique_ptr<ir::Node>> nodes;
   // clang-format off
@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
     nodes.emplace_back(std::move(node));
   }
+  // Insert
   for (auto& node : nodes) {
-    pool.Insert(node.get(), op.get());
+    pool.Insert(node.get());
+  }
+  // Has/size
+  ASSERT_EQ(pool.size(), shapes.size());
+  for (auto& node : nodes) {
+    ASSERT_TRUE(pool.Has(node.get()));
   }
   // assert its order and interface.
@@ -66,14 +80,14 @@
   std::cout << pool.ToString() << std::endl;
   ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
-  ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
+  ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0);
   {
     auto v1 = block_desc->Var("11");
     v1->SetShape({-1, 256, 56, 56});
     std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
     node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
+    auto* cache = pool.FindBestFitNode(node1.get());
     ASSERT_EQ(cache, nullptr);
   }
   {
@@ -81,16 +95,447 @@
     v2->SetShape({-1, 2, 5});
     std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
     node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
-    ASSERT_EQ(pool.GetIndex(cache), 2);  // match 6:[-1,2,5]
+    auto* cache = pool.FindBestFitNode(node1.get());
+    ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2);  // match 6:[-1,2,5]
   }
   {
     auto v3 = block_desc->Var("13");
     v3->SetShape({2, 5});
     std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
     node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
-    ASSERT_EQ(pool.GetIndex(cache), 5);  // match 4:[5,2]
+    auto* cache = pool.FindBestFitNode(node1.get());
+    ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5);  // match 4:[5,2]
   }
 }
+TEST(OrderedSet, FindBestFitNode) {
+  OrderedSet pool;
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+  ProgramDesc prog;
+  BlockDesc* block_desc = prog.MutableBlock(0);
+  auto* op_desc = block_desc->AppendOp();
+  op_desc->SetType("dummy");
+  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
+  {
+    auto desc = block_desc->Var("a");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("b");
+    desc->SetShape({128, 129});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("c");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  for (auto& node : nodes) {
+    pool.Insert(node.get());
+  }
+  // FindNextBestFitNode
+  auto* n = nodes[0].get();
+  auto* cache = pool.FindBestFitNode(n);
+  PADDLE_ENFORCE(cache->Name() == "a");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "c");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "b");
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
+                  paddle::framework::AssignOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
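Reviewer note: these registrations sit between the two namespace blocks on purpose; REGISTER_OPERATOR statically asserts that it is expanded at global namespace scope, so it cannot appear inside paddle::framework::details.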
+/*
+  https://en.wikipedia.org/wiki/Live_variable_analysis
+  Create a customed classical dependency graph, left row is the instruction
+  number.
+  1. a = 1
+  2. b = a
+  3. c = a
+  4. d = b + c
+  5. e = d
+
+  a--------+
+  |        |
+  b        c
+  |        |
+  d--------+
+  |
+  e
+  Then analysis these variable's liveness range
+ */
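Reviewer note: working the liveness equations on this program gives the sets asserted by CFGGraph.IRGraph below:

    b = a:     live_in {a}     live_out {a, b}
    c = a:     live_in {a, b}  live_out {b, c}
    d = b + c: live_in {b, c}  live_out {d}
    e = d:     live_in {d}     live_out {}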
+namespace paddle {
+namespace framework {
+namespace details {
+inline static ProgramDesc FillProgramDesc() {
+  ProgramDesc prog;
+  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"a"});
+    op->SetOutput("Out", {"b"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"a"});
+    op->SetOutput("Out", {"c"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("sum");
+    op->SetInput("X", {"b", "c"});
+    op->SetOutput("Out", {"d"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"d"});
+    op->SetOutput("Out", {"e"});
+  }
+  return prog;
+}
+TEST(CFGGraph, IRGraph) {
+  // prepare ir graph
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  ControlFlowGraph cfg(graph);
+  cfg.LiveVariableAnalysis();
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
+  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
+  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
+  // test sum op
+  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
+  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
+  ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
+}
+// 1. normal test
+TEST(SortOpLikeDescOrder, NormalTest) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto nodes = SortOpLikeDescOrder(graph);
+  auto op_descs = prog.Block(0).AllOps();
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+// 2. remove some op_desc
+TEST(SortOpLikeDescOrder, RemoveOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto nodes = graph.Nodes();
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->outputs.back()->Name() == "e") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+  ir::Node* e = find_node_in_graph("e");
+  ir::Node* d = find_node_in_graph("d");
+  std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+  graph.RemoveNode(found_node);
+  graph.RemoveNode(e);
+  // other node keeps the same order
+  auto remain_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < remain_nodes.size(); ++i) {
+    auto node = remain_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+// 3. add some op_desc
+TEST(SortOpLikeDescOrder, AddOpDesc) {
+  auto prog = FillProgramDesc();
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  ir::Graph graph(prog);
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+  // cached desc different with real one
+  // mimic the intermidiete pass modify the programdesc.
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto op_descs = prog.Block(0).AllOps();
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  ir::Node* node = graph.CreateOpNode(op);
+  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+  ir::Node* b = find_node_in_graph("b");
+  ir::Node* c = find_node_in_graph("c");
+  node->outputs.emplace_back(d1);
+  node->inputs.emplace_back(b);
+  node->inputs.emplace_back(c);
+  d1->inputs.emplace_back(node);
+  b->outputs.emplace_back(node);
+  c->outputs.emplace_back(node);
+  op_descs.insert(op_descs.begin() + 4, op);
+  auto nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+// 4. add and delete some op_desc
+TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+  // remove sum node
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  auto nodes = graph.Nodes();
+  for (auto node : nodes) {
+    if (node->Name() == "sum") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* c = find_node_in_graph("c");
+    ir::Node* e = find_node_in_graph("e");
+    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+    std::remove(c->outputs.begin(), c->outputs.end(), found_node);
+    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
+    graph.RemoveNode(e);
+    graph.RemoveNode(pending_op);
+    graph.RemoveNode(found_node);
+  }
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+  op_descs.insert(op_descs.begin() + 2, op);
+  // check the order
+  auto mynodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < mynodes.size(); ++i) {
+    auto node = mynodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+// 5. add and replace some op_desc inplace.
+TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+  auto op_descs = prog.Block(0).AllOps();
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    d1->inputs.emplace_back(node);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+  op_descs.emplace_back(op);
+  // replace op_desc inplace
+  auto nodes = graph.Nodes();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->Op() && node->Name() == "assign") {
+      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
+        found_node = node;
+        break;
+      }
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* e = find_node_in_graph("e");
+    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+    std::remove(e->inputs.begin(), e->inputs.end(), found_node);
+    graph.RemoveNode(found_node);
+  }
+  op_descs.erase(op_descs.begin() + 3);
+  auto replace_op = prog.MutableBlock(0)->AppendOp();
+  replace_op->SetType("sum");
+  replace_op->SetInput("X", {"d", "d1"});
+  replace_op->SetOutput("Out", {"e"});
+  {
+    ir::Node* sum2 = graph.CreateOpNode(replace_op);
+    ir::Node* e = find_node_in_graph("e");
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* d1 = find_node_in_graph("d1");
+    sum2->inputs.emplace_back(d);
+    sum2->inputs.emplace_back(d1);
+    sum2->outputs.emplace_back(e);
+    e->inputs.emplace_back(sum2);
+    d->outputs.emplace_back(sum2);
+    d1->outputs.emplace_back(sum2);
+  }
+  op_descs.emplace_back(replace_op);
+  // compare op order
+  auto graph_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < graph_nodes.size(); ++i) {
+    auto node = graph_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
 }
......
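Reviewer note: with the CMakeLists change above, this file now builds as the single memory_optimize_helper_test target (the separate memory_optimize_pass_test target is gone); it can be run from the build directory via e.g. ctest -R memory_optimize_helper_test.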
...@@ -43,16 +43,10 @@ namespace paddle { ...@@ -43,16 +43,10 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes(); auto nodes = graph->Nodes();
auto subblock_vars = GetSubBlockVars(nodes); CollectSkipVarsSet(nodes);
skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
cfg_.reset(new details::ControlFlowGraph(*graph)); cfg_.reset(new details::ControlFlowGraph(*graph));
cfg_->LiveVariableAnalysis(); cfg_->LiveVariableAnalysis();
...@@ -75,82 +69,67 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -75,82 +69,67 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
    }
    for (auto& var : op->outputs) {
      if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
        VLOG(3) << "Skip set contains variable " << var->Name()
                << ", disable reuse on it. skipped";
        continue;
      }
      if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
        ir::Node* cache = pool_.FindBestFitNode(var);
        while (cache != nullptr && var->Name() == cache->Name()) {
          VLOG(3) << "The same cache variable is cascade reused. "
                  << cache->Name() << " is re-filled to the pool after "
                  << "the reused op is finished. Current op can not "
                  << "replace it again. Skip this candidate.";
          cache = pool_.FindNextBestFitNode(var, cache);
        }
        if (var->Name() == FLAGS_memory_optimize_debug) {
          VLOG(3) << "start match var " << DebugString(var) << " of op "
                  << op->Name();
          VLOG(3) << pool_.ToString();
          VLOG(3) << "matched in pool : "
                  << ((cache == nullptr) ? "False" : "True");
        }
        if (cache != nullptr) {
          int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
          VLOG(3) << string::Sprintf(
              "!!! %s, %s => %s, cache idx %d, pool size %d",
              std::to_string(reuse_id++), DebugString(var), DebugString(cache),
              node_idx_in_pool, static_cast<int>(pool_.size()));
          // NOTE(dzhwinter): update the ProgramDesc/IR Graph
          // and the CFG Graph on the fly.
          //
          // The IR Graph defines the dependence relationship between nodes.
          //
          // The ProgramDesc defines the input/output vars. It is used in
          // CreateOp, CreateVar when running happens.
          //
          // The CFG Graph stores the liveness information; when reuse
          // happens we also need to update the variable liveness.
          const std::string var_name = var->Name();
          const std::string cache_name = cache->Name();
          cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
          RenameVarInGraphDesc(var_name, cache_name, idx);
          RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
          pool_.Erase(cache_name);
        }
      }
    }
    // fill the pool
    for (auto var : cfg_->LiveIn(op)) {
      if (cfg_->LiveOut(op).count(var) == 0) {
        ir::Node* var_node = cfg_->GetNodeByName(var, op);
        if (var_node == nullptr || var_node->IsCtrlVar()) continue;
        if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
          pool_.Insert(var_node);
        }
      }
    }
  }
  graph->ResolveHazard(var_nodes_);
  return graph;
}
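// The following is a minimal, self-contained sketch of the best-fit idea
// suggested by pool_.FindBestFitNode() above. `ToyPool`, `free_vars` and
// `FindBestFit` are hypothetical illustration names, not the real OrderedSet
// API: the real pool works on ir::Node* candidates, while this toy matches
// on byte size only.
#include <map>
#include <string>

struct ToyPool {
  std::map<std::string, long> free_vars;  // var name -> size in bytes
  // Return the pooled var whose size is closest to `want`, or "" when empty.
  std::string FindBestFit(long want) const {
    std::string best;
    long best_diff = -1;
    for (const auto& kv : free_vars) {
      long diff = kv.second > want ? kv.second - want : want - kv.second;
      if (best_diff < 0 || diff < best_diff) {
        best_diff = diff;
        best = kv.first;
      }
    }
    return best;
  }
  // Once a pooled var is handed out for reuse, drop it from the free list.
  void Erase(const std::string& name) { free_vars.erase(name); }
};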
...@@ -199,12 +178,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
      PADDLE_ENFORCE(sub_op != nullptr);
      for (auto* var : sub_op->outputs) {
        if (NodeCanReused(var)) {
          ir::Node* cache = pool_.FindBestFitNode(var);
          if (cache != nullptr) {
            if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
              continue;
            }
            int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
            VLOG(3) << string::Sprintf(
                "!!! %s, %s => %s, cache idx %d, pool size %d",
                std::to_string(sub_reuse_id++), DebugString(var),
...@@ -224,20 +203,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
    }
  }
void MemoryOptimizePass::CollectSkipVarsSet(
    const std::unordered_set<ir::Node*>& nodes) const {
  auto update_skip_set = [&](OpDesc* op_desc) {
    auto inputs = op_desc->InputArgumentNames();
    auto outputs = op_desc->OutputArgumentNames();
    skip_set_.insert(inputs.begin(), inputs.end());
    skip_set_.insert(outputs.begin(), outputs.end());
  };
  for (auto& op : nodes) {
    if (!op->IsOp() || op->Op() == nullptr) continue;
    auto* op_desc = op->Op();
    // NOTE(dzhwinter):
    // the current block can not reuse the vars of a nested sub-block.
    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
    // NOTE(dzhwinter):
    // distributed ops' input/output names need to stay
    // the same between trainer and pserver.
    if (op_desc->Type() == "send") update_skip_set(op_desc);
    if (op_desc->Type() == "recv") update_skip_set(op_desc);
    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
  }
}
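// A hedged example of the rules above: for a hypothetical `while` op with
// input arguments {"i", "x"} and output argument {"out"}, update_skip_set
// puts "i", "x" and "out" into skip_set_, so their memory is never reused in
// this block; every send/recv/prefetch argument is skipped the same way.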
void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
...@@ -291,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
    // redirect the input to the latest version of cache_var
    for (auto* node : op->inputs) {
      if (node->Name() == var) {
        ir::Node* cache_node = var_nodes_[cache_var].back();
        // swap node to cache_node
        cache_node->outputs.insert(cache_node->outputs.end(),
...@@ -301,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
        auto* prev_op = node->inputs[0];
        std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
                     cache_node);
        cache_node->inputs.emplace_back(prev_op);
        for (auto* next_op : node->outputs) {
          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                       cache_node);
        }
        // erase unused node
        auto& nodes = var_nodes_.at(var);
        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
        graph->RemoveNode(node);
      }
    }
...@@ -325,271 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                       cache_node);
        }
        // erase unused node
        auto& nodes = var_nodes_.at(var);
        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
        graph->RemoveNode(node);
      }
    }
  }
}
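// Hedged summary of the rename above: var_nodes_ keeps every version of a
// variable name; reads of `var` are redirected to the newest `cache_var`
// node, and each superseded `var` node is unlinked from var_nodes_ and
// removed from the graph so later passes never see the stale name.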
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
std::deque<ir::Node*> ops;
FilterVariables(nodes, [&](ir::Node* op) {
if (op->IsOp() && op->Op() != nullptr) {
ops.emplace_back(op);
}
});
std::unordered_map<ir::Node*, size_t> op_deps;
std::list<ir::Node*> ready_ops;
std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
for (auto* op : ops) {
std::unordered_set<ir::Node*> preceding_op;
for (auto* in : op->inputs) {
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
preceding_op.emplace(in->inputs[0]);
pending_ops[in->inputs[0]].emplace(op);
}
op_deps[op] = preceding_op.size();
if (preceding_op.empty()) {
ready_ops.emplace_back(op);
}
}
  // 3. generate the op list based on the desc order and the topology order
std::vector<ir::Node*> ret;
std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
auto update_by_found_node = [&](ir::Node* found_node) {
for (auto* pending_op : pending_ops[found_node]) {
if (--op_deps[pending_op] == 0) {
ready_ops.emplace_back(pending_op);
}
}
ready_ops.remove(found_node);
ret.emplace_back(found_node);
};
while (!ready_ops.empty()) {
bool all_of_ready_op_unmatched = true;
for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
auto op_desc = *it;
ir::Node* found_node = nullptr;
for (auto* op : ready_ops) {
if (IsSameDesc(op->Op(), op_desc)) {
found_node = op;
break;
}
}
// 3.1 op desc deleted by other pass
if (found_node == nullptr) {
++it;
continue;
} else {
all_of_ready_op_unmatched = false;
it = op_descs_list.erase(it);
}
update_by_found_node(found_node);
}
    // 3.2 op descs added by another pass
    // a still-ready op here means some new op descs were created
    // but are not contained in the cached desc list; these new
    // op descs may depend on each other.
std::list<ir::Node*> prev_ready_ops(ready_ops);
if (all_of_ready_op_unmatched) {
for (auto op : prev_ready_ops) {
update_by_found_node(op);
}
}
}
PADDLE_ENFORCE(std::all_of(
op_deps.begin(), op_deps.end(),
[&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
return ret;
}
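// A hedged illustration of step 3.2 above: if none of the topologically
// ready ops matches any cached OpDesc (say, another pass created brand-new
// ops that never appeared in kAllOpDescs), the ready list is drained in
// topological order, so those new ops still end up in the returned sequence.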
ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
ops_ = SortOpLikeDescOrder(graph);
ConnectNodes();
}
void ControlFlowGraph::BuildCFGGraph() {
  // FIXME(dzh): same effect as ConnectNodes, but uses the control
  // links to build the dependency graph; it goes wrong on transformer.
for (ir::Node* op : ops_) {
for (auto& input_var : op->inputs) {
if (!input_var->inputs.empty()) {
PADDLE_ENFORCE(
input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
auto* pred_op = input_var->inputs[0];
if (pred_op->Op() != nullptr) {
predecessors_[op].insert(pred_op);
successors_[pred_op].insert(op);
}
}
if (input_var->IsVar() && !input_var->IsCtrlVar()) {
uses_[op].insert(input_var->Name());
}
}
for (auto& output_var : op->outputs) {
      // an output var may be used by many ops
for (auto* succ_op : output_var->outputs) {
if (succ_op->Op() != nullptr) {
successors_[op].insert(succ_op);
predecessors_[succ_op].insert(op);
}
}
if (output_var->IsVar() && !output_var->IsCtrlVar()) {
defs_[op].insert(output_var->Name());
}
}
}
}
void ControlFlowGraph::ConnectNodes() {
for (size_t i = 0; i < ops_.size(); ++i) {
auto& op = ops_[i];
try {
auto& next_op = ops_.at(i + 1);
successors_[op].insert(next_op);
predecessors_[next_op].insert(op);
} catch (...) {
// do nothing
}
FilterVariables(op->inputs,
[&](ir::Node* var) { uses_[op].emplace(var->Name()); });
FilterVariables(op->outputs,
[&](ir::Node* var) { defs_[op].emplace(var->Name()); });
}
}
void ControlFlowGraph::LiveVariableAnalysis() {
  // NOTE(dzh): variable liveness analysis (a.k.a. the reversed_ops algorithm)
  // computes the liveness of each variable with a backward pass over the ops.
  // It iterates the operators from end to begin, computing the live-in/live-out
  // variable sets for each op; the difference between in and out is then used
  // for variable reuse. For details refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
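  //
  // Restated as the classic dataflow equations that the worklist loop below
  // solves (a summary of the code, not extra semantics):
  //   live_out(op) = U_{s in successors(op)} live_in(s)
  //   live_in(op)  = use(op) U (live_out(op) - def(op))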
std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
while (!work_list.empty()) {
ir::Node* op = work_list.front();
work_list.pop_front();
// get the live_in calculated before. Empty if first.
auto prev_live_in = std::move(live_in_[op]);
for (auto& s : successors_[op]) {
for (auto& var : live_in_[s]) {
live_out_[op].insert(var);
}
}
for (auto& var : uses_[op]) {
live_in_[op].insert(var);
}
for (auto& var : live_out_[op]) {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
live_in_[op].erase(var);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if (live_in_[op] != prev_live_in) {
for (auto& pre : predecessors_[op]) {
work_list.push_back(pre);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
if (uses_[op].find(old_node) != uses_[op].end()) {
uses_[op].erase(old_node);
uses_[op].insert(new_node);
}
if (defs_[op].find(old_node) != defs_[op].end()) {
defs_[op].erase(old_node);
defs_[op].insert(new_node);
}
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
                                               ir::Node* op) const {
  // in ssa-graph, different version nodes have the same name;
  // this function gets the latest version var before the target op.
  // It may return nullptr, such as for a data node.
  ir::Node* found_node = nullptr;
  for (auto* node : ops_) {
    if (node == op) break;
    for (auto& output : node->outputs) {
      if (output->Name() == name) {
        found_node = output;
      }
    }
  }
  return found_node;
}
} // namespace details
...@@ -598,5 +330,4 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
REGISTER_PASS(memory_optimize_pass,
              paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
    .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
...@@ -32,20 +32,15 @@
namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
class ControlFlowGraph;
class MemoryOptimizePass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;

 private:
  // fill the variable map(var_nodes) by version.
  void InitSSAGraphNodes() const;
  // update program descs
  void RenameVarInGraphDesc(const std::string& var,
                            const std::string& cache_var, size_t idx) const;
...@@ -55,13 +50,14 @@ class MemoryOptimizePass : public ir::Pass {
                            ir::Graph* graph) const;
  void SubGraphOptimize(OpDesc* op_desc) const;
  // 1. scan ops with a sub-block and collect their output/input vars:
  //    while, while_grad, conditional_block
  // 2. scan distributed ops and collect their output/input vars
  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;

 private:
  // Reuse Node Pool, Owned.
  mutable OrderedSet pool_;
  // controlflow Graph
  mutable std::unique_ptr<ControlFlowGraph> cfg_;
  // skip set
...@@ -70,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
  mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
  // For the IR Graph in ParallelExecutor
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
  // successor ops that consume the output variables.
  NodeListMap successors_;
  // predecessor ops that produce the input variables.
  NodeListMap predecessors_;
  // variables live before running the current op.
  VarSetMap live_in_;
  // variables live after running the current op.
  VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
paddle::framework::AssignOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a custom classical dependency graph; the left column is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analyze these variables' liveness ranges.
*/
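/*
 A worked result, consistent with the asserts in CFGGraph.IRGraph below:
 running the live-variable equations over the four ops built by
 FillProgramDesc gives
   op0 assign(a)->b : live_in {a}      live_out {a, b}
   op1 assign(a)->c : live_in {a, b}   live_out {b, c}
   op2 sum(b, c)->d : live_in {b, c}   live_out {d}
   op3 assign(d)->e : live_in {d}      live_out {}
 so `a` dies after op1, `b` and `c` die after op2, and `d` dies after op3.
*/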
namespace paddle {
namespace framework {
namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"c"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"d"});
op->SetOutput("Out", {"e"});
}
return prog;
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
// test assign op
ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
// test sum op
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
}
// 1. normal test
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->outputs.back()->Name() == "e") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
graph.RemoveNode(found_node);
graph.RemoveNode(e);
  // the remaining nodes keep the same order
auto remain_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < remain_nodes.size(); ++i) {
auto node = remain_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
  // the cached descs differ from the real ones,
  // mimicking an intermediate pass modifying the ProgramDesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
op_descs.insert(op_descs.begin() + 4, op);
auto nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
if (node->Name() == "sum") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* c = find_node_in_graph("c");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(c->outputs.begin(), c->outputs.end(), found_node);
ir::Node* pending_op = found_node->outputs[0]->outputs[0];
graph.RemoveNode(e);
graph.RemoveNode(pending_op);
graph.RemoveNode(found_node);
}
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.insert(op_descs.begin() + 2, op);
// check the order
auto mynodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < mynodes.size(); ++i) {
auto node = mynodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.emplace_back(op);
// replace op_desc inplace
auto nodes = graph.Nodes();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->Op() && node->Name() == "assign") {
if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
found_node = node;
break;
}
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(e->inputs.begin(), e->inputs.end(), found_node);
graph.RemoveNode(found_node);
}
op_descs.erase(op_descs.begin() + 3);
auto replace_op = prog.MutableBlock(0)->AppendOp();
replace_op->SetType("sum");
replace_op->SetInput("X", {"d", "d1"});
replace_op->SetOutput("Out", {"e"});
{
ir::Node* sum2 = graph.CreateOpNode(replace_op);
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
ir::Node* d1 = find_node_in_graph("d1");
sum2->inputs.emplace_back(d);
sum2->inputs.emplace_back(d1);
sum2->outputs.emplace_back(e);
e->inputs.emplace_back(sum2);
d->outputs.emplace_back(sum2);
d1->outputs.emplace_back(sum2);
}
op_descs.emplace_back(replace_op);
// compare op order
auto graph_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < graph_nodes.size(); ++i) {
auto node = graph_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -17,6 +17,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
...
...@@ -21,8 +21,6 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
class SequentialExecutionPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
...
...@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
  bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
    return in.Name() != out.Name() && details::NodeCanReused(in) &&
           details::NodeCanReused(out) &&
           details::NodeSize(out) <= details::NodeSize(in);
  }
};
...
...@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
  op->SetOutput("Out", {"test2_out"});
  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_out");
  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
  auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
  op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_out");
  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
  auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
  prog.MutableBlock(0)->Var("o0");
  prog.MutableBlock(0)->Var("y0");
  prog.MutableBlock(0)->Var("z0");
  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
  auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -267,15 +267,16 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
  prog.MutableBlock(0)->Var("o0");
  prog.MutableBlock(0)->Var("y0");
  prog.MutableBlock(0)->Var("z0");
  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
  auto in_to_outs = infer_inplace(*op, op->Block());
  EXPECT_EQ(in_to_outs.size(), 3ul);
  std::unordered_map<std::string, std::string> expects = {
      {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
...
...@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
                      ->assert_is_op("scale")
                      ->assert_op_attr<float>("scale", 1.)
                      ->assert_op_attr<float>("bias", 0.);
  auto scale_out =
      detector.mutable_pattern()
          ->NewNode("scale_out")
          ->assert_is_op_output("scale")
          // scale's output var should have only one consumer, or it can't be
          // removed.
          ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
  pre_op->LinksTo({scale_in});
  scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
...
...@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
    std::unordered_set<const Node*> invalid_nodes;
    int valid_op = 0;
    for (auto* node : graph->Nodes()) {
      PADDLE_ENFORCE_NOT_NULL(node);
      if (is_valid_node(node)) {
        invalid_nodes.insert(node);
      } else if (node->IsOp()) {
...
...@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
    eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
    graph = eager_deletion_pass->Apply(std::move(graph));
    VLOG(10) << "EagerDeletionPass Applied";
if (build_strategy_.memory_early_delete_) {
auto early_delete_pass =
ir::PassRegistry::Instance().Get("memory_early_delete_pass");
early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
graph = early_delete_pass->Apply(std::move(graph));
}
VLOG(10) << "MemoryEarlyDeletePass Applied.";
  }
  return graph;
...@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
    graphs.push_back(std::move(graph));
#endif
  auto max_memory_size = GetEagerDeletionThreshold();
  VLOG(10) << "Eager Deletion Threshold "
           << static_cast<float>(max_memory_size) / (1 << 30);
  if (max_memory_size >= 0) {
    for (size_t i = 0; i < graphs.size(); ++i) {
      graphs[i] = member_->PrepareGCAndRefCnts(
...@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework
} // namespace paddle
USE_PASS(memory_early_delete_pass);
USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass);
...@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
  return node.inputs.size() == n;
}
NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
// CHECK all the inputs' in-degree is 0
for (auto *node : source) {
PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
if (Agent(p).deleted()) {
visited.insert(p);
to_visit.erase(p);
}
inlink_visited.clear();
std::copy_if(p->inputs.begin(), p->inputs.end(),
std::back_inserter(inlink_visited),
[&](Node *x) -> bool { return visited.count(x) != 0; });
if (inlink_visited.size() == p->inputs.size()) {
sorted_.push_back(p);
for (auto *_ : p->outputs) {
if (!visited.count(_)) {
to_visit.insert(_);
}
}
to_visit.erase(p);
visited.insert(p);
}
}
}
}
NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
Node &NodesTSIterator::operator*() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return *sorted_[cursor_];
}
NodesTSIterator &NodesTSIterator::operator++() {
if (++cursor_ >= sorted_.size()) {
sorted_.clear();
cursor_ = 0;
}
return *this;
}
NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
bool NodesTSIterator::operator==(const NodesTSIterator &other) {
return sorted_ == other.sorted_ && cursor_ == other.cursor_;
}
Node *NodesTSIterator::operator->() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return sorted_[cursor_];
}
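// Note (a reading of operator++ above): once cursor_ runs past sorted_, the
// iterator resets itself to the empty state, which is exactly what a
// default-constructed NodesTSIterator holds; that is why a default-constructed
// iterator can serve as the end sentinel in operator==.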
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -30,6 +30,7 @@ namespace inference {
namespace analysis {
using framework::ir::Graph;
using framework::ir::NodesTSIterator;
const char kIsFunctionNode[] = "__is_function_node__";
const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
...@@ -132,32 +133,6 @@ struct Agent {
  framework::ir::Node *x_;
};
// Topological sorting iterator on nodes.
struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
NodesTSIterator() = default;
explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0;
}
NodesTSIterator(const NodesTSIterator &other);
framework::ir::Node &operator*();
NodesTSIterator &operator++();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesTSIterator &operator=(const NodesTSIterator &other);
bool operator==(const NodesTSIterator &other);
bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
framework::ir::Node *operator->();
private:
std::vector<framework::ir::Node *> sorted_;
size_t cursor_{0};
};
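// Hedged usage sketch of the iterator declared above (the graph `g` is
// hypothetical): walk every reachable node in topological order, using a
// default-constructed iterator as the end sentinel.
//
//   for (NodesTSIterator it(ExtractStartPoints(g)), end; it != end; ++it) {
//     VLOG(4) << "visit " << it->Name();
//   }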
// The nodes that have no input will be treated as start points.
static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
  std::vector<framework::ir::Node *> result;
...
...@@ -16,6 +16,12 @@
/*! \file paddle_api.h
 */
/*! \mainpage Paddle Inference APIs
* \section intro_sec Introduction
 * The Paddle inference library aims to offer a high-performance inference SDK
* for Paddle users.
*/
#include <cassert>
#include <memory>
#include <string>
...@@ -34,26 +40,49 @@ enum PaddleDType {
};
/**
 * \brief Memory manager for `PaddleTensor`.
 *
 * The PaddleBuf holds a buffer for data input or output. The memory can be
 * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
 * should be reused for better performance.
 *
 * For user allocated memory, the following API can be used:
 * - PaddleBuf(void* data, size_t length) to set an external memory by
 * specifying the memory address and length.
 * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
 * memory.
 * ATTENTION, for user allocated memory, deallocation should be done by users
 * externally after the program finished. The PaddleBuf won't do any allocation
 * or deallocation.
 *
 * To have the PaddleBuf allocate and manage the memory:
 * - PaddleBuf(size_t length) will allocate a memory of size `length`.
 * - Resize(size_t length) resize the memory to no less than `length`.
 *   ATTENTION, if the allocated memory is larger than `length`, nothing will
 *   be done.
 *
 * Usage:
 *
 * Let PaddleBuf manage the memory internally.
 * \code{cpp}
 * const int num_elements = 128;
 * PaddleBuf buf(num_elements * sizeof(float));
 * \endcode
 *
 * Or
 * \code{cpp}
 * PaddleBuf buf;
 * buf.Resize(num_elements * sizeof(float));
 * \endcode
 * Works exactly the same.
 *
 * One can also make the `PaddleBuf` use external memory.
 * \code{cpp}
 * PaddleBuf buf;
 * void* external_memory = new float[num_elements];
 * buf.Reset(external_memory, num_elements * sizeof(float));
 * ...
 * delete[] external_memory; // manage the memory lifetime outside.
 * \endcode
 */
class PaddleBuf {
 public:
...@@ -78,7 +107,7 @@ class PaddleBuf {
  /** Tell whether the buffer is empty.
   */
  bool empty() const { return length_ == 0; }
  /** Get the data's memory address.
   */
  void* data() const { return data_; }
  /** Get the memory length.
...@@ -110,7 +139,8 @@ struct PaddleTensor {
};
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };

/** Tensor without copy, currently only supports `AnalysisPredictor`.
 */
class ZeroCopyTensor {
 public:
...@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
 *
 * Usage:
 *
 * \code{.cpp}
 * NativeConfig config;
 * ... // change the configs.
 * auto native_predictor = CreatePaddlePredictor(config);
 * \endcode
 *
 * FOR EXTENSION DEVELOPER:
 * Different predictors are designated by config type. Similar configs can be
...
...@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
  LOG(ERROR) << "GPU not support MKLDNN yet";
}
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif
});
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
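// For reference, the loop above appends these passes in this order (a
// mechanical expansion of i = 6..3, no new behavior):
//   transpose_flatten6_concat_fuse_pass
//   transpose_flatten5_concat_fuse_pass
//   transpose_flatten4_concat_fuse_pass
//   transpose_flatten3_concat_fuse_pass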
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
  analysis_passes_.push_back(pass);
}
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
"repeated_fc_relu_fuse_pass", //
"squared_mat_sub_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"identity_scale_op_clean_pass", //
});
use_gpu_ = false;
}
} // namespace paddle
...@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
 */
class CpuPassStrategy : public PassStrategy {
 public:
  CpuPassStrategy();
  explicit CpuPassStrategy(const CpuPassStrategy &other)
      : PassStrategy(other.AllPasses()) {}
...@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
 */
class GpuPassStrategy : public PassStrategy {
 public:
  GpuPassStrategy();
  explicit GpuPassStrategy(const GpuPassStrategy &other)
      : PassStrategy(other.AllPasses()) {
...
...@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                                  comment.type));
    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
                                  comment.type));
AddAttr<int>(
"axis",
"The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1)
.EqualGreaterThan(-1);
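    // Hedged example of the "axis" attribute above, assuming the usual
    // elementwise broadcasting rule: with X of shape [2, 3, 4, 5] and Y of
    // shape [3, 4], axis = 1 aligns Y with dims 1..2 of X; axis = -1 means
    // "align Y to the trailing dimensions of X".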
AddAttr<bool>("force_cpu", AddAttr<bool>("force_cpu",
"Force fill output variable to cpu " "Force fill output variable to cpu "
"memory. Otherwise, fill output variable to the running " "memory. Otherwise, fill output variable to the running "
...@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is ...@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is
calculated by $%s$ calculated by $%s$
)DOC", )DOC",
comment.equation)); comment.equation));
AddAttr<int>(
"axis",
"The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1)
.EqualGreaterThan(-1);
} }
}; };
......
...@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
    for (size_t i = 0; i < fixed_ratios.size(); i++) {
      sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
    }
...@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
      }
    }
    if (clip) {
      T* dt = boxes->data<T>();
      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
        return std::min<T>(std::max<T>(v, 0.), 1.);
      });
    }
    framework::Tensor var_t;
    var_t.mutable_data<T>(
...@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#pragma omp parallel for collapse(2)
#endif
    for (int i = 0; i < box_num; ++i) {
      for (size_t j = 0; j < variances.size(); ++j) {
        e_vars(i, j) = variances[j];
      }
    }
...
...@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
  }
}
template <typename T>
struct ClipFunctor {
HOSTDEVICE inline T operator()(T in) const {
return std::min<T>(std::max<T>(in, 0.), 1.);
}
};
template <typename T>
class PriorBoxOpKernel : public framework::OpKernel<T> {
 public:
...@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
    boxes->mutable_data<T>(ctx.GetPlace());
    vars->mutable_data<T>(ctx.GetPlace());
    T* b_t = boxes->data<T>();
    for (int h = 0; h < feature_height; ++h) {
      for (int w = 0; w < feature_width; ++w) {
        T center_x = (w + offset) * step_width;
        T center_y = (h + offset) * step_height;
        T box_width, box_height;
        for (size_t s = 0; s < min_sizes.size(); ++s) {
          auto min_size = min_sizes[s];
          if (min_max_aspect_ratios_order) {
            box_width = box_height = min_size / 2.;
            b_t[0] = (center_x - box_width) / img_width;
            b_t[1] = (center_y - box_height) / img_height;
            b_t[2] = (center_x + box_width) / img_width;
            b_t[3] = (center_y + box_height) / img_height;
            b_t += 4;
            if (max_sizes.size() > 0) {
              auto max_size = max_sizes[s];
              // square prior with size sqrt(minSize * maxSize)
              box_width = box_height = sqrt(min_size * max_size) / 2.;
              b_t[0] = (center_x - box_width) / img_width;
              b_t[1] = (center_y - box_height) / img_height;
              b_t[2] = (center_x + box_width) / img_width;
              b_t[3] = (center_y + box_height) / img_height;
              b_t += 4;
            }
            // priors with different aspect ratios
...@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
} }
box_width = min_size * sqrt(ar) / 2.; box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.; box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
} else { } else {
// priors with different aspect ratios // priors with different aspect ratios
...@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
float ar = aspect_ratios[r]; float ar = aspect_ratios[r];
box_width = min_size * sqrt(ar) / 2.; box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.; box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
auto max_size = max_sizes[s]; auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize) // square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.; box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
} }
} }
...@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
} }
if (clip) { if (clip) {
platform::Transform<platform::CPUDeviceContext> trans; T* dt = boxes->data<T>();
ClipFunctor<T> clip_func; std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
trans(ctx.template device_context<platform::CPUDeviceContext>(), return std::min<T>(std::max<T>(v, 0.), 1.);
boxes->data<T>(), boxes->data<T>() + boxes->numel(), });
boxes->data<T>(), clip_func);
} }
framework::Tensor var_t; framework::Tensor var_t;
......
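The prior_box hunks above swap Eigen tensor indexing (`e_boxes(h, w, idx, k)`) for a raw pointer `b_t` that writes 4 normalized corner coordinates per prior and then advances by 4. A sketch of the layout for one grid cell, assuming illustrative hyper-parameters (step size, offset, min_size, and aspect ratios are not from any real config):

```python
import math

img_width, img_height = 300.0, 300.0
step_width, step_height, offset = 8.0, 8.0, 0.5
h, w = 3, 4
min_size, aspect_ratios = 32.0, [1.0, 2.0]

center_x = (w + offset) * step_width
center_y = (h + offset) * step_height

flat = []  # mimics the raw b_t pointer: 4 floats appended per prior
for ar in aspect_ratios:
    box_w = min_size * math.sqrt(ar) / 2.0
    box_h = min_size / math.sqrt(ar) / 2.0
    flat += [(center_x - box_w) / img_width,   # xmin
             (center_y - box_h) / img_height,  # ymin
             (center_x + box_w) / img_width,   # xmax
             (center_y + box_h) / img_height]  # ymax
print(flat)
```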
...@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
"The ignore threshold to ignore confidence loss.") "The ignore threshold to ignore confidence loss.")
.SetDefault(0.7); .SetDefault(0.7);
AddComment(R"DOC( AddComment(R"DOC(
This operator generates yolov3 loss based on given predict result and ground
truth boxes.

The output of the previous network is in shape [N, C, H, W], where H and W
must be the same and specify the grid size. Each grid point predicts a given
number of boxes; this number, hereafter denoted S, is given by the number of
anchors. In the second (channel) dimension, C should be equal to
S * (class_num + 5), where class_num is the object category number of the
source dataset (such as 80 in the COCO dataset); so, apart from the 4 box
location coordinates x, y, w, h, the channel dimension also stores the
confidence score of the box and the class one-hot key of each anchor box.

Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`; the box
predictions should be as follows:

$$
b_x = \\sigma(t_x) + c_x
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
$$

In the equations above, :math:`c_x, c_y` is the left-top corner of the current
grid and :math:`p_w, p_h` are specified by the anchors.

As for the confidence score, it is the logistic regression value of the IoU
between anchor boxes and ground truth boxes; the score of the anchor box with
the maximum IoU should be 1, and if an anchor box has IoU greater than ignore
thresh, the confidence score loss of that anchor box will be ignored.

Therefore, the yolov3 loss consists of three major parts: box location loss,
...@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
In order to trade off box coordinate losses between big boxes and small In order to trade off box coordinate losses between big boxes and small
boxes, box coordinate losses will be multiplied by scale weight, which is boxes, box coordinate losses will be multiplied by scale weight, which is
calculated as follow. calculated as follows.
$$ $$
weight_{box} = 2.0 - t_w * t_h weight_{box} = 2.0 - t_w * t_h
$$ $$
Final loss will be represented as follow. Final loss will be represented as follows.
$$ $$
loss = (loss_{xy} + loss_{wh}) * weight_{box} loss = (loss_{xy} + loss_{wh}) * weight_{box}
......
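The box-decoding equations in the revised comment translate directly into a few lines of arithmetic. A worked sketch with illustrative values (the anchor prior (116, 90) is one of the pairs from the anchors list used elsewhere in this PR; the raw predictions are made up):

```python
import math

def sigmoid(v):
    return 1.0 / (1.0 + math.exp(-v))

# Raw predictions t_x, t_y, t_w, t_h for one anchor at grid cell
# (c_x, c_y) = (3, 4), with anchor prior (p_w, p_h) = (116, 90).
t_x, t_y, t_w, t_h = 0.2, -0.1, 0.5, 0.3
c_x, c_y, p_w, p_h = 3.0, 4.0, 116.0, 90.0

b_x = sigmoid(t_x) + c_x   # box center x, in grid units
b_y = sigmoid(t_y) + c_y   # box center y, in grid units
b_w = p_w * math.exp(t_w)  # box width, at anchor scale
b_h = p_h * math.exp(t_h)  # box height
print(b_x, b_y, b_w, b_h)
```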
...@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { ...@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
} }
}; };
class ElementwiseGradOpInplace : public framework::InplaceInToOut {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> ret;
if (block->HasVar(framework::GradVarName("X")) &&
block->HasVar(framework::GradVarName("Out"))) {
ret[framework::GradVarName("Out")] = framework::GradVarName("X");
}
return ret;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { ...@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
op_type##GradMaker, \ op_type##GradMaker, \
::paddle::operators::ElementwiseOpInplace); \ ::paddle::operators::ElementwiseOpInplace); \
REGISTER_OPERATOR(op_type##_grad, \ REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad) ::paddle::operators::ElementwiseOpExplicitGrad, \
::paddle::operators::ElementwiseGradOpInplace)
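The new `ElementwiseGradOpInplace` pass registered above tells the framework that the backward op may reuse the `Out@GRAD` buffer as `X@GRAD` when both variables exist in the block (for elementwise_add, for example, dL/dX equals dL/dOut, so the write is safe). A small Python sketch of the mapping decision that `Apply` encodes; the gradient variable names are illustrative stand-ins for `framework::GradVarName`:

```python
# When both gradient variables are present, rename Out@GRAD onto X@GRAD so
# the executor reuses a single buffer for both.
def grad_inplace_mapping(block_vars):
    mapping = {}
    if "X@GRAD" in block_vars and "Out@GRAD" in block_vars:
        mapping["Out@GRAD"] = "X@GRAD"
    return mapping

print(grad_inplace_mapping({"X@GRAD", "Out@GRAD", "Y"}))
# {'Out@GRAD': 'X@GRAD'}
```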
...@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr; lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
} }
// lstm_value.output_value not used in bp, set to nullptr
// lstm_grad.state_active_grad not used in bp, set to nullptr
lstm_value.output_value = nullptr;
lstm_grad.state_active_grad = nullptr;
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
math::LstmUnitGradFunctor<DeviceContext, T>::compute( math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
......
...@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
} }
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
// lstmp_value.output_value not used in bp, set to null
// lstmp_grad.state_active_grad not used in bp, set to null
lstmp_value.output_value = nullptr;
lstmp_grad.state_active_grad = nullptr;
math::LstmUnitGradFunctor<DeviceContext, T>::compute( math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act); gate_act, cell_act, cand_act);
......
...@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() { ...@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
"be ignored."); // TODO(Chengduo): Add checker. "be ignored."); // TODO(Chengduo): Add checker.
// (Currently, // (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<bool>("global_pooling", AddAttr<bool>(
"(bool, default false) Whether to use the global pooling. " "global_pooling",
"If global_pooling = true, ksize and paddings will be ignored.") "(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, kernel size and paddings will be ignored.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>("strides",
"(vector<int>, default {1, 1}), strides(height, " "(vector<int>, default {1, 1}), strides(height, "
...@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() { ...@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
"paddings", "paddings",
"(vector<int>, default {0,0}), paddings(height, width) of pooling " "(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator." "operator."
"If global_pooling = true, paddings and ksize will be ignored.") "If global_pooling = true, paddings and kernel size will be ignored.")
.SetDefault({0, 0}); .SetDefault({0, 0});
AddAttr<bool>( AddAttr<bool>(
"exclusive", "exclusive",
...@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() { ...@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
.SetDefault(false); .SetDefault(false);
AddAttr<bool>( AddAttr<bool>(
"ceil_mode", "ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate " "(bool, default false) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, " "output height and width. False is the default. If it is set to False, "
"the floor function will be used.") "the floor function will be used.")
.SetDefault(false); .SetDefault(false);
...@@ -259,31 +260,40 @@ Example: ...@@ -259,31 +260,40 @@ Example:
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$ $$
  For exclusive = false:
  $$
  hstart = i * strides[0] - paddings[0]
  $$
  $$
  hend = hstart + ksize[0]
  $$
  $$
  wstart = j * strides[1] - paddings[1]
  $$
  $$
  wend = wstart + ksize[1]
  $$
  $$
  Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
  $$

  For exclusive = true:
  $$
  hstart = max(0, i * strides[0] - paddings[0])
  $$
  $$
  hend = min(H, hstart + ksize[0])
  $$
  $$
  wstart = max(0, j * strides[1] - paddings[1])
  $$
  $$
  wend = min(W, wstart + ksize[1])
  $$
  $$
  Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
  $$
For adaptive = true:
$$
hstart = floor(i * H_{in} / H_{out})
hend = ceil((i + 1) * H_{in} / H_{out})
wstart = floor(j * W_{in} / W_{out})
wend = ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC"); )DOC");
} }
...@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() { ...@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
AddAttr<bool>( AddAttr<bool>(
"global_pooling", "global_pooling",
"(bool, default false) Whether to use the global pooling. " "(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, ksize and paddings wille be ignored.") "If global_pooling = true, kernel size and paddings will be ignored.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"strides", "strides",
...@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() { ...@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
.SetDefault(false); .SetDefault(false);
AddAttr<bool>( AddAttr<bool>(
"ceil_mode", "ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate " "(bool, default false) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, " "output height and width. False is the default. If it is set to False, "
"the floor function will be used.") "the floor function will be used.")
.SetDefault(false); .SetDefault(false);
...@@ -392,48 +402,68 @@ Example: ...@@ -392,48 +402,68 @@ Example:
Output: Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
  For ceil_mode = false:
  $$
  D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
  $$
  $$
  H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
  $$
  $$
  W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
  $$
  For ceil_mode = true:
  $$
  D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
  $$
  $$
  H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
  $$
  $$
  W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
  $$

  For exclusive = false:
  $$
  dstart = i * strides[0] - paddings[0]
  $$
  $$
  dend = dstart + ksize[0]
  $$
  $$
  hstart = j * strides[1] - paddings[1]
  $$
  $$
  hend = hstart + ksize[1]
  $$
  $$
  wstart = k * strides[2] - paddings[2]
  $$
  $$
  wend = wstart + ksize[2]
  $$
  $$
  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
  $$

  For exclusive = true:
  $$
  dstart = max(0, i * strides[0] - paddings[0])
  $$
  $$
  dend = min(D, dstart + ksize[0])
  $$
  $$
  hstart = max(0, j * strides[1] - paddings[1])
  $$
  $$
  hend = min(H, hstart + ksize[1])
  $$
  $$
  wstart = max(0, k * strides[2] - paddings[2])
  $$
  $$
  wend = min(W, wstart + ksize[2])
  $$
  $$
  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
  $$

  For adaptive = true:
  $$
  dstart = floor(i * D_{in} / D_{out})
  dend = ceil((i + 1) * D_{in} / D_{out})
  hstart = floor(j * H_{in} / H_{out})
  hend = ceil((j + 1) * H_{in} / H_{out})
  wstart = floor(k * W_{in} / W_{out})
  wend = ceil((k + 1) * W_{in} / W_{out})
  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
  $$
)DOC"); )DOC");
} }
......
...@@ -121,7 +121,7 @@ struct RandomCropFunctor { ...@@ -121,7 +121,7 @@ struct RandomCropFunctor {
HOSTDEVICE void operator()(size_t ins_idx) { HOSTDEVICE void operator()(size_t ins_idx) {
typename Random<DeviceContext>::Engine engine(seed_); typename Random<DeviceContext>::Engine engine(seed_);
engine.discard(ins_idx * (rank_ - num_batchsize_dims_)); engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
size_t offsets[9]; size_t offsets[9] = {};
for (int i = num_batchsize_dims_; i < rank_; ++i) { for (int i = num_batchsize_dims_; i < rank_; ++i) {
typename Random<DeviceContext>::template UniformIntDist<size_t> dist( typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
0, x_dims_[i] - out_dims_[i]); 0, x_dims_[i] - out_dims_[i]);
......
...@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <thrust/device_vector.h>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
template <size_t D>
__global__ void Padding(const paddle::platform::float16* d_out,
const int* out_dims, const int* in_dims,
const int* offsets, int64_t n,
paddle::platform::float16* d_in) {
int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
if (out_idx < n) {
int64_t out_idx_tmp = out_idx;
int coords[D] = {0};
for (int i = D - 1; i >= 0; --i) {
coords[i] = out_idx_tmp % out_dims[i];
out_idx_tmp /= out_dims[i];
coords[i] += offsets[i];
}
int64_t in_idx = 0;
for (int i = 0; i < D; ++i) {
in_idx = in_idx * in_dims[i] + coords[i];
}
d_in[in_idx] = d_out[out_idx];
}
}
template <>
class SliceGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>
: public framework::OpKernel<paddle::platform::float16> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
auto out_dims = d_out->dims();
auto in_dims = d_in->dims();
int rank = out_dims.size();
std::vector<int> offsets(rank, 0);
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
for (size_t i = 0; i < starts.size(); ++i) {
if (starts[i] < 0) {
starts[i] += in_dims[axes[i]];
}
offsets[axes[i]] = std::max(starts[i], 0);
}
math::SetConstant<paddle::platform::CUDADeviceContext,
paddle::platform::float16>
set_zero;
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
int64_t numel = d_out->numel();
dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
dim3 threads(PADDLE_CUDA_NUM_THREADS);
auto stream = ctx.cuda_device_context().stream();
auto out_shape = framework::vectorize2int(out_dims);
thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
auto in_shape = framework::vectorize2int(in_dims);
thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());
switch (rank) {
case 1:
Padding<1><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 2:
Padding<2><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 3:
Padding<3><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 4:
Padding<4><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 5:
Padding<5><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 6:
Padding<6><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
}
}
};
} // namespace operators
} // namespace paddle
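The new float16 `SliceGradKernel` above zero-fills `d_in` and scatters each `d_out` element into place: the `Padding<D>` kernel decomposes the flat output index into coordinates, adds the slice offsets, and re-flattens against the input dims. A NumPy sketch of the same computation; shapes and offsets are illustrative:

```python
import numpy as np

in_shape, out_shape, offsets = (4, 6), (2, 3), (1, 2)

d_out = np.arange(np.prod(out_shape), dtype=np.float32).reshape(out_shape)
d_in = np.zeros(in_shape, dtype=np.float32)  # the set_zero step

# Vectorized equivalent of the per-element coordinate arithmetic: write the
# sliced gradient back into the zero tensor at the slice offsets.
region = tuple(slice(o, o + s) for o, s in zip(offsets, out_shape))
d_in[region] = d_out
print(d_in)
```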
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>, slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, double>, ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int>, ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
slice_grad, slice_grad,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
...@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>(); context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
Tensor* logit_grad = Tensor* logit_grad =
context.Output<Tensor>(framework::GradVarName("Logits")); context.Output<Tensor>(framework::GradVarName("Logits"));
logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax")); framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
context.device_context(), logit_grad);
T* logit_grad_data = logit_grad->data<T>(); T* logit_grad_data = logit_grad->data<T>();
const int batch_size = logit_grad->dims()[0]; const int batch_size = logit_grad->dims()[0];
......
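The `ShareDataWith` to `TensorCopy` change above stops the backward kernel from aliasing the forward `Softmax` output: the gradient is computed in place on `logit_grad`, which previously would have overwritten the forward tensor. The gradient itself is softmax minus the one-hot label, scaled by the incoming loss gradient. A NumPy sketch with illustrative values:

```python
import numpy as np

softmax = np.array([[0.7, 0.2, 0.1]])
label = np.array([0])
loss_grad = np.array([[1.0]])

d_logits = softmax.copy()  # the TensorCopy step: work on a copy, not an alias
d_logits[np.arange(len(label)), label] -= 1.0  # subtract one-hot(label)
d_logits *= loss_grad
print(d_logits)  # [[-0.3  0.2  0.1]]
```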
...@@ -398,9 +398,9 @@ PYBIND11_MODULE(core, m) { ...@@ -398,9 +398,9 @@ PYBIND11_MODULE(core, m) {
py::arg("recursive_sequence_lengths"), R"DOC( py::arg("recursive_sequence_lengths"), R"DOC(
Set LoD of the LoDTensor according to recursive sequence length. Set LoD of the LoDTensor according to recursive sequence length.
For example, if recursive_sequence_lengths=[2, 3], meaning that For example, if recursive_sequence_lengths=[[2, 3]], meaning that
there are two sequences with length 2 and 3 respectively, the there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [0, 2, 2+3], i.e, [0, 2, 5]. corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]].
Args: Args:
recursive_sequence_lengths (List[List[int]]): sequence lengths. recursive_sequence_lengths (List[List[int]]): sequence lengths.
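The corrected docstring above maps length-based LoD to offset-based LoD; the conversion is a running sum per level with a leading 0. A small sketch:

```python
from itertools import accumulate

# Convert length-based recursive_sequence_lengths to offset-based LoD.
def lengths_to_lod(recursive_sequence_lengths):
    return [[0] + list(accumulate(level))
            for level in recursive_sequence_lengths]

print(lengths_to_lod([[2, 3]]))  # [[0, 2, 5]]
```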
...@@ -803,9 +803,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -803,9 +803,11 @@ All parameter, weight, gradient are variables in Paddle.
.def(py::init<const platform::Place &>()) .def(py::init<const platform::Place &>())
.def("close", &Executor::Close) .def("close", &Executor::Close)
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars) { int block_id, bool create_local_scope, bool create_vars,
const std::vector<std::string> &fetch_vars) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars); self.Run(prog, scope, block_id, create_local_scope, create_vars,
fetch_vars);
}); });
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
......
This diff is collapsed.
...@@ -25,4 +25,5 @@ import paddle.reader ...@@ -25,4 +25,5 @@ import paddle.reader
import paddle.dataset import paddle.dataset
import paddle.batch import paddle.batch
import paddle.compat import paddle.compat
import paddle.distributed
batch = batch.batch batch = batch.batch
...@@ -161,7 +161,6 @@ def __bootstrap__(): ...@@ -161,7 +161,6 @@ def __bootstrap__():
'times_excess_than_required_tmp_allocation', 'times_excess_than_required_tmp_allocation',
'enable_inplace_whitelist' 'enable_inplace_whitelist'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
......
...@@ -177,7 +177,10 @@ class CompiledProgram(object): ...@@ -177,7 +177,10 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True if self._build_strategy.memory_optimize is True:
self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is True:
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
if self._build_strategy.num_trainers > 1 and trainers_endpoints: if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len( assert self._build_strategy.num_trainers == len(
......
...@@ -63,10 +63,10 @@ Notes: ...@@ -63,10 +63,10 @@ Notes:
## 4. How to reproduce the results ## 4. How to reproduce the results
* Small dataset * Small dataset
```bash ```bash
python python/paddle/fluid/contrib/tests/test_calibration.py FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
``` ```
* Full dataset * Full dataset
```bash ```bash
DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
``` ```
...@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL) ...@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
endif() endif()
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) if(src MATCHES "test_calibration")
py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach() endforeach()
...@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
def run_program(self, model_path, generate_int8=False, algo='direct'): def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224] image_shape = [3, 224, 224]
os.environ['FLAGS_use_mkldnn'] = 'True'
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
...@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
label = label.reshape([-1, 1]) label = label.reshape([-1, 1])
running_program = calibrator.sampling_program.clone( running_program = calibrator.sampling_program.clone(
) if generate_int8 else infer_program.clone() ) if generate_int8 else infer_program.clone()
for op in running_program.current_block().ops:
if op.has_attr("use_mkldnn"):
op._set_attr("use_mkldnn", True)
t1 = time.time() t1 = time.time()
_, acc1, _ = exe.run( _, acc1, _ = exe.run(
......
...@@ -261,45 +261,42 @@ def _as_lodtensor(data, place): ...@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
class Executor(object): class Executor(object):
""" """
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
ParallelExecutor. Python executor takes a program, adds feed operators and fetch operators to this program according
Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all the variables(or names) that user wants to get after program runs. Note: the executor will run all
operators in the program, not only the operators on which the fetch_list depends. operators in the program, not only the operators on which the fetch_list depends.
It store the global variables into the global scope, and create a local scope for the temporary It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished. variables. The contents in local scope may be discarded after every minibatch forward/backward
But the global scope variables will be persistent through different runs. finished. But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
Example: Example:
.. code-block:: python
# First create the Executor. .. code-block:: python
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) # First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
# Run the startup program once and only once. exe = fluid.Executor(place)
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program()) # Run the startup program once and only once.
# Not need to optimize/compile the startup program.
# Run the main program directly without compile. exe.run(fluid.default_startup_program())
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict, # Run the main program directly without compile.
fetch_list=[loss.name]) loss, = exe.run(fluid.default_main_program(),
# Or, compiled the program and run. See `CompiledProgram` for more detail. feed=feed_dict,
compiled_prog = compiler.CompiledProgram( fetch_list=[loss.name])
fluid.default_main_program()).with_data_parallel( # Or, compiled the program and run. See `CompiledProgram` for more detail.
loss_name=loss.name) compiled_prog = compiler.CompiledProgram(
loss, = exe.run(compiled_prog, fluid.default_main_program()).with_data_parallel(
feed=feed_dict, loss_name=loss.name)
fetch_list=[loss.name]) loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args: Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
""" """
def __init__(self, place): def __init__(self, place):
...@@ -382,6 +379,12 @@ class Executor(object): ...@@ -382,6 +379,12 @@ class Executor(object):
] ]
return outs return outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def close(self): def close(self):
""" """
Close this executor. Close this executor.
...@@ -389,9 +392,6 @@ class Executor(object): ...@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method. You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to For the distributed training, this method would free the resource on PServers related to
the current Trainer. the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example: Example:
>>> cpu = core.CPUPlace() >>> cpu = core.CPUPlace()
......
...@@ -555,7 +555,8 @@ class OpProtoHolder(object): ...@@ -555,7 +555,8 @@ class OpProtoHolder(object):
return { return {
core.op_proto_and_checker_maker.kOpRoleAttrName(), core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName(), core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
core.op_proto_and_checker_maker.kOpNameScopeAttrName() core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
} }
......
...@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None): ...@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
dtype=slice_var.dtype, dtype=slice_var.dtype,
persistable=True) persistable=True)
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:]) dim1_flatten = 1
if len(slice.shape) >= 2:
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
start = int(offset / dim1_flatten) start = int(offset / dim1_flatten)
end = int(offset / dim1_flatten + slice.shape[0]) end = int(offset / dim1_flatten + slice.shape[0])
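The guard added above handles 1-D persistable slices: `shape[1:]` is then empty, and `reduce` over an empty list without an initializer raises a TypeError, so the flattened row length defaults to 1. A standalone sketch of the fixed helper:

```python
from functools import reduce

# Row length of one slice: product of the trailing dims, or 1 for 1-D vars.
def dim1_flatten(shape):
    if len(shape) >= 2:
        return reduce(lambda x, y: x * y, shape[1:])
    return 1

print(dim1_flatten([8, 3, 4]))  # 12
print(dim1_flatten([8]))        # 1 (would raise without the guard)
```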
...@@ -892,7 +895,7 @@ def save_inference_model(dirname, ...@@ -892,7 +895,7 @@ def save_inference_model(dirname,
True is supported. True is supported.
Returns: Returns:
None target_var_name_list(list): The fetch variables' name list
Raises: Raises:
ValueError: If `feed_var_names` is not a list of basestring. ValueError: If `feed_var_names` is not a list of basestring.
...@@ -945,11 +948,13 @@ def save_inference_model(dirname, ...@@ -945,11 +948,13 @@ def save_inference_model(dirname,
# TODO(Superjomn) add an IR pass to remove 1-scale op. # TODO(Superjomn) add an IR pass to remove 1-scale op.
with program_guard(main_program): with program_guard(main_program):
uniq_target_vars = [] uniq_target_vars = []
for var in target_vars: for i, var in enumerate(target_vars):
if isinstance(var, Variable): if isinstance(var, Variable):
var1 = layers.scale(var, 1.) var = layers.scale(
uniq_target_vars.append(var1) var, 1., name="save_infer_model/scale_{}".format(i))
uniq_target_vars.append(var)
target_vars = uniq_target_vars target_vars = uniq_target_vars
target_var_name_list = [var.name for var in target_vars]
# when a pserver and a trainer running on the same machine, mkdir may conflict # when a pserver and a trainer running on the same machine, mkdir may conflict
try: try:
...@@ -1006,6 +1011,7 @@ def save_inference_model(dirname, ...@@ -1006,6 +1011,7 @@ def save_inference_model(dirname,
params_filename = os.path.basename(params_filename) params_filename = os.path.basename(params_filename)
save_persistables(executor, dirname, main_program, params_filename) save_persistables(executor, dirname, main_program, params_filename)
return target_var_name_list
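With the change above, `save_inference_model` now returns the names of the deterministically named scale ops (`save_infer_model/scale_{i}`) wrapping each target var. A minimal usage sketch under the Fluid 1.x API; the network and variable names are illustrative:

```python
import paddle.fluid as fluid

image = fluid.layers.data(name="image", shape=[13], dtype="float32")
prediction = fluid.layers.fc(input=image, size=1)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# The return value carries the fetch variable names created internally.
fetch_names = fluid.io.save_inference_model(
    dirname="./infer_model",
    feeded_var_names=["image"],
    target_vars=[prediction],
    executor=exe)
print(fetch_names)  # e.g. ['save_infer_model/scale_0']
```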
def load_inference_model(dirname, def load_inference_model(dirname,
......
...@@ -506,9 +506,9 @@ class While(object): ...@@ -506,9 +506,9 @@ class While(object):
while loop control flow. while loop control flow.
Args: Args:
cond (Variable): condition used to compare. cond(Variable): condition used to compare.
is_test(bool): A flag indicating whether execution is in test phase. is_test(bool): A flag indicating whether execution is in test phase.
name (str): The name of this layer. name(str): The name of this layer.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -589,7 +589,8 @@ class While(object): ...@@ -589,7 +589,8 @@ class While(object):
def lod_rank_table(x, level=0): def lod_rank_table(x, level=0):
"""LoD Rank Table Operator. Given an input variable **x** and a level number """
LoD Rank Table Operator. Given an input variable **x** and a level number
of LoD, this layer creates a LodRankTable object. A LoDRankTable object of LoD, this layer creates a LodRankTable object. A LoDRankTable object
contains a list of bi-element tuples. Each tuple consists of an index and contains a list of bi-element tuples. Each tuple consists of an index and
a length, both of which are int type. Referring to specified level of LoD, a length, both of which are int type. Referring to specified level of LoD,
...@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored): ...@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
return cond return cond
def equal(x, y, cond=None, **ignored): def equal(x, y, cond=None):
""" """
**equal**
This layer returns the truth value of :math:`x == y` elementwise. This layer returns the truth value of :math:`x == y` elementwise.
Args: Args:
...@@ -1458,7 +1457,6 @@ class DynamicRNN(object): ...@@ -1458,7 +1457,6 @@ class DynamicRNN(object):
Returns: Returns:
The current timestep in the input sequence. The current timestep in the input sequence.
""" """
self._assert_in_rnn_block_("step_input") self._assert_in_rnn_block_("step_input")
if not isinstance(x, Variable): if not isinstance(x, Variable):
...@@ -1535,8 +1533,7 @@ class DynamicRNN(object): ...@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
@signature_safe_contextmanager @signature_safe_contextmanager
def block(self): def block(self):
""" """
The block for user to define operators in RNN. See the class docstring The block for user to define operators in RNN.
for more details.
""" """
if self.status != DynamicRNN.BEFORE_RNN: if self.status != DynamicRNN.BEFORE_RNN:
raise ValueError("rnn.block() can only be invoke once") raise ValueError("rnn.block() can only be invoke once")
...@@ -1640,8 +1637,7 @@ class DynamicRNN(object): ...@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
dtype(str|numpy.dtype): The data type of the initialized memory. dtype(str|numpy.dtype): The data type of the initialized memory.
Returns: Returns:
the memory variable. The memory variable.
""" """
self._assert_in_rnn_block_('memory') self._assert_in_rnn_block_('memory')
self._init_zero_idx_() self._init_zero_idx_()
...@@ -1740,7 +1736,7 @@ class DynamicRNN(object): ...@@ -1740,7 +1736,7 @@ class DynamicRNN(object):
def output(self, *outputs): def output(self, *outputs):
""" """
mark the RNN output variables. Mark the RNN output variables.
Args: Args:
outputs: The output variables. outputs: The output variables.
......
...@@ -545,15 +545,16 @@ def yolov3_loss(x, ...@@ -545,15 +545,16 @@ def yolov3_loss(x,
TypeError: Attr ignore_thresh of yolov3_loss must be a float number TypeError: Attr ignore_thresh of yolov3_loss must be a float number
Examples: Examples:
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchors = [0, 1, 2] anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
ignore_thresh=0.5, downsample_ratio=32) anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32)
""" """
helper = LayerHelper('yolov3_loss', **locals()) helper = LayerHelper('yolov3_loss', **locals())
......
...@@ -56,7 +56,10 @@ def data(name, ...@@ -56,7 +56,10 @@ def data(name,
Args: Args:
name(str): The name/alias of the function name(str): The name/alias of the function
shape(list): Tuple declaring the shape. shape(list): Tuple declaring the shape. If :code:`append_batch_size` is
True and there is no -1 inside :code:`shape`, it should be
considered as the shape of each sample. Otherwise, it
should be considered as the shape of the batched data.
append_batch_size(bool): append_batch_size(bool):
1. If true, it prepends -1 to the shape. 1. If true, it prepends -1 to the shape.
For example if shape=[1], the resulting shape is [-1, 1]. For example if shape=[1], the resulting shape is [-1, 1].
......
...@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype ...@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
__all__ = [ __all__ = [
'deprecated', 'generate_layer_fn', 'generate_layer_fn_noattr', 'autodoc', 'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
'templatedoc' 'templatedoc'
] ]
...@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None): ...@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
buf.write('\n') buf.write('\n')
skip_attrs = OpProtoHolder.generated_op_attr_names() skip_attrs = OpProtoHolder.generated_op_attr_names()
# attr use_mkldnn and is_test also should not be visible to users.
skip_attrs.add("use_mkldnn")
skip_attrs.add("is_test")
for each_attr in op_proto.attrs: for each_attr in op_proto.attrs:
if each_attr.name in skip_attrs: if each_attr.name in skip_attrs:
...@@ -226,7 +229,7 @@ def generate_layer_fn(op_type): ...@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
return func return func
def generate_layer_fn_noattr(op_type): def generate_activation_fn(op_type):
"""Register the Python layer for an Operator without Attribute. """Register the Python layer for an Operator without Attribute.
Args: Args:
...@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type): ...@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):
func.__name__ = op_type func.__name__ = op_type
func.__doc__ = _generate_doc_string_(op_proto) func.__doc__ = _generate_doc_string_(op_proto)
return func return func
......
...@@ -2441,7 +2441,7 @@ def pool2d(input, ...@@ -2441,7 +2441,7 @@ def pool2d(input,
data = fluid.layers.data( data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32') name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.pool2d( pool2d = fluid.layers.pool2d(
input=data, input=data,
pool_size=2, pool_size=2,
pool_type='max', pool_type='max',
...@@ -2490,6 +2490,7 @@ def pool2d(input, ...@@ -2490,6 +2490,7 @@ def pool2d(input,
return pool_out return pool_out
@templatedoc()
def pool3d(input, def pool3d(input,
pool_size=-1, pool_size=-1,
pool_type="max", pool_type="max",
...@@ -2501,13 +2502,19 @@ def pool3d(input, ...@@ -2501,13 +2502,19 @@ def pool3d(input,
name=None, name=None,
exclusive=True): exclusive=True):
""" """
This function adds the operator for pooling in 3-dimensions, using the ${comment}
pooling configurations mentioned in input parameters.
Args: Args:
input (Variable): ${input_comment} input (Variable): The input tensor of pooling operator. The format of
pool_size (int): ${ksize_comment} input tensor is NCDHW, where N is batch size, C is
pool_type (str): ${pooling_type_comment} the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width
of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
pool_type (string): ${pooling_type_comment}
pool_stride (int): stride of the pooling layer. pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size. pool_padding (int): padding size.
global_pooling (bool): ${global_pooling_comment} global_pooling (bool): ${global_pooling_comment}
...@@ -2520,6 +2527,19 @@ def pool3d(input, ...@@ -2520,6 +2527,19 @@ def pool3d(input,
Returns: Returns:
Variable: output of pool3d layer. Variable: output of pool3d layer.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool3d = fluid.layers.pool3d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
""" """
if pool_type not in ["max", "avg"]: if pool_type not in ["max", "avg"]:
raise ValueError( raise ValueError(
...@@ -2569,7 +2589,27 @@ def adaptive_pool2d(input, ...@@ -2569,7 +2589,27 @@ def adaptive_pool2d(input,
require_index=False, require_index=False,
name=None): name=None):
""" """
${comment} **Adaptive Pool2d Operator**
The adaptive_pool2d operation calculates the output based on the input, pool_size,
and pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(pool_size) should contain two elements which
represent height and width, respectively. Also the H and W dimensions of output(Out)
are the same as Parameter(pool_size).
For average adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Args: Args:
input (Variable): The input tensor of pooling operator. The format of input (Variable): The input tensor of pooling operator. The format of
...@@ -2579,8 +2619,8 @@ def adaptive_pool2d(input, ...@@ -2579,8 +2619,8 @@ def adaptive_pool2d(input,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width). it must contain two integers, (pool_size_Height, pool_size_Width).
pool_type: ${pooling_type_comment} pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point along with outputs. require_index (bool): If true, the index of max pooling point will be returned along
it cannot be set in average pooling type. with outputs. It cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically. layer will be named automatically.
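The adaptive formulas in the docstring above translate into a short reference loop. A NumPy sketch of average adaptive pooling for one 2-D feature map; the function name and shapes are illustrative, not the operator's API:

```python
import numpy as np

def adaptive_avg_pool2d(x, out_h, out_w):
    in_h, in_w = x.shape
    out = np.empty((out_h, out_w), dtype=x.dtype)
    for i in range(out_h):
        # floor and ceil window bounds, as in the docstring formulas
        hs, he = (i * in_h) // out_h, -(-(i + 1) * in_h // out_h)
        for j in range(out_w):
            ws, we = (j * in_w) // out_w, -(-(j + 1) * in_w // out_w)
            out[i, j] = x[hs:he, ws:we].mean()
    return out

x = np.arange(16, dtype=np.float64).reshape(4, 4)
print(adaptive_avg_pool2d(x, 2, 2))  # each output cell averages a 2x2 block
```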
...@@ -2661,18 +2701,42 @@ def adaptive_pool3d(input, ...@@ -2661,18 +2701,42 @@ def adaptive_pool3d(input,
require_index=False, require_index=False,
name=None): name=None):
""" """
${comment} **Adaptive Pool3d Operator**
The adaptive_pool3d operation calculates the output based on the input, pool_size,
and pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(pool_size) should contain
three elements which represent depth, height and width, respectively. Also the D, H and W
dimensions of output(Out) are the same as Parameter(pool_size).
For average adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args: Args:
input (Variable): The input tensor of pooling operator. The format of input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is input tensor is NCDHW, where N is batch size, C is
the number of channels, H is the height of the the number of channels, D is the depth of the feature,
feature, and W is the width of the feature. H is the height of the feature, and W is the width of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (Depth, Height, Width). it must contain three integers, (Depth, Height, Width).
pool_type: ${pooling_type_comment} pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point along with outputs. require_index (bool): If true, the index of max pooling point will be returned along
it cannot be set in average pooling type. with outputs. It cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically. layer will be named automatically.
...@@ -2709,7 +2773,7 @@ def adaptive_pool3d(input, ...@@ -2709,7 +2773,7 @@ def adaptive_pool3d(input,
name='data', shape=[3, 32, 32, 32], dtype='float32') name='data', shape=[3, 32, 32, 32], dtype='float32')
pool_out, mask = fluid.layers.adaptive_pool3d( pool_out, mask = fluid.layers.adaptive_pool3d(
input=data, input=data,
pool_size=[3, 3], pool_size=[3, 3, 3],
pool_type='avg') pool_type='avg')
""" """
if pool_type not in ["max", "avg"]: if pool_type not in ["max", "avg"]:
...@@ -2930,6 +2994,7 @@ def batch_norm(input, ...@@ -2930,6 +2994,7 @@ def batch_norm(input,
"momentum": momentum, "momentum": momentum,
"epsilon": epsilon, "epsilon": epsilon,
"is_test": is_test, "is_test": is_test,
"data_layout": data_layout,
"use_mkldnn": False, "use_mkldnn": False,
"fuse_with_relu": fuse_with_relu, "fuse_with_relu": fuse_with_relu,
"use_global_stats": use_global_stats "use_global_stats": use_global_stats
...@@ -3235,7 +3300,7 @@ def group_norm(input, ...@@ -3235,7 +3300,7 @@ def group_norm(input,
# create output # create output
mean_out = helper.create_variable(dtype=dtype, stop_gradient=True) mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_variable(dtype=dtype, stop_gradient=True) variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
group_norm_out = helper.create_variable(dtype) group_norm_out = helper.create_variable(dtype=dtype)
helper.append_op( helper.append_op(
type="group_norm", type="group_norm",
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from __future__ import print_function from __future__ import print_function
import os import os
from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .layer_function_generator import generate_layer_fn, generate_activation_fn
from .. import core from .. import core
from ..framework import convert_np_dtype_to_dtype_ from ..framework import convert_np_dtype_to_dtype_
...@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') ...@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
__all__ += __activations_noattr__ __all__ += __activations_noattr__
for _OP in set(__activations_noattr__): for _OP in set(__activations_noattr__):
globals()[_OP] = generate_layer_fn_noattr(_OP) globals()[_OP] = generate_activation_fn(_OP)
__all__ += ["uniform_random"] __all__ += ["uniform_random"]
......
...@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer): ...@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer):
Args: Args:
learning_rate (float|Variable): global learning rate. learning_rate (float|Variable): global learning rate.
l1 (float): l1 (float): L1 regularization strength.
l2 (float): l2 (float): L2 regularization strength.
lr_power (float): lr_power (float): Learning Rate Power.
regularization: A Regularizer, such as regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer. fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix. name: A optional name prefix.
...
@@ -148,7 +148,10 @@ class ParallelExecutor(object):
             else framework.default_main_program()
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
-        build_strategy.enable_inplace = False if main._is_mem_optimized else True
+        if build_strategy.memory_optimize is True:
+            build_strategy.memory_optimize = False if main._is_mem_optimized else True
+        if build_strategy.enable_inplace is True:
+            build_strategy.enable_inplace = False if main._is_mem_optimized else True
         scope = scope if scope is not None else executor.global_scope()
         if share_vars_from and not isinstance(share_vars_from,
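The guards only demote flags that are still on, and Python-side memory optimization wins the conflict. A pure-Python sketch of the resulting decision table:

    def resolve(flag_on, py_mem_optimized):
        # mirrors `flag = False if main._is_mem_optimized else True`,
        # applied only when the user left the flag enabled
        if flag_on is True:
            flag_on = False if py_mem_optimized else True
        return flag_on

    assert resolve(True, True) is False    # python memopt ran: pass disabled
    assert resolve(True, False) is True    # no conflict: pass stays on
    assert resolve(False, True) is False   # user opted out: left alone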
...
@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 list(REMOVE_ITEM TEST_OPS test_imperative_optimizer)
+list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -107,10 +108,18 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
+if(NOT WIN32)
+    py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
+endif()
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+        # change the timeout from 600 to 1200, because in debug mode, this test need more time.
+        set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200)
+    endif()
 endif()
 if (WITH_NGRAPH)
     add_subdirectory(ngraph)
 endif()
...
@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
         build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
         build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-        build_strategy.memory_optimize = use_ir_memory_optimize
+        build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
         # python memory optimization is conflict with inplace pass.
         # Use ir graph memory optimization after inplace pass is the correct way.
         build_strategy.enable_inplace = False if memory_opt else enable_inplace
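Both IR-level knobs are now gated on the legacy memory_opt flag; a sketch of the effective build-strategy flags as a function of the test's knobs:

    # legacy python-side memory_opt disables both IR-level passes
    def build_flags(memory_opt, use_ir_memory_optimize, enable_inplace):
        return {
            'memory_optimize': False if memory_opt else use_ir_memory_optimize,
            'enable_inplace': False if memory_opt else enable_inplace,
        }

    assert build_flags(True, True, True) == {
        'memory_optimize': False, 'enable_inplace': False}
    assert build_flags(False, True, True) == {
        'memory_optimize': True, 'enable_inplace': True}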
...
@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
                 regularization=fluid.regularizer.L2Decay(1e-6))
             return optimizer
+        # NOTE(dzh):
+        # need to make it compatible with elewise fuse act
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict={"image": img,
@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_elewise_add_act_ops=False,
             memory_opt=False,
+            use_ir_memory_optimize=False,
             optimizer=_optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
             use_cuda=use_cuda,
             fuse_elewise_add_act_ops=True,
             memory_opt=False,
+            use_ir_memory_optimize=False,
             optimizer=_optimizer)
         for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
...
@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
         skip_dim0 = 0
         slice_vars = self.param_var_mapping[orig_var_name]
-        orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:])
+        orig_dim1_flatten = 1
+        if len(slice_vars[0].shape) >= 2:
+            orig_dim1_flatten = reduce(lambda x, y: x * y,
+                                       slice_vars[0].shape[1:])
         for slice_var in slice_vars[:block_idx]:
             skip_dim0 += slice_var.shape[0]
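The guard exists because reduce over an empty sequence raises, which is exactly what happens for a 1-D variable's shape[1:]. A minimal reproduction:

    from functools import reduce

    shape = (128,)   # a 1-D parameter such as a bias
    # the old code reduced over shape[1:], which is empty for 1-D vars:
    try:
        reduce(lambda x, y: x * y, shape[1:])
    except TypeError as e:
        print(e)     # reduce() of empty sequence with no initial value

    # the guarded version defaults the flattened dim-1 size to 1
    flat = reduce(lambda x, y: x * y, shape[1:]) if len(shape) >= 2 else 1
    assert flat == 1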
...
 requests==2.9.2
 numpy>=1.12
-protobuf==3.1
+protobuf>=3.1.0
 recordio>=0.1.0
 matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
 rarfile
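The pin change swaps an exact match for a floor. A quick check of what each specifier admits, using the packaging library (an assumption; it is not among this file's requirements):

    from packaging.specifiers import SpecifierSet

    print('3.6.1' in SpecifierSet('==3.1'))     # False: exact pin only
    print('3.6.1' in SpecifierSet('>=3.1.0'))   # True: floor, no ceiling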
...
@@ -100,6 +100,7 @@ packages=['paddle',
           'paddle.utils',
           'paddle.dataset',
           'paddle.reader',
+          'paddle.distributed',
           'paddle.fluid',
           'paddle.fluid.imperative',
           'paddle.fluid.proto',
...
... (diff for one file collapsed: too large to display)