提交 f79a3a83 编写于 作者: S sneaxiy

merge release/1.3

上级 d2d3f2b5
......@@ -231,7 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1)
SET(PROTOBUF_VERSION 3.1.0)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)
......
......@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag)
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
endif()
endif()
endif()
endif()
endfunction()
......@@ -147,6 +147,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
endif(NOT WIN32)
if (APPLE)
......
......@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -50,12 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
else()
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
endif()
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
......@@ -67,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph)
cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......
......@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership
graph->Set<GraphNodePool>(kGraphNodePool,
new GraphNodePool); // take ownership
pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
......@@ -242,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
continue;
}
}
VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph));
VLOG(3) << "Finish Apply Pass " << pass->Type();
}
return graph;
}
......
......@@ -49,7 +49,7 @@ DEFINE_bool(
"If this option turns on, only these op in whitelist can be inplaced."
"If it turns off, all of the running op can be candidate of inplaced op."
"Such as scale, elementwise_add"
"By default, it's turned on");
"By default, it's turned off");
DECLARE_string(memory_optimize_debug);
......@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
}
}
const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
const std::string& cache_var,
const size_t& idx,
ir::Graph* graph) const {
const NodeSwapQueue InplacePass::TryInplaceModifyVar(
const std::string& var, const std::string& cache_var, const size_t& idx,
ir::Graph* graph) const {
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
var_nodes_[var].at(0)->Var() != nullptr);
std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
var_desc->SetName(cache_var);
SSANodePair swap_nodes;
NodeSwapQueue swap_nodes;
for (size_t i = idx; i < view_.AllOps().size(); ++i) {
auto* op = view_.AllOps()[i];
......@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
return swap_nodes;
}
void InplacePass::CommitModify(const SSANodePair& swap_nodes,
void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
ir::Graph* graph) const {
for (auto& pair : swap_nodes) {
auto *node = pair.first, *cache_node = pair.second;
......@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
}
}
void InplacePass::WithdrawModify(const SSANodePair& nodes,
void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
ir::Graph* graph) const {
for (auto& pair : nodes) {
auto *node = pair.first, *cache_node = pair.second;
......@@ -403,18 +402,20 @@ void GraphView::Build(ir::Graph* g) {
// 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer
// pserver can not find each other name.
for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue;
if (node->Name() == "send") {
for (auto& in : node->inputs) {
dup_nodes_.emplace(in->Name());
}
auto update_skip_set = [&](ir::Node* node) {
for (auto& in : node->inputs) {
if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
}
if (node->Name() == "recv") {
for (auto& out : node->outputs) {
for (auto& out : node->outputs) {
if (out->IsVar() && out->Var() != nullptr)
dup_nodes_.emplace(out->Name());
}
}
};
for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue;
if (node->Name() == "send") update_skip_set(node);
if (node->Name() == "recv") update_skip_set(node);
if (node->Name() == "prefetch") update_skip_set(node);
}
}
......
......@@ -56,7 +56,8 @@ class GraphView {
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
};
typedef std::vector<std::pair<ir::Node*, ir::Node*>> SSANodePair;
// swap pairs in sequence
typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;
class InplacePass : public ir::Pass {
public:
InplacePass();
......@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
void InitSSAGraphNodes() const;
private:
const SSANodePair TryInplaceModifyVar(const std::string& var,
const std::string& cache_var,
const size_t& idx,
ir::Graph* graph) const;
const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
const std::string& cache_var,
const size_t& idx,
ir::Graph* graph) const;
void CommitModify(const SSANodePair&, ir::Graph* graph) const;
void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;
void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const;
void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;
void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
const size_t& idx) const;
......
......@@ -13,17 +13,114 @@
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <deque>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <sstream>
#include <string>
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif // PADDLE_WITH_CUDA
namespace paddle {
namespace framework {
namespace details {
using paddle::framework::VarDesc;
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
std::deque<ir::Node*> ops;
FilterVariables(nodes, [&](ir::Node* op) {
if (op->IsOp() && op->Op() != nullptr) {
ops.emplace_back(op);
}
});
std::unordered_map<ir::Node*, size_t> op_deps;
std::list<ir::Node*> ready_ops;
std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
for (auto* op : ops) {
std::unordered_set<ir::Node*> preceding_op;
for (auto* in : op->inputs) {
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
preceding_op.emplace(in->inputs[0]);
pending_ops[in->inputs[0]].emplace(op);
}
op_deps[op] = preceding_op.size();
if (preceding_op.empty()) {
ready_ops.emplace_back(op);
}
}
// 3. generated op list based desc order and the topology order
std::vector<ir::Node*> ret;
std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
auto update_by_found_node = [&](ir::Node* found_node) {
for (auto* pending_op : pending_ops[found_node]) {
if (--op_deps[pending_op] == 0) {
ready_ops.emplace_back(pending_op);
}
}
ready_ops.remove(found_node);
ret.emplace_back(found_node);
};
while (!ready_ops.empty()) {
bool all_of_ready_op_unmatched = true;
for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
auto op_desc = *it;
ir::Node* found_node = nullptr;
for (auto* op : ready_ops) {
if (IsSameDesc(op->Op(), op_desc)) {
found_node = op;
break;
}
}
// 3.1 op desc deleted by other pass
if (found_node == nullptr) {
++it;
continue;
} else {
all_of_ready_op_unmatched = false;
it = op_descs_list.erase(it);
}
update_by_found_node(found_node);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std::list<ir::Node*> prev_ready_ops(ready_ops);
if (all_of_ready_op_unmatched) {
for (auto op : prev_ready_ops) {
update_by_found_node(op);
}
}
}
PADDLE_ENFORCE(std::all_of(
op_deps.begin(), op_deps.end(),
[&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
return ret;
}
size_t NodeSizeInBytes(const VarDesc& node) {
size_t NodeSize(const VarDesc& node) {
auto shape = node.GetShape();
int size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
......@@ -31,9 +128,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
return type_size * std::abs(size);
}
size_t NodeSizeInBytes(ir::Node* n) {
size_t NodeSize(ir::Node* n) {
auto* desc = FindVarDescInBlock(n);
return NodeSizeInBytes(*desc);
return NodeSize(*desc);
}
std::string DebugStringImpl(VarDesc* var) {
......@@ -59,7 +156,6 @@ std::string DebugStringImpl(VarDesc* var) {
std::string DebugString(ir::Node* var) {
return DebugStringImpl(FindVarDescInBlock(var));
}
// return DebugString(var->Var()); }
// NOTE(dzh): based ir node, if a large node has been reused
// by a small size node, then next time it appear in pool, it will
......@@ -76,22 +172,26 @@ struct NodeComparator {
bool operator()(ir::Node* lhs, ir::Node* rhs) const {
auto* lhs_desc = FindVarDescInBlock(lhs);
auto* rhs_desc = FindVarDescInBlock(rhs);
// match data type
if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
return false;
}
// match shape
auto lhs_shape = lhs_desc->GetShape();
auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
return NodeSize(lhs) <= NodeSize(rhs);
} else {
return false;
}
}
};
void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
void OrderedSet::Insert(ir::Node* var) {
PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
PADDLE_ENFORCE(op->IsOp());
if (mark_table_.count(var->Name()) != 0) {
mark_table_[var->Name()]->second.insert(op);
mark_table_[var->Name()]->emplace_back(var);
return;
}
......@@ -99,14 +199,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
auto var_shape = var_desc->GetShape();
int batch_size = static_cast<int>(var_shape[0]);
NodeComparator compare_node;
NodeComparator functor;
Iter it = nodes_.begin();
while (it != nodes_.end()) {
auto* cache_desc = FindVarDescInBlock(it->first);
auto& prev = it->front();
auto* cache_desc = FindVarDescInBlock(prev);
int cache_batch_size = cache_desc->GetShape()[0];
if ((cache_batch_size == -1 && batch_size == -1) ||
(cache_batch_size != -1 && batch_size != -1)) {
if (compare_node(it->first, var)) {
if (functor(prev, var)) {
++it;
} else {
break;
......@@ -118,62 +219,127 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
}
}
it =
nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
it = nodes_.insert(it, {var});
mark_table_[var->Name()] = it;
}
int OrderedNodeList::GetIndex(ir::Node* var) {
int OrderedSet::GetNodeIndexInPool(ir::Node* var) {
return std::distance(nodes_.begin(), mark_table_[var->Name()]);
}
ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const {
ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
ir::Node* found_node = nullptr;
NodeComparator compare_node;
NodeComparator functor;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
if (compare_node(var, it->first)) {
found_node = it->first;
auto& candidate = it->front();
if (functor(var, candidate)) {
found_node = candidate;
break;
}
}
return found_node;
}
void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); }
ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
ir::Node* found_node = nullptr;
NodeComparator functor;
auto it =
std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
if (v.front() == prev)
return true;
else
return false;
});
PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
for (it = std::next(it); it != nodes_.end(); ++it) {
auto& candidate = it->front();
if (functor(var, candidate)) {
found_node = candidate;
break;
}
}
return found_node;
}
bool OrderedSet::Has(ir::Node* var) const {
if (mark_table_.count(var->Name())) {
auto& node_in_samename = mark_table_.at(var->Name());
auto iter =
std::find_if(node_in_samename->begin(), node_in_samename->end(),
[&](ir::Node* n) { return n->Name() == var->Name(); });
return iter != node_in_samename->end();
}
return false;
}
void OrderedNodeList::Erase(const std::string& var) {
void OrderedSet::Erase(const std::string& var) {
PADDLE_ENFORCE(mark_table_.count(var));
nodes_.erase(mark_table_[var]);
mark_table_.erase(var);
}
std::string OrderedNodeList::ToString() const {
void OrderedSet::Erase(ir::Node* var) {
PADDLE_ENFORCE(var != nullptr);
Erase(var->Name());
}
std::string OrderedSet::ToString() const {
std::stringstream ss;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
ss << DebugString(it->first) << " ";
for (auto& node : *it) {
ss << DebugString(node) << " ";
}
}
return ss.str();
}
bool NodeCanReused(ir::Node* node) {
// valid the node is a var node
if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
// auto* desc = node->Var();
bool flag = NodeCanReused(*node->Var());
bool flag = true;
// op output force generated in cpu, can not be reused.
for (auto* op : node->inputs) {
if (op->Op()->HasAttr("force_cpu")) {
// op output force generated in cpu, can not be reused.
flag &= framework::AttrReader(op->Op()->GetAttrMap())
.Get<bool>("force_cpu") == 0;
}
}
// var desc validation.
flag &= NodeCanReused(*node->Var());
return flag;
}
int MinChunkSize() {
int size{0};
#ifdef PADDLE_WITH_CUDA
size = platform::GpuMinChunkSize();
#else
size = platform::CpuMinChunkSize();
#endif // PADDLE_WITH_CUDA
return size;
}
bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType();
if (node.Persistable() || type != proto::VarType::LOD_TENSOR ||
node.GetShape().empty()) {
// only these types holds bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false;
}
// persistable variable is parameter
if (node.Persistable()) {
return false;
}
// shape < min_chunk_size is meaningless.
// further more, fetched loss always has size = 1
// which should not be reused.
auto shape = node.GetShape();
int size = std::abs(
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
if (shape.empty() || size < MinChunkSize()) {
return false;
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
......@@ -193,6 +359,176 @@ bool OpHasSubBlock(OpDesc* desc) {
return false;
}
ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
ops_ = SortOpLikeDescOrder(graph);
ConnectNodes();
}
void ControlFlowGraph::BuildCFGGraph() {
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for (ir::Node* op : ops_) {
for (auto& input_var : op->inputs) {
if (!input_var->inputs.empty()) {
PADDLE_ENFORCE(
input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
auto* pred_op = input_var->inputs[0];
if (pred_op->Op() != nullptr) {
predecessors_[op].insert(pred_op);
successors_[pred_op].insert(op);
}
}
if (input_var->IsVar() && !input_var->IsCtrlVar()) {
uses_[op].insert(input_var->Name());
}
}
for (auto& output_var : op->outputs) {
// output var may be used by many op
for (auto* succ_op : output_var->outputs) {
if (succ_op->Op() != nullptr) {
successors_[op].insert(succ_op);
predecessors_[succ_op].insert(op);
}
}
if (output_var->IsVar() && !output_var->IsCtrlVar()) {
defs_[op].insert(output_var->Name());
}
}
}
}
void ControlFlowGraph::ConnectNodes() {
for (size_t i = 0; i < ops_.size(); ++i) {
auto& op = ops_[i];
try {
auto& next_op = ops_.at(i + 1);
successors_[op].insert(next_op);
predecessors_[next_op].insert(op);
} catch (...) {
// do nothing
}
FilterVariables(op->inputs,
[&](ir::Node* var) { uses_[op].emplace(var->Name()); });
FilterVariables(op->outputs,
[&](ir::Node* var) { defs_[op].emplace(var->Name()); });
}
}
void ControlFlowGraph::LiveVariableAnalysis() {
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
while (!work_list.empty()) {
ir::Node* op = work_list.front();
work_list.pop_front();
// get the live_in calculated before. Empty if first.
auto prev_live_in = std::move(live_in_[op]);
for (auto& s : successors_[op]) {
for (auto& var : live_in_[s]) {
live_out_[op].insert(var);
}
}
for (auto& var : uses_[op]) {
live_in_[op].insert(var);
}
for (auto& var : live_out_[op]) {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
live_in_[op].erase(var);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if (live_in_[op] != prev_live_in) {
for (auto& pre : predecessors_[op]) {
work_list.push_back(pre);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
if (uses_[op].find(old_node) != uses_[op].end()) {
uses_[op].erase(old_node);
uses_[op].insert(new_node);
}
if (defs_[op].find(old_node) != defs_[op].end()) {
defs_[op].erase(old_node);
defs_[op].insert(new_node);
}
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
ir::Node* op) const {
// in ssa-graph, different version nodes have same name,
// this function get the latest version var before target op
// It may return nullptr, such as data node.
ir::Node* found_node = nullptr;
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
PADDLE_ENFORCE((output != nullptr && output->IsVar()),
"Output is empty!");
if (output->Var() && output->Name() == name) {
found_node = output;
}
}
}
return found_node;
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -17,6 +17,8 @@
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>
......@@ -27,41 +29,43 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kFetchedVars[] = "fetched_vars";
constexpr char kGraphNodePool[] = "graph_node_pool";
constexpr char kAllOpDescs[] = "all_op_descs";
// NOTE(dzh): Variable and the operators use the var.
// for early delete pass.
// Because analysis var pass build base on ir::Node, which maybe released
// or modified between passes, so we use OpDesc* to mark ops.
using GraphNodePool = std::vector<
std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// NOTE(dzh): by default, it sort node in ascend order(by node bytes size).
// in fluid, -1 means the batch_size is determined in runtime.
// the node batch_size equal -1 always ranking in the front than the node not.
// NOTE(dzh): A ordered set for node reuse in memory optimize.
// the orderedset sort node in ascend order(by node bytes size).
// in fluid, -1 means the batch_size, which is determined in runtime.
// So the reuse happens between nodes who's batch_size both are -1
// simultaneously or not.
//
// sort rule:
// rule 0 : smaller node ranking in front.
// rule 1 : batch_size equal -1 ranking in the front than the node not.
//
// For example,
// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
// O(1) insert, delete
class OrderedNodeList {
public:
using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
using Iter = typename std::list<NodePair>::iterator;
using ConstIter = typename std::list<NodePair>::const_iterator;
void Insert(ir::Node* var, ir::Node* op);
class OrderedSet {
public:
// nodes with same name exists in pool.
using NodeVector = std::vector<ir::Node*>;
using Iter = typename std::list<NodeVector>::iterator;
using ConstIter = typename std::list<NodeVector>::const_iterator;
void Insert(ir::Node* var);
void Erase(ir::Node* var);
void Erase(const std::string& var);
bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
bool Has(const std::string& var) { return mark_table_.count(var); }
ir::Node* NodeMatch(ir::Node* var) const;
bool Has(ir::Node* var) const;
void Clear() {
mark_table_.clear();
nodes_.clear();
}
// find the bestfit shape node block with var.
ir::Node* FindBestFitNode(ir::Node* var) const;
ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
// map store non-const iterator, can not promise const
int GetIndex(ir::Node* var);
int GetNodeIndexInPool(ir::Node* var);
// pool all node to string
std::string ToString() const;
......@@ -69,18 +73,54 @@ class OrderedNodeList {
Iter end() { return nodes_.end(); }
ConstIter begin() const { return nodes_.begin(); }
ConstIter end() const { return nodes_.end(); }
size_t size() const { return nodes_.size(); }
void Clear() {
mark_table_.clear();
nodes_.clear();
}
size_t size() const { return nodes_.size(); }
private:
// for searching.
std::unordered_map<std::string, Iter> mark_table_;
// node swap pairs. var -> ops dep var
std::list<NodePair> nodes_;
// node pool
std::list<NodeVector> nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// IR Graph
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successors ops use the output variables.
NodeListMap successors_;
// predecessors ops generated input variables.
NodeListMap predecessors_;
// variables lived before run current op.
VarSetMap live_in_;
// variables lived after run current op.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
// valid a tensor can be reuse or not
......@@ -93,15 +133,24 @@ bool NodeCanReused(const VarDesc& node);
bool OpHasSubBlock(OpDesc* desc);
// node memory size in bytes
size_t NodeSizeInBytes(ir::Node* n);
size_t NodeSize(ir::Node* n);
// node memory size in bytes
size_t NodeSizeInBytes(const VarDesc&);
size_t NodeSize(const VarDesc&);
std::string DebugString(ir::Node* var);
// NOTE(dzhwinter)
// after node reuse, the replaced node shape is
// different with its VarDesc. So need to find the
// correct VarDesc in Block.
VarDesc* FindVarDescInBlock(ir::Node* n);
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
template <typename Container, typename Callback>
class FilterVariableImpl {
public:
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
......@@ -22,13 +23,19 @@
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
namespace details {
TEST(OrderedNodeList, Normal) {
OrderedNodeList pool;
TEST(OrderedSet, Normal) {
OrderedSet pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
// clang-format off
......@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
nodes.emplace_back(std::move(node));
}
// Insert
for (auto& node : nodes) {
pool.Insert(node.get(), op.get());
pool.Insert(node.get());
}
// Has/size
ASSERT_EQ(pool.size(), shapes.size());
for (auto& node : nodes) {
ASSERT_TRUE(pool.Has(node.get()));
}
// assert its order and interface.
......@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
std::cout << pool.ToString() << std::endl;
ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0);
{
auto v1 = block_desc->Var("11");
v1->SetShape({-1, 256, 56, 56});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(cache, nullptr);
}
{
......@@ -81,16 +95,447 @@ TEST(OrderedNodeList, Normal) {
v2->SetShape({-1, 2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5]
auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5]
}
{
auto v3 = block_desc->Var("13");
v3->SetShape({2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2]
auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2]
}
}
TEST(OrderedSet, FindBestFitNode) {
OrderedSet pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
ProgramDesc prog;
BlockDesc* block_desc = prog.MutableBlock(0);
auto* op_desc = block_desc->AppendOp();
op_desc->SetType("dummy");
std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
{
auto desc = block_desc->Var("a");
desc->SetShape({128, 128});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
{
auto desc = block_desc->Var("b");
desc->SetShape({128, 129});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
{
auto desc = block_desc->Var("c");
desc->SetShape({128, 128});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FindNextBestFitNode
auto* n = nodes[0].get();
auto* cache = pool.FindBestFitNode(n);
PADDLE_ENFORCE(cache->Name() == "a");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "c");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "b");
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
paddle::framework::AssignOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace paddle {
namespace framework {
namespace details {
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"c"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"d"});
op->SetOutput("Out", {"e"});
}
return prog;
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
// test assign op
ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
// test sum op
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
}
// 1. normal test
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->outputs.back()->Name() == "e") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
graph.RemoveNode(found_node);
graph.RemoveNode(e);
// other node keeps the same order
auto remain_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < remain_nodes.size(); ++i) {
auto node = remain_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
op_descs.insert(op_descs.begin() + 4, op);
auto nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
if (node->Name() == "sum") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* c = find_node_in_graph("c");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(c->outputs.begin(), c->outputs.end(), found_node);
ir::Node* pending_op = found_node->outputs[0]->outputs[0];
graph.RemoveNode(e);
graph.RemoveNode(pending_op);
graph.RemoveNode(found_node);
}
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.insert(op_descs.begin() + 2, op);
// check the order
auto mynodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < mynodes.size(); ++i) {
auto node = mynodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.emplace_back(op);
// replace op_desc inplace
auto nodes = graph.Nodes();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->Op() && node->Name() == "assign") {
if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
found_node = node;
break;
}
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(e->inputs.begin(), e->inputs.end(), found_node);
graph.RemoveNode(found_node);
}
op_descs.erase(op_descs.begin() + 3);
auto replace_op = prog.MutableBlock(0)->AppendOp();
replace_op->SetType("sum");
replace_op->SetInput("X", {"d", "d1"});
replace_op->SetOutput("Out", {"e"});
{
ir::Node* sum2 = graph.CreateOpNode(replace_op);
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
ir::Node* d1 = find_node_in_graph("d1");
sum2->inputs.emplace_back(d);
sum2->inputs.emplace_back(d1);
sum2->outputs.emplace_back(e);
e->inputs.emplace_back(sum2);
d->outputs.emplace_back(sum2);
d1->outputs.emplace_back(sum2);
}
op_descs.emplace_back(replace_op);
// compare op order
auto graph_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < graph_nodes.size(); ++i) {
auto node = graph_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
......
......@@ -43,16 +43,10 @@ namespace paddle {
namespace framework {
namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes();
auto subblock_vars = GetSubBlockVars(nodes);
skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
CollectSkipVarsSet(nodes);
cfg_.reset(new details::ControlFlowGraph(*graph));
cfg_->LiveVariableAnalysis();
......@@ -75,82 +69,67 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
for (auto& var : op->outputs) {
if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
skip_set_.count(var->Name()))
if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
VLOG(3) << "Skip set contains variable of " << var->Name()
<< "disable reuse on it. skipped";
continue;
ir::Node* cache = pool_.NodeMatch(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
ir::Node* cache = pool_.FindBestFitNode(var);
while (cache != nullptr && var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused. "
<< cache->Name() << " is re-filled to the pool after "
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
cache = pool_.FindNextBestFitNode(var, cache);
}
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (cache == nullptr) continue;
if (var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused." << var->Name()
<< " is re-filled to the pool after"
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
continue;
int node_idx_in_pool = pool_.GetIndex(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size()));
// update CFG Graph on the fly.
// reused var maybe re-fill into the pool
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
// NOTE(dzhwinter): we need to both update the ProgramDesc
// and IR Graph. because op_desc/var_desc is used in CreateOp,
// CreateVar when running happens. But IR Graph
// define the dependence relationship between nodes.
RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
pool_.Erase(cache);
}
// fill the pool
std::unordered_set<std::string> unlived_vars;
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
unlived_vars.emplace(var);
if (cache != nullptr) {
int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(reuse_id++), DebugString(var), DebugString(cache),
node_idx_in_pool, static_cast<int>(pool_.size()));
// NOTE(dzhwinter): update the ProgramDesc/IR Graph
// and the CFG Graph on the fly.
//
// IR Graph define the dependence relationship between nodes.
//
// ProgramDesc defines the input/output vars. Its used in
// CreateOp, CreateVar when running happens.
//
// CFG Graph store the liveness information, when reuse happens
// we also need to update the variable liveness.
const std::string var_name = var->Name();
const std::string cache_name = cache->Name();
cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
RenameVarInGraphDesc(var_name, cache_name, idx);
RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
pool_.Erase(cache_name);
}
}
for (auto var : unlived_vars) {
ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
}
// fill the pool
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node, op);
pool_.Insert(var_node);
}
}
}
}
graph->ResolveHazard(var_nodes_);
// For early delete pass. use GraphNodePool load the unlived vars.
// 1. find all deps op for each unlived var in memory pool.
for (auto& op : graph->Nodes()) {
for (auto& var : op->inputs) {
if (pool_.Has(var)) {
pool_.Insert(var, op);
}
}
}
// 2. convert ir node based memory pool to graph node
// because Node* maybe released bettwen passes.
auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
for (auto it = pool_.begin(); it != pool_.end(); ++it) {
std::unordered_set<OpDesc*> descs;
for (auto& op : it->second) {
PADDLE_ENFORCE(op->IsOp());
descs.insert(op->Op());
}
graph_pool.push_back(std::make_pair(it->first->Name(), descs));
}
return graph;
}
......@@ -199,12 +178,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
PADDLE_ENFORCE(sub_op != nullptr);
for (auto* var : sub_op->outputs) {
if (NodeCanReused(var)) {
ir::Node* cache = pool_.NodeMatch(var);
ir::Node* cache = pool_.FindBestFitNode(var);
if (cache != nullptr) {
if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
continue;
}
int node_idx_in_pool = pool_.GetIndex(cache);
int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(sub_reuse_id++), DebugString(var),
......@@ -224,20 +203,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
}
}
std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
void MemoryOptimizePass::CollectSkipVarsSet(
const std::unordered_set<ir::Node*>& nodes) const {
std::unordered_set<std::string> vars;
auto update_skip_set = [&](OpDesc* op_desc) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
skip_set_.insert(inputs.begin(), inputs.end());
skip_set_.insert(outputs.begin(), outputs.end());
};
for (auto& op : nodes) {
if (!op->IsOp() || op->Op() == nullptr) continue;
auto* op_desc = op->Op();
if (OpHasSubBlock(op_desc)) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
vars.insert(inputs.begin(), inputs.end());
vars.insert(outputs.begin(), outputs.end());
}
// NOTE(dzhwinter):
// current block can not reuse next level block vars.
if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
// NOTE(dzhwinter):
// distributed ops input/output name need to
// keep same bettwen trainer/pserver
if (op_desc->Type() == "send") update_skip_set(op_desc);
if (op_desc->Type() == "recv") update_skip_set(op_desc);
if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
}
return vars;
}
void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
......@@ -291,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
// redirect the input to the latest version of cache_var
for (auto* node : op->inputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
var_nodes_[cache_var].emplace_back(cache_node);
ir::Node* cache_node = var_nodes_[cache_var].back();
// swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(),
......@@ -301,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
}
}
......@@ -325,271 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
}
}
}
// release node of unused var in graph
for (auto* node : var_nodes_[var]) {
graph->RemoveNode(node);
}
var_nodes_.at(var).clear();
}
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
std::deque<ir::Node*> ops;
FilterVariables(nodes, [&](ir::Node* op) {
if (op->IsOp() && op->Op() != nullptr) {
ops.emplace_back(op);
}
});
std::unordered_map<ir::Node*, size_t> op_deps;
std::list<ir::Node*> ready_ops;
std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
for (auto* op : ops) {
std::unordered_set<ir::Node*> preceding_op;
for (auto* in : op->inputs) {
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
preceding_op.emplace(in->inputs[0]);
pending_ops[in->inputs[0]].emplace(op);
}
op_deps[op] = preceding_op.size();
if (preceding_op.empty()) {
ready_ops.emplace_back(op);
}
}
// 3. generated op list based desc order and the topology order
std::vector<ir::Node*> ret;
std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
auto update_by_found_node = [&](ir::Node* found_node) {
for (auto* pending_op : pending_ops[found_node]) {
if (--op_deps[pending_op] == 0) {
ready_ops.emplace_back(pending_op);
}
}
ready_ops.remove(found_node);
ret.emplace_back(found_node);
};
while (!ready_ops.empty()) {
bool all_of_ready_op_unmatched = true;
for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
auto op_desc = *it;
ir::Node* found_node = nullptr;
for (auto* op : ready_ops) {
if (IsSameDesc(op->Op(), op_desc)) {
found_node = op;
break;
}
}
// 3.1 op desc deleted by other pass
if (found_node == nullptr) {
++it;
continue;
} else {
all_of_ready_op_unmatched = false;
it = op_descs_list.erase(it);
}
update_by_found_node(found_node);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std::list<ir::Node*> prev_ready_ops(ready_ops);
if (all_of_ready_op_unmatched) {
for (auto op : prev_ready_ops) {
update_by_found_node(op);
}
}
}
PADDLE_ENFORCE(std::all_of(
op_deps.begin(), op_deps.end(),
[&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
return ret;
}
ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
ops_ = SortOpLikeDescOrder(graph);
ConnectNodes();
}
void ControlFlowGraph::BuildCFGGraph() {
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for (ir::Node* op : ops_) {
for (auto& input_var : op->inputs) {
if (!input_var->inputs.empty()) {
PADDLE_ENFORCE(
input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
auto* pred_op = input_var->inputs[0];
if (pred_op->Op() != nullptr) {
predecessors_[op].insert(pred_op);
successors_[pred_op].insert(op);
}
}
if (input_var->IsVar() && !input_var->IsCtrlVar()) {
uses_[op].insert(input_var->Name());
}
}
for (auto& output_var : op->outputs) {
// output var may be used by many op
for (auto* succ_op : output_var->outputs) {
if (succ_op->Op() != nullptr) {
successors_[op].insert(succ_op);
predecessors_[succ_op].insert(op);
}
}
if (output_var->IsVar() && !output_var->IsCtrlVar()) {
defs_[op].insert(output_var->Name());
}
}
}
}
void ControlFlowGraph::ConnectNodes() {
for (size_t i = 0; i < ops_.size(); ++i) {
auto& op = ops_[i];
try {
auto& next_op = ops_.at(i + 1);
successors_[op].insert(next_op);
predecessors_[next_op].insert(op);
} catch (...) {
// do nothing
}
FilterVariables(op->inputs,
[&](ir::Node* var) { uses_[op].emplace(var->Name()); });
FilterVariables(op->outputs,
[&](ir::Node* var) { defs_[op].emplace(var->Name()); });
}
}
void ControlFlowGraph::LiveVariableAnalysis() {
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
while (!work_list.empty()) {
ir::Node* op = work_list.front();
work_list.pop_front();
// get the live_in calculated before. Empty if first.
auto prev_live_in = std::move(live_in_[op]);
for (auto& s : successors_[op]) {
for (auto& var : live_in_[s]) {
live_out_[op].insert(var);
}
}
for (auto& var : uses_[op]) {
live_in_[op].insert(var);
}
for (auto& var : live_out_[op]) {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
live_in_[op].erase(var);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if (live_in_[op] != prev_live_in) {
for (auto& pre : predecessors_[op]) {
work_list.push_back(pre);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
if (uses_[op].find(old_node) != uses_[op].end()) {
uses_[op].erase(old_node);
uses_[op].insert(new_node);
}
if (defs_[op].find(old_node) != defs_[op].end()) {
defs_[op].erase(old_node);
defs_[op].insert(new_node);
}
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
ir::Node* op) const {
// in ssa-graph, different version nodes have same name,
// this function get the latest version var before target op
// It may return nullptr, such as data node.
ir::Node* found_node = nullptr;
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
if (output->Name() == name) {
found_node = output;
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
}
}
}
return found_node;
}
} // namespace details
......@@ -598,5 +330,4 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
REGISTER_PASS(memory_optimize_pass,
paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs);
......@@ -32,20 +32,15 @@
namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
class ControlFlowGraph;
class MemoryOptimizePass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private:
// fill the variable map(var_nodes) by version.
void InitSSAGraphNodes() const;
private:
// update program descs
void RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var, size_t idx) const;
......@@ -55,13 +50,14 @@ class MemoryOptimizePass : public ir::Pass {
ir::Graph* graph) const;
void SubGraphOptimize(OpDesc* op_desc) const;
// scan subblock and collect the output/input variables.
std::unordered_set<std::string> GetSubBlockVars(
const std::unordered_set<ir::Node*>&) const;
// 1. scan op with subblock and collect the output/input vars.
// while, while_grad, conditional_block
// 2. scan distributed ops and collect the output/input vars
void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
private:
// Reuse Node Pool, Owned.
mutable OrderedNodeList pool_;
mutable OrderedSet pool_;
// controlflow Graph
mutable std::unique_ptr<ControlFlowGraph> cfg_;
// skip set
......@@ -70,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// For IR Graph in parallelexecutor
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successors ops use the output variables.
NodeListMap successors_;
// predecessors ops generated input variables.
NodeListMap predecessors_;
// variables lived before run current op.
VarSetMap live_in_;
// variables lived after run current op.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
paddle::framework::AssignOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace paddle {
namespace framework {
namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"c"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"d"});
op->SetOutput("Out", {"e"});
}
return prog;
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
// test assign op
ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
// test sum op
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
}
// 1. normal test
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->outputs.back()->Name() == "e") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
graph.RemoveNode(found_node);
graph.RemoveNode(e);
// other node keeps the same order
auto remain_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < remain_nodes.size(); ++i) {
auto node = remain_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
op_descs.insert(op_descs.begin() + 4, op);
auto nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
if (node->Name() == "sum") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* c = find_node_in_graph("c");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(c->outputs.begin(), c->outputs.end(), found_node);
ir::Node* pending_op = found_node->outputs[0]->outputs[0];
graph.RemoveNode(e);
graph.RemoveNode(pending_op);
graph.RemoveNode(found_node);
}
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.insert(op_descs.begin() + 2, op);
// check the order
auto mynodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < mynodes.size(); ++i) {
auto node = mynodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.emplace_back(op);
// replace op_desc inplace
auto nodes = graph.Nodes();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->Op() && node->Name() == "assign") {
if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
found_node = node;
break;
}
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(e->inputs.begin(), e->inputs.end(), found_node);
graph.RemoveNode(found_node);
}
op_descs.erase(op_descs.begin() + 3);
auto replace_op = prog.MutableBlock(0)->AppendOp();
replace_op->SetType("sum");
replace_op->SetInput("X", {"d", "d1"});
replace_op->SetOutput("Out", {"e"});
{
ir::Node* sum2 = graph.CreateOpNode(replace_op);
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
ir::Node* d1 = find_node_in_graph("d1");
sum2->inputs.emplace_back(d);
sum2->inputs.emplace_back(d1);
sum2->outputs.emplace_back(e);
e->inputs.emplace_back(sum2);
d->outputs.emplace_back(sum2);
d1->outputs.emplace_back(sum2);
}
op_descs.emplace_back(replace_op);
// compare op order
auto graph_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < graph_nodes.size(); ++i) {
auto node = graph_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -17,6 +17,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
......
......@@ -21,8 +21,6 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
class SequentialExecutionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
......
......@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
return in.Name() != out.Name() && details::NodeCanReused(in) &&
details::NodeCanReused(out) &&
details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in);
details::NodeSize(out) <= details::NodeSize(in);
}
};
......
......@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
op->SetOutput("Out", {"test2_out"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64});
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
......@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16});
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
......@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
......@@ -267,15 +267,16 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
......
......@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
->assert_is_op("scale")
->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.);
auto scale_out = detector.mutable_pattern()
->NewNode("scale_out")
->assert_is_op_output("scale");
auto scale_out =
detector.mutable_pattern()
->NewNode("scale_out")
->assert_is_op_output("scale")
// scale's output var should has only one consumer, or it can't be
// removed.
->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
pre_op->LinksTo({scale_in});
scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
......
......@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
std::unordered_set<const Node*> invalid_nodes;
int valid_op = 0;
for (auto* node : graph->Nodes()) {
PADDLE_ENFORCE_NOT_NULL(node);
if (is_valid_node(node)) {
invalid_nodes.insert(node);
} else if (node->IsOp()) {
......
......@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
graph = eager_deletion_pass->Apply(std::move(graph));
VLOG(10) << "EagerDeletionPass Applied";
if (build_strategy_.memory_early_delete_) {
auto early_delete_pass =
ir::PassRegistry::Instance().Get("memory_early_delete_pass");
early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
graph = early_delete_pass->Apply(std::move(graph));
}
VLOG(10) << "MemoryEarlyDeletePass Applied.";
}
return graph;
......@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
graphs.push_back(std::move(graph));
#endif
auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) {
for (size_t i = 0; i < graphs.size(); ++i) {
graphs[i] = member_->PrepareGCAndRefCnts(
......@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework
} // namespace paddle
USE_PASS(memory_early_delete_pass);
USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass);
......@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
return node.inputs.size() == n;
}
NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
// CHECK all the inputs' in-degree is 0
for (auto *node : source) {
PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
if (Agent(p).deleted()) {
visited.insert(p);
to_visit.erase(p);
}
inlink_visited.clear();
std::copy_if(p->inputs.begin(), p->inputs.end(),
std::back_inserter(inlink_visited),
[&](Node *x) -> bool { return visited.count(x) != 0; });
if (inlink_visited.size() == p->inputs.size()) {
sorted_.push_back(p);
for (auto *_ : p->outputs) {
if (!visited.count(_)) {
to_visit.insert(_);
}
}
to_visit.erase(p);
visited.insert(p);
}
}
}
}
NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
Node &NodesTSIterator::operator*() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return *sorted_[cursor_];
}
NodesTSIterator &NodesTSIterator::operator++() {
if (++cursor_ >= sorted_.size()) {
sorted_.clear();
cursor_ = 0;
}
return *this;
}
NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
bool NodesTSIterator::operator==(const NodesTSIterator &other) {
return sorted_ == other.sorted_ && cursor_ == other.cursor_;
}
Node *NodesTSIterator::operator->() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return sorted_[cursor_];
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -30,6 +30,7 @@ namespace inference {
namespace analysis {
using framework::ir::Graph;
using framework::ir::NodesTSIterator;
const char kIsFunctionNode[] = "__is_function_node__";
const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
......@@ -132,32 +133,6 @@ struct Agent {
framework::ir::Node *x_;
};
// Topological sorting iterator on nodes.
struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
NodesTSIterator() = default;
explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0;
}
NodesTSIterator(const NodesTSIterator &other);
framework::ir::Node &operator*();
NodesTSIterator &operator++();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesTSIterator &operator=(const NodesTSIterator &other);
bool operator==(const NodesTSIterator &other);
bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
framework::ir::Node *operator->();
private:
std::vector<framework::ir::Node *> sorted_;
size_t cursor_{0};
};
// The nodes those have no input will be treated as start points.
static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
std::vector<framework::ir::Node *> result;
......
......@@ -16,6 +16,12 @@
/*! \file paddle_api.h
*/
/*! \mainpage Paddle Inference APIs
* \section intro_sec Introduction
* The Paddle inference library aims to offer an high performance inference SDK
* for Paddle users.
*/
#include <cassert>
#include <memory>
#include <string>
......@@ -34,26 +40,49 @@ enum PaddleDType {
};
/**
*\brief Memory menager for PaddleTensor.
* \brief Memory manager for `PaddleTensor`.
*
*The PaddleBuf holds a buffer for data input or output. The memory can be
*allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
*should be reused for better performance.
* The PaddleBuf holds a buffer for data input or output. The memory can be
* allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
* should be reused for better performance.
*
*For user allocated memory, the following API can be used:
*- PaddleBuf(void* data, size_t length) to set an external memory by
*specifying
* the memory address and length.
*- Reset(void* data, size_t length) to reset the PaddleBuf with an external
* For user allocated memory, the following API can be used:
* - PaddleBuf(void* data, size_t length) to set an external memory by
* specifying the memory address and length.
* - Reset(void* data, size_t length) to reset the PaddleBuf with an external
*memory.
*ATTENTION, for user allocated memory, deallocation should be done by users
* ATTENTION, for user allocated memory, deallocation should be done by users
*externally after the program finished. The PaddleBuf won't do any allocation
*or deallocation.
*
*To have the PaddleBuf allocate and manage the memory:
*- PaddleBuf(size_t length) will allocate a memory of size `length`.
*- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* To have the PaddleBuf allocate and manage the memory:
* - PaddleBuf(size_t length) will allocate a memory of size `length`.
* - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done.
*
* Usage:
*
* Let PaddleBuf manage the memory internally.
* \code{cpp}
* const int num_elements = 128;
* PaddleBuf buf(num_elements * sizeof(float));
* \endcode
*
* Or
* \code{cpp}
* PaddleBuf buf;
* buf.Resize(num_elements * sizeof(float));
* \endcode
* Works the exactly the same.
*
* One can also make the `PaddleBuf` use the external memory.
* \code{cpp}
* PaddleBuf buf;
* void* external_memory = new float[num_elements];
* buf.Reset(external_memory, num_elements*sizeof(float));
* ...
* delete[] external_memory; // manage the memory lifetime outside.
* \endcode
*/
class PaddleBuf {
public:
......@@ -78,7 +107,7 @@ class PaddleBuf {
/** Tell whether the buffer is empty.
*/
bool empty() const { return length_ == 0; }
/** Get the memory address.
/** Get the data's memory address.
*/
void* data() const { return data_; }
/** Get the memory length.
......@@ -110,7 +139,8 @@ struct PaddleTensor {
};
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
/** Tensor without copy, currently only supports AnalysisPredictor.
/** Tensor without copy, currently only supports `AnalysisPredictor`.
*/
class ZeroCopyTensor {
public:
......@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
*
* Usage:
*
* \code{.cpp}
* NativeConfig config;
* ... // change the configs.
* auto native_predictor = CreatePaddlePredictor(config);
* \endcode
*
* FOR EXTENSION DEVELOPER:
* Different predictors are designated by config type. Similar configs can be
......
......@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet";
}
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif
});
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
analysis_passes_.push_back(pass);
}
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
"repeated_fc_relu_fuse_pass", //
"squared_mat_sub_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"identity_scale_op_clean_pass", //
});
use_gpu_ = false;
}
} // namespace paddle
......@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
*/
class CpuPassStrategy : public PassStrategy {
public:
CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
"repeated_fc_relu_fuse_pass", //
"squared_mat_sub_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"identity_scale_op_clean_pass", //
});
use_gpu_ = false;
}
CpuPassStrategy();
explicit CpuPassStrategy(const CpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {}
......@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
*/
class GpuPassStrategy : public PassStrategy {
public:
GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif
});
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
GpuPassStrategy();
explicit GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {
......
......@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
comment.type));
AddInput("Y", string::Sprintf("the right hand operand of %s operator",
comment.type));
AddAttr<int>(
"axis",
"The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1)
.EqualGreaterThan(-1);
AddAttr<bool>("force_cpu",
"Force fill output variable to cpu "
"memory. Otherwise, fill output variable to the running "
......@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is
calculated by $%s$
)DOC",
comment.equation));
AddAttr<int>(
"axis",
"The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1)
.EqualGreaterThan(-1);
}
};
......
......@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int i = 0; i < fixed_ratios.size(); i++) {
for (size_t i = 0; i < fixed_ratios.size(); i++) {
sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
}
......@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
}
}
if (clip) {
platform::Transform<platform::CPUDeviceContext> trans;
ClipFunctor<T> clip_func;
trans(ctx.template device_context<platform::CPUDeviceContext>(),
boxes->data<T>(), boxes->data<T>() + boxes->numel(),
boxes->data<T>(), clip_func);
T* dt = boxes->data<T>();
std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
return std::min<T>(std::max<T>(v, 0.), 1.);
});
}
framework::Tensor var_t;
var_t.mutable_data<T>(
......@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#pragma omp parallel for collapse(2)
#endif
for (int i = 0; i < box_num; ++i) {
for (int j = 0; j < variances.size(); ++j) {
for (size_t j = 0; j < variances.size(); ++j) {
e_vars(i, j) = variances[j];
}
}
......
......@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
}
}
template <typename T>
struct ClipFunctor {
HOSTDEVICE inline T operator()(T in) const {
return std::min<T>(std::max<T>(in, 0.), 1.);
}
};
template <typename T>
class PriorBoxOpKernel : public framework::OpKernel<T> {
public:
......@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
T* b_t = boxes->data<T>();
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
T center_x = (w + offset) * step_width;
T center_y = (h + offset) * step_height;
T box_width, box_height;
int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s];
if (min_max_aspect_ratios_order) {
box_width = box_height = min_size / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
if (max_sizes.size() > 0) {
auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
}
// priors with different aspect ratios
for (size_t r = 0; r < aspect_ratios.size(); ++r) {
......@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
}
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
}
} else {
// priors with different aspect ratios
......@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
float ar = aspect_ratios[r];
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
}
if (max_sizes.size() > 0) {
auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
b_t[0] = (center_x - box_width) / img_width;
b_t[1] = (center_y - box_height) / img_height;
b_t[2] = (center_x + box_width) / img_width;
b_t[3] = (center_y + box_height) / img_height;
b_t += 4;
}
}
}
......@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
}
if (clip) {
platform::Transform<platform::CPUDeviceContext> trans;
ClipFunctor<T> clip_func;
trans(ctx.template device_context<platform::CPUDeviceContext>(),
boxes->data<T>(), boxes->data<T>() + boxes->numel(),
boxes->data<T>(), clip_func);
T* dt = boxes->data<T>();
std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
return std::min<T>(std::max<T>(v, 0.), 1.);
});
}
framework::Tensor var_t;
......
......@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
"The ignore threshold to ignore confidence loss.")
.SetDefault(0.7);
AddComment(R"DOC(
This operator generate yolov3 loss by given predict result and ground
This operator generates yolov3 loss based on given predict result and ground
truth boxes.
The output of previous network is in shape [N, C, H, W], while H and W
should be the same, specify the grid size, each grid point predict given
number boxes, this given number is specified by anchors, it should be
half anchors length, which following will be represented as S. In the
second dimention(the channel dimention), C should be S * (class_num + 5),
class_num is the box categoriy number of source dataset(such as coco),
so in the second dimention, stores 4 box location coordinates x, y, w, h
and confidence score of the box and class one-hot key of each anchor box.
should be the same, H and W specify the grid size, each grid point predict
given number boxes, this given number, which following will be represented as S,
is specified by the number of anchors, In the second dimension(the channel
dimension), C should be equal to S * (class_num + 5), class_num is the object
category number of source dataset(such as 80 in coco dataset), so in the
second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
also includes confidence score of the box and class one-hot key of each anchor box.
While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
correspnd to:
Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions
should be as follows:
$$
b_x = \sigma(t_x) + c_x
b_y = \sigma(t_y) + c_y
b_x = \\sigma(t_x) + c_x
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
$$
While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
is specified by anchors.
In the equation above, :math:`c_x, c_y` is the left top corner of current grid
and :math:`p_w, p_h` is specified by anchors.
As for confidence score, it is the logistic regression value of IoU between
anchor boxes and ground truth boxes, the score of the anchor box which has
the max IoU should be 1, and if the anchor box has IoU bigger then ignore
the max IoU should be 1, and if the anchor box has IoU bigger than ignore
thresh, the confidence score loss of this anchor box will be ignored.
Therefore, the yolov3 loss consist of three major parts, box location loss,
......@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
In order to trade off box coordinate losses between big boxes and small
boxes, box coordinate losses will be mutiplied by scale weight, which is
calculated as follow.
calculated as follows.
$$
weight_{box} = 2.0 - t_w * t_h
$$
Final loss will be represented as follow.
Final loss will be represented as follows.
$$
loss = (loss_{xy} + loss_{wh}) * weight_{box}
......
......@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
}
};
class ElementwiseGradOpInplace : public framework::InplaceInToOut {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> ret;
if (block->HasVar(framework::GradVarName("X")) &&
block->HasVar(framework::GradVarName("Out"))) {
ret[framework::GradVarName("Out")] = framework::GradVarName("X");
}
return ret;
}
};
} // namespace operators
} // namespace paddle
......@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
op_type##GradMaker, \
::paddle::operators::ElementwiseOpInplace); \
REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad)
::paddle::operators::ElementwiseOpExplicitGrad, \
::paddle::operators::ElementwiseGradOpInplace)
......@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
}
// lstm_value.output_value not used in bp, set to nullptr
// lstm_grad.state_active_grad not used in bp, set to nullptr
lstm_value.output_value = nullptr;
lstm_grad.state_active_grad = nullptr;
int cur_batch_size = bend - bstart;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
......
......@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
}
int cur_batch_size = bend - bstart;
// lstmp_value.output_value not used in bp, set to null
// lstmp_grad.state_active_grad not used in bp, set to null
lstmp_value.output_value = nullptr;
lstmp_grad.state_active_grad = nullptr;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
......
......@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
"be ignored."); // TODO(Chengduo): Add checker.
// (Currently,
// TypedAttrChecker don't support vector type.)
AddAttr<bool>("global_pooling",
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, ksize and paddings will be ignored.")
AddAttr<bool>(
"global_pooling",
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, kernel size and paddings will be ignored.")
.SetDefault(false);
AddAttr<std::vector<int>>("strides",
"(vector<int>, default {1, 1}), strides(height, "
......@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
"paddings",
"(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator."
"If global_pooling = true, paddings and ksize will be ignored.")
"If global_pooling = true, paddings and kernel size will be ignored.")
.SetDefault({0, 0});
AddAttr<bool>(
"exclusive",
......@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
.SetDefault(false);
AddAttr<bool>(
"ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
......@@ -259,31 +260,40 @@ Example:
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$
For exclusive = true:
For exclusive = false:
$$
hstart = i * strides[0] - paddings[0]
$$
$$
hend = hstart + ksize[0]
$$
$$
wstart = j * strides[1] - paddings[1]
$$
$$
wend = wstart + ksize[1]
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$
For exclusive = false:
For exclusive = true:
$$
hstart = max(0, i * strides[0] - paddings[0])
$$
$$
hend = min(H, hstart + ksize[0])
$$
$$
wstart = max(0, j * strides[1] - paddings[1])
$$
$$
wend = min(W, wstart + ksize[1])
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
For adaptive = true:
$$
hstart = floor(i * H_{in} / H_{out})
hend = ceil((i + 1) * H_{in} / H_{out})
wstart = floor(j * W_{in} / W_{out})
wend = ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC");
}
......@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
AddAttr<bool>(
"global_pooling",
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, ksize and paddings wille be ignored.")
"If global_pooling = true, kernel size and paddings will be ignored.")
.SetDefault(false);
AddAttr<std::vector<int>>(
"strides",
......@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
.SetDefault(false);
AddAttr<bool>(
"ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
......@@ -392,48 +402,68 @@ Example:
Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
For ceil_mode = false:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
$$
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
For ceil_mode = true:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$
For exclusive = true:
$$
dstart = i * strides[0] - paddings[0]
dend = dstart + ksize[0]
hstart = j * strides[1] - paddings[1]
hend = hstart + ksize[1]
wstart = k * strides[2] - paddings[2]
wend = wstart + ksize[2]
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
$$
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$
For exclusive = false:
$$
dstart = max(0, i * strides[0] - paddings[0])
dend = min(D, dstart + ksize[0])
hstart = max(0, j * strides[1] - paddings[1])
hend = min(H, hstart + ksize[1])
wstart = max(0, k * strides[2] - paddings[2])
wend = min(W, wstart + ksize[2])
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
For adaptive = true:
$$
dstart = floor(i * D_{in} / D_{out})
dend = ceil((i + 1) * D_{in} / D_{out})
hstart = floor(j * H_{in} / H_{out})
hend = ceil((j + 1) * H_{in} / H_{out})
wstart = floor(k * W_{in} / W_{out})
wend = ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
$$
dstart = i * strides[0] - paddings[0]
$$
$$
dend = dstart + ksize[0]
$$
$$
hstart = j * strides[1] - paddings[1]
$$
$$
hend = hstart + ksize[1]
$$
$$
wstart = k * strides[2] - paddings[2]
$$
$$
wend = wstart + ksize[2]
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
For exclusive = true:
$$
dstart = max(0, i * strides[0] - paddings[0])
$$
$$
dend = min(D, dstart + ksize[0])
$$
$$
hend = min(H, hstart + ksize[1])
$$
$$
wstart = max(0, k * strides[2] - paddings[2])
$$
$$
wend = min(W, wstart + ksize[2])
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
)DOC");
}
......
......@@ -121,7 +121,7 @@ struct RandomCropFunctor {
HOSTDEVICE void operator()(size_t ins_idx) {
typename Random<DeviceContext>::Engine engine(seed_);
engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
size_t offsets[9];
size_t offsets[9] = {};
for (int i = num_batchsize_dims_; i < rank_; ++i) {
typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
0, x_dims_[i] - out_dims_[i]);
......
......@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/device_vector.h>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
template <size_t D>
__global__ void Padding(const paddle::platform::float16* d_out,
const int* out_dims, const int* in_dims,
const int* offsets, int64_t n,
paddle::platform::float16* d_in) {
int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
if (out_idx < n) {
int64_t out_idx_tmp = out_idx;
int coords[D] = {0};
for (int i = D - 1; i >= 0; --i) {
coords[i] = out_idx_tmp % out_dims[i];
out_idx_tmp /= out_dims[i];
coords[i] += offsets[i];
}
int64_t in_idx = 0;
for (int i = 0; i < D; ++i) {
in_idx = in_idx * in_dims[i] + coords[i];
}
d_in[in_idx] = d_out[out_idx];
}
}
template <>
class SliceGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>
: public framework::OpKernel<paddle::platform::float16> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
auto out_dims = d_out->dims();
auto in_dims = d_in->dims();
int rank = out_dims.size();
std::vector<int> offsets(rank, 0);
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
for (size_t i = 0; i < starts.size(); ++i) {
if (starts[i] < 0) {
starts[i] += in_dims[axes[i]];
}
offsets[axes[i]] = std::max(starts[i], 0);
}
math::SetConstant<paddle::platform::CUDADeviceContext,
paddle::platform::float16>
set_zero;
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
int64_t numel = d_out->numel();
dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
dim3 threads(PADDLE_CUDA_NUM_THREADS);
auto stream = ctx.cuda_device_context().stream();
auto out_shape = framework::vectorize2int(out_dims);
thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
auto in_shape = framework::vectorize2int(in_dims);
thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());
switch (rank) {
case 1:
Padding<1><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 2:
Padding<2><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 3:
Padding<3><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 4:
Padding<4><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 5:
Padding<5><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 6:
Padding<6><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
slice_grad,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
......@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
Tensor* logit_grad =
context.Output<Tensor>(framework::GradVarName("Logits"));
logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
context.device_context(), logit_grad);
T* logit_grad_data = logit_grad->data<T>();
const int batch_size = logit_grad->dims()[0];
......
......@@ -398,9 +398,9 @@ PYBIND11_MODULE(core, m) {
py::arg("recursive_sequence_lengths"), R"DOC(
Set LoD of the LoDTensor according to recursive sequence length.
For example, if recursive_sequence_lengths=[2, 3], meaning that
For example, if recursive_sequence_lengths=[[2, 3]], meaning that
there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [0, 2, 2+3], i.e, [0, 2, 5].
corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]].
Args:
recursive_sequence_lengths (List[List[int]]): sequence lengths.
......@@ -803,9 +803,11 @@ All parameter, weight, gradient are variables in Paddle.
.def(py::init<const platform::Place &>())
.def("close", &Executor::Close)
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars) {
int block_id, bool create_local_scope, bool create_vars,
const std::vector<std::string> &fetch_vars) {
pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars);
self.Run(prog, scope, block_id, create_local_scope, create_vars,
fetch_vars);
});
m.def("init_gflags", framework::InitGflags);
......
#!/bin/bash
## purple to echo
function purple(){
echo -e "\033[35m$1\033[0m"
}
## green to echo
function green(){
echo -e "\033[32m$1\033[0m"
}
## Error to warning with blink
function bred(){
echo -e "\033[31m\033[01m\033[05m$1\033[0m"
}
## Error to warning with blink
function byellow(){
echo -e "\033[33m\033[01m\033[05m$1\033[0m"
}
## Error
function red(){
echo -e "\033[31m\033[01m$1\033[0m"
}
## warning
function yellow(){
echo -e "\033[33m\033[01m$1\033[0m"
}
path='http://paddlepaddle.org/download?url='
#release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1`
release_version=1.2.0
......@@ -228,36 +260,128 @@ function checkLinuxPaddleVersion(){
done
}
function checkLinuxPip(){
function checkPythonVirtualenv(){
while true
do
echo "请输入您要使用的pip目录(您可以另起终端,并使用which pip来查看):"
read -p "" pip_path
if [ "$pip_path" == "" -o ! -f "$pip_path" ];then
echo "检测结果:pip不存在,请重新输入"
continue
fi
python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
if [ "$python_version" == "27" ];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
if [[ "$uncode" == "" ]];then
uncode=
else
uncode=u
fi
fi
if [ "$python_version" == "" ];then
echo "检测结果:pip不存在,请重新输入"
else
version_list=`echo "${python_list[@]}" | grep "$python_version" `
if [ "$version_list" != "" ];then
echo "检测结果:找到python${python_version}版本"
break
else
echo "检测结果:找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 "
fi
fi
read -p "
是否使用python virtualenv虚环境安装(y/n)": check_virtualenv
case $check_virtualenv in
y)
echo "为您使用python虚环境安装"
;;
n)
break
;;
*)
continue
;;
esac
virtualenv_path=`which virtualenv 2>&1`
if [ "$virtualenv_path" == "" ];then
$python_path -m pip install virtualenv
if [ "$?" != '0' ];then
echo "安装虚拟环境失败,请检查本地环境"
fi
fi
while true
do
read -p "请输入虚拟环境名字:" virtualenv_name
if [ "$virtualenv_name" == "" ];then
echo "不能为空"
continue
fi
break
done
virtualenv -p $python_path ${virtualenv_name}
if [ "$?" != 0 ];then
echo "创建虚环境失败,请检查环境"
exit 2
fi
cd ${virtualenv_name}
source ./bin/activate
if [ "$?" == 0 ];then
use_virtualenv=
python_path=`which python`
break
else
echo "创建虚环境失败,请检查环境"
exit 2
fi
done
}
function checkLinuxPython(){
python_path=`which python 2>/dev/null`
while true
do
if [ "$python_path" == '' ];then
while true
do
read -p "没有找到默认的python版本,请输入要安装的python路径:" python_path
python_path=`$python_path -V`
if [ "$python_path" != "" ];then
break
else
echo "输入路径有误,未找到pyrhon"
fi
done
fi
python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
pip_version=`$python_path -m pip -V|awk -F '[ .]' '{print $2}'`
while true
do
read -p "
找到python版本$python_version,使用请输入y,选择其他版本请输n(y/n):" check_python
case $check_python in
n)
read -p "请指定您的python路径:" new_python_path
python_V=`$new_python_path -V 2>/dev/null`
if [ "$python_V" != "" ];then
python_path=$new_python_path
python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
pip_version=`python -m pip -V|awk -F '[ .]' '{print $2}'`
echo "您的python版本为${python_version}"
break
else
echo 输入有误,未找到python路径
fi
;;
y)
break
;;
*)
echo "输入有误,请重新输入."
continue
;;
esac
done
if [ "$pip_version" -lt 9 ];then
echo "您的pip版本小于9.0.1 请升级pip (pip install --upgrade pip)"
exit 0
fi
if [ "$python_version" == "27" ];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
if [[ "$uncode" == "" ]];then
uncode=
else
uncode=u
fi
fi
version_list=`echo "${python_list[@]}" | grep "$python_version" `
if [ "$version_list" == "" ];then
echo "找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 "
else
break
fi
done
}
function checkLinuxAVX(){
......@@ -287,25 +411,36 @@ function PipLinuxInstall(){
wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
if [[ "$paddle_version" == "2" ]];then
if [[ "$GPU" == "gpu" ]];then
if [[ ${AVX} == "avx" ]];then
rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_release
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else
echo "paddlepaddle whl包下载失败"
echo paddlepaddle whl包下载失败
exit 1
fi
else
rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_release_novax
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else
echo "paddlepaddle whl包下载失败"
echo paddlepaddle whl包下载失败
exit 1
fi
fi
......@@ -313,9 +448,15 @@ function PipLinuxInstall(){
rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
wget -q $wheel_cpu_release
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else
echo "paddlepaddle whl包下载失败"
echo paddlepaddle whl包下载失败
exit 1
fi
fi
......@@ -324,18 +465,30 @@ function PipLinuxInstall(){
rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_develop
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else
echo "paddlepaddle whl包下载失败"
echo paddlepaddle whl包下载失败
exit 1
fi
else
rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
wget -q $wheel_cpu_develop
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else
echo "paddlepaddle whl包下载失败"
echo paddlepaddle whl包下载失败
exit 1
fi
fi
......@@ -575,95 +728,122 @@ gpu_list=(
echo
echo "Step 5. 检测pip版本"
echo
checkLinuxPip
checkLinuxPython
echo
checkLinuxAVX
echo
echo "Step 6.是否使用Python的虚拟环境"
use_virtualenv="--user"
checkPythonVirtualenv
echo "*********************2. 开始安装*****************************"
PipLinuxInstall
if [ "$check_virtualenv" == 'y' ];then
echo "虚环境创建成功,请cd 进入${virtualenv_name}, 执行 source bin/activate 进入虚环境。退出虚环境执行 deactivate命令。
更多虚环境使用方法请参考virtualenv官网:https://virtualenv.pypa.io/en/latest/"
fi
}
function clearMacPythonEnv(){
python_version=""
python_brief_version=""
python_root=""
}
function checkMacPython2(){
while true
do
read -p "
=> 未能在常规路径下找到Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15)
如希望自定义Python路径,请输入路径:" python_root
echo
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
if [[ $? == "0" ]];then
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
clearMacPythonEnv
else
check_python=`echo $python_version | grep "Python 2"`
if [[ -n "$check_python" ]];then
while true
do
echo -e " => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: "
read -p "" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
use_python="y"
break
elif [[ "$use_python" == "n" ]];then
clearMacPythonEnv
break
else
red " 输入错误,请重新输入(y/n)"
fi
done
if [[ "$use_python" == "y" ]];then
return 0
fi
else
red " 您输入Python的不是Python2"
clearMacPythonEnv
fi
fi
else
python_version=""
clearMacPythonEnv
red " => 未能在常规路径下找到可用的Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15)"
read -p " 如希望自定义Python路径,请输入路径
如果希望重新选择Python版本,请回车:" python_root
echo
if [[ "$python_root" == "" ]];then
python_V=""
clearMacPythonEnv
return 1
fi
fi
check_python=`echo $python_version | grep "Python 2"`
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
python_version=""
elif [ -n "$check_python" ];then
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
use_python="y"
break
elif [ "$use_python" == "n" ];then
python_root=""
break
else
echo "输入错误,请重新输入(y/n)"
fi
done
if [ "$use_python" == "y" ];then
break
fi
else
echo "您输入Python的不是Python2"
python_version=""
fi
done
}
function checkMacPython3(){
while true
do
read -p "
=> 未能在常规路径下找到Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载Python3
如希望自定义Python路径,请输入路径:" python_root
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
python_version=`$python_root --version 2>&1 1>&1`
if [[ $? == "0" ]];then
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
clearMacPythonEnv
else
check_python=`echo $python_version | grep "Python 3"`
if [[ -n "$check_python" ]];then
while true
do
echo -e " => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: "
read -p "" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
use_python="y"
break
elif [[ "$use_python" == "n" ]];then
clearMacPythonEnv
break
else
red " 输入错误,请重新输入(y/n)"
fi
done
if [[ "$use_python" == "y" ]];then
return 0
fi
else
red " 您输入Python的不是Python3"
clearMacPythonEnv
fi
fi
else
python_version=""
clearMacPythonEnv
red " => 未能在常规路径下找到可用的Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python3(注意Python版本不能低于3.5.x)"
read -p " 如希望自定义Python路径,请输入路径
如果希望重新选择Python版本,请回车:" python_root
echo
if [[ "$python_root" == "" ]];then
python_V=""
clearMacPythonEnv
return 1
fi
fi
check_python=`echo $python_version | grep "Python 3"`
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
python_version=""
elif [ -n "$check_python" ] ;then
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
use_python="y"
break
elif [ "$use_python" == "n" ];then
python_root=""
break
else
echo "输入错误,请重新输入(y/n)"
fi
done
if [ "$use_python" == "y" ];then
break
fi
else
echo "您输入Python的不是Python3"
python_version=""
fi
done
}
......@@ -672,145 +852,160 @@ function checkMacPaddleVersion(){
do
read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..."
echo
read -p "
1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本
2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version}
=> 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version
if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then
yellow " 1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本"
yellow " 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version}"
read -p " => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version
if [[ "$paddle_version" == "1" ]]||[[ "$paddle_version" == "2" ]];then
echo
echo "您选择了数字【"$paddle_version" 】"
yellow " 您选择了数字【"$paddle_version" 】"
echo
break
else
paddle_version="2"
echo
echo "您选择了数字【2】"
yellow " 您选择了数字【2】"
echo
break
fi
done
}
function initCheckMacPython2(){
echo
yellow " 您选择了Python "$python_V",正在寻找符合要求的Python 2版本"
echo
python_root=`which python2.7`
if [[ "$python_root" == "" ]];then
python_root=`which python`
fi
checkMacPython2
if [[ "$?" == "1" ]];then
return 1
else
return 0
fi
}
function checkMacPythonVersion(){
while true
do
read -n1 -p "Step 3. 选择Python版本,请按回车键继续..."
read -p "
2. 使用python 2.x
3. 使用python 3.x
function initCheckMacPython3(){
echo
yellow " 您选择了Python "$python_V",正在寻找符合您要求的Python 2版本"
echo
python_root=`which python3`
checkMacPython3
if [[ "$?" == "1" ]];then
return 1
else
return 0
fi
}
=> 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V
echo
if [ "$python_V" == "" ];then
python_V="2"
function checkMacPip(){
if [[ "$python_V" == "2" ]]||[[ "$python_V" == "3" ]];then
python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
if [[ ${python_brief_version} == "" ]];then
red "您输入的python:${python_root} 对应的pip不可用,请检查此pip或重新选择其他python"
echo
return 1
fi
echo "您选择了数字【"$python_V"】,正在寻找符合您要求的Python版本,请按回车键继续..."
echo
if [ "$python_V" == "2" ];then
python_root=`which python2.7`
if [ "$python_root" == "" ];then
python_root=`which python`
fi
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
else
python_version=""
fi
if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then
checkMacPython2
fi
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
break
elif [ "$use_python" == "n" ];then
python_root=""
checkMacPython2
break
pip_version=`$python_root -m pip -V |awk -F '[ .]' '{print $2}'`
if [[ 9 -le ${pip_version} ]];then
:
else
red "您的pip版本过低,请安装pip 9.0.1及以上的版本"
echo
return 1
fi
if [[ "$python_brief_version" == "" ]];then
clearMacPythonEnv
red "您的 $python_root 对应的pip存在问题,请按ctrl + c退出后重新安装pip,或切换其他python版本"
echo
return 1
else
if [[ $python_brief_version == "27" ]];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
if [[ $uncode == "" ]];then
uncode="mu"
else
echo "输入错误,请重新输入(y/n)"
uncode="m"
fi
done
elif [ "$python_V" == "3" ];then
python_root=`which python3`
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
else
python_version=""
fi
if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
checkMacPython3
fi
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python
fi
version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
if [[ "$version_list" != "" ]];then
return 0
else
red "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出"
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
break
elif [ "$use_python" == "n" ];then
checkMacPython3
break
else
echo "输入错误,请重新输入(y/n)"
fi
done
else
:
fi
clearMacPythonEnv
return 1
fi
fi
fi
}
if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then
python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
if [[ $python_brief_version == "27" ]];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
if [[ $uncode == "" ]];then
uncode="mu"
else
uncode="m"
fi
fi
version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
if [ "$version_list" != "" ];then
break
function checkMacPythonVersion(){
while true
do
read -n1 -p "Step 3. 选择Python版本,请按回车键继续..."
echo
yellow " 2. 使用python 2.x"
yellow " 3. 使用python 3.x"
read -p " => 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V
if [[ "$python_V" == "" ]];then
python_V="2"
fi
if [[ "$python_V" == "2" ]];then
initCheckMacPython2
if [[ "$?" == "0" ]];then
checkMacPip
if [[ "$?" == "0" ]];then
return 0
else
:
fi
else
echo "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出"
fi
else
echo "输入错误,请重新输入"
fi
:
fi
elif [[ "$python_V" == "3" ]];then
initCheckMacPython3
if [[ "$?" == "0" ]];then
checkMacPip
if [[ "$?" == "0" ]];then
return 0
else
:
fi
else
:
fi
else
red "输入错误,请重新输入"
fi
done
}
function checkMacAVX(){
read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集,请按回车键继续..."
echo
if [[ $AVX != "" ]];then
AVX="avx"
echo "检测结果:支持"
echo ""
green " 检测结果:支持"
echo ""
return 0
else
read -n1 -p "检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
exit
red " 检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
echo
return 1
fi
echo
}
function checkMacGPU(){
read -n1 -p "Step 5. 选择CPU/GPU版本,请按回车键继续..."
echo
if [[ $GPU != "" ]];then
echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
yellow " MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
else
echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
yellow " MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
GPU=cpu
fi
echo
......@@ -822,38 +1017,44 @@ function macos() {
while true
do
checkMacPaddleVersion
checkMacPythonVersion
checkMacAVX
checkMacGPU
echo "*********************2. 开始安装*****************************"
green "*********************2. 开始安装*****************************"
echo
read -n1 -p "即将为您下载并安装PaddlePaddle,请按回车键继续..."
yellow "即将为您下载并安装PaddlePaddle,请按回车键继续..."
read -n1 -p ""
echo
if [[ $paddle_version == "2" ]];then
$python_root -m pip install paddlepaddle
if [ $? == "0" ];then
echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
if [[ $? == "0" ]];then
green "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break
else
rm $whl_cpu_release
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo""
echo "=========================================================================================="
echo""
exit 1
fi
else
if [ -f $whl_cpu_develop ];then
if [[ -f $whl_cpu_develop ]];then
$python_root -m pip install $whl_cpu_develop
if [ $? == "0" ];then
if [[ $? == "0" ]];then
rm -rf $whl_cpu_develop
echo "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
# TODO add install success check here
green "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break
else
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo""
echo "=========================================================================================="
echo""
......@@ -861,15 +1062,15 @@ function macos() {
fi
else
wget ${path}$whl_cpu_develop -O $whl_cpu_develop
if [ $? == "0" ];then
if [[ $? == "0" ]];then
$python_root -m pip install $whl_cpu_develop
if [ $? == "0" ];then
if [[ $? == "0" ]];then
rm $wheel_cpu_develop
echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
green "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break
else
rm $whl_cpu_release
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo""
echo "=========================================================================================="
echo""
......@@ -877,7 +1078,7 @@ function macos() {
fi
else
rm $whl_cpu_develop
echo "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
red "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
echo""
echo "=========================================================================================="
echo""
......@@ -890,33 +1091,35 @@ function macos() {
function main() {
echo "*********************************"
echo "欢迎使用PaddlePaddle快速安装脚本"
green "欢迎使用PaddlePaddle快速安装脚本"
echo "*********************************"
echo
echo "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑"
yellow "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑"
echo
echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括 1)安装前的准备和 2)开始安装 两部分"
echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括"
yellow "1)安装前的准备"
yellow "2)开始安装"
echo
read -n1 -p "请按回车键进行下一步..."
echo
echo
echo "*********************1. 安装前的准备*****************************"
green "*********************1. 安装前的准备*****************************"
echo
echo "Step 1. 正在检测您的操作系统信息..."
echo
SYSTEM=`uname -s`
if [ "$SYSTEM" == "Darwin" ];then
echo "您的系统为:MAC OSX"
if [[ "$SYSTEM" == "Darwin" ]];then
yellow " 您的系统为:MAC OSX"
echo
macos
else
echo "您的系统为:Linux"
yellow " 您的系统为:Linux"
echo
OS=`cat /etc/issue|awk 'NR==1 {print $1}'`
if [ $OS == "\S" ] || [ "$OS" == "CentOS" ] || [ $OS == "Ubuntu" ];then
if [[ $OS == "\S" ]] || [[ "$OS" == "CentOS" ]] || [[ $OS == "Ubuntu" ]];then
linux
else
echo "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档"
red "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档"
fi
fi
}
......
......@@ -25,4 +25,5 @@ import paddle.reader
import paddle.dataset
import paddle.batch
import paddle.compat
import paddle.distributed
batch = batch.batch
......@@ -161,7 +161,6 @@ def __bootstrap__():
'times_excess_than_required_tmp_allocation',
'enable_inplace_whitelist'
]
core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
......
......@@ -177,7 +177,10 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
if self._build_strategy.memory_optimize is True:
self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is True:
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len(
......
......@@ -63,10 +63,10 @@ Notes:
## 4. How to reproduce the results
* Small dataset
```bash
python python/paddle/fluid/contrib/tests/test_calibration.py
FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
```
* Full dataset
```bash
DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
```
......@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
endif()
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
if(src MATCHES "test_calibration")
py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
......@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224]
os.environ['FLAGS_use_mkldnn'] = 'True'
fluid.memory_optimize(fluid.default_main_program())
......@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
label = label.reshape([-1, 1])
running_program = calibrator.sampling_program.clone(
) if generate_int8 else infer_program.clone()
for op in running_program.current_block().ops:
if op.has_attr("use_mkldnn"):
op._set_attr("use_mkldnn", True)
t1 = time.time()
_, acc1, _ = exe.run(
......
......@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
class Executor(object):
"""
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
ParallelExecutor.
Python executor takes a program, add feed operators and fetch operators to this program according
An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
Python executor takes a program, adds feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all
the variables(or names) that user wants to get after program runs. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The contents in local scope may be discarded after every minibatch forward/backward
finished. But the global scope variables will be persistent through different runs.
Example:
.. code-block:: python
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Run the startup program once and only once.
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program())
# Run the main program directly without compile.
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=[loss.name])
# Or, compiled the program and run. See `CompiledProgram` for more detail.
compiled_prog = compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=loss.name)
loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
.. code-block:: python
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Run the startup program once and only once.
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program())
# Run the main program directly without compile.
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=[loss.name])
# Or, compiled the program and run. See `CompiledProgram` for more detail.
compiled_prog = compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=loss.name)
loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
"""
def __init__(self, place):
......@@ -382,6 +379,12 @@ class Executor(object):
]
return outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def close(self):
"""
Close this executor.
......@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to
the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example:
>>> cpu = core.CPUPlace()
......
......@@ -555,7 +555,8 @@ class OpProtoHolder(object):
return {
core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
core.op_proto_and_checker_maker.kOpNameScopeAttrName()
core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
}
......
......@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
dtype=slice_var.dtype,
persistable=True)
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
dim1_flatten = 1
if len(slice.shape) >= 2:
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
start = int(offset / dim1_flatten)
end = int(offset / dim1_flatten + slice.shape[0])
......@@ -892,7 +895,7 @@ def save_inference_model(dirname,
True is supported.
Returns:
None
target_var_name_list(list): The fetch variables' name list
Raises:
ValueError: If `feed_var_names` is not a list of basestring.
......@@ -945,11 +948,13 @@ def save_inference_model(dirname,
# TODO(Superjomn) add an IR pass to remove 1-scale op.
with program_guard(main_program):
uniq_target_vars = []
for var in target_vars:
for i, var in enumerate(target_vars):
if isinstance(var, Variable):
var1 = layers.scale(var, 1.)
uniq_target_vars.append(var1)
var = layers.scale(
var, 1., name="save_infer_model/scale_{}".format(i))
uniq_target_vars.append(var)
target_vars = uniq_target_vars
target_var_name_list = [var.name for var in target_vars]
# when a pserver and a trainer running on the same machine, mkdir may conflict
try:
......@@ -1006,6 +1011,7 @@ def save_inference_model(dirname,
params_filename = os.path.basename(params_filename)
save_persistables(executor, dirname, main_program, params_filename)
return target_var_name_list
def load_inference_model(dirname,
......
......@@ -506,9 +506,9 @@ class While(object):
while loop control flow.
Args:
cond (Variable): condition used to compare.
cond(Variable): condition used to compare.
is_test(bool): A flag indicating whether execution is in test phase.
name (str): The name of this layer.
name(str): The name of this layer.
Examples:
.. code-block:: python
......@@ -589,7 +589,8 @@ class While(object):
def lod_rank_table(x, level=0):
"""LoD Rank Table Operator. Given an input variable **x** and a level number
"""
LoD Rank Table Operator. Given an input variable **x** and a level number
of LoD, this layer creates a LodRankTable object. A LoDRankTable object
contains a list of bi-element tuples. Each tuple consists of an index and
a length, both of which are int type. Refering to specified level of LoD,
......@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
return cond
def equal(x, y, cond=None, **ignored):
def equal(x, y, cond=None):
"""
**equal**
This layer returns the truth value of :math:`x == y` elementwise.
Args:
......@@ -1458,7 +1457,6 @@ class DynamicRNN(object):
Returns:
The current timestep in the input sequence.
"""
self._assert_in_rnn_block_("step_input")
if not isinstance(x, Variable):
......@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
@signature_safe_contextmanager
def block(self):
"""
The block for user to define operators in RNN. See the class docstring
for more details.
The block for user to define operators in RNN.
"""
if self.status != DynamicRNN.BEFORE_RNN:
raise ValueError("rnn.block() can only be invoke once")
......@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
dtype(str|numpy.dtype): The data type of the initialized memory.
Returns:
the memory variable.
The memory variable.
"""
self._assert_in_rnn_block_('memory')
self._init_zero_idx_()
......@@ -1740,7 +1736,7 @@ class DynamicRNN(object):
def output(self, *outputs):
"""
mark the RNN output variables.
Mark the RNN output variables.
Args:
outputs: The output variables.
......
......@@ -545,15 +545,16 @@ def yolov3_loss(x,
TypeError: Attr ignore_thresh of yolov3_loss must be a float number
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchors = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors,
ignore_thresh=0.5, downsample_ratio=32)
.. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32)
"""
helper = LayerHelper('yolov3_loss', **locals())
......
......@@ -56,7 +56,10 @@ def data(name,
Args:
name(str): The name/alias of the function
shape(list): Tuple declaring the shape.
shape(list): Tuple declaring the shape. If :code:`append_batch_size` is
True and there is no -1 inside :code:`shape`, it should be
considered as the shape of the each sample. Otherwise, it
should be considered as the shape of the batched data.
append_batch_size(bool):
1. If true, it prepends -1 to the shape.
For example if shape=[1], the resulting shape is [-1, 1].
......
......@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
from ..layer_helper import LayerHelper
__all__ = [
'deprecated', 'generate_layer_fn', 'generate_layer_fn_noattr', 'autodoc',
'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
'templatedoc'
]
......@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
buf.write('\n')
skip_attrs = OpProtoHolder.generated_op_attr_names()
# attr use_mkldnn and is_test also should not be visible to users.
skip_attrs.add("use_mkldnn")
skip_attrs.add("is_test")
for each_attr in op_proto.attrs:
if each_attr.name in skip_attrs:
......@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
return func
def generate_layer_fn_noattr(op_type):
def generate_activation_fn(op_type):
"""Register the Python layer for an Operator without Attribute.
Args:
......@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):
func.__name__ = op_type
func.__doc__ = _generate_doc_string_(op_proto)
return func
......
......@@ -2441,7 +2441,7 @@ def pool2d(input,
data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.pool2d(
pool2d = fluid.layers.pool2d(
input=data,
pool_size=2,
pool_type='max',
......@@ -2490,6 +2490,7 @@ def pool2d(input,
return pool_out
@templatedoc()
def pool3d(input,
pool_size=-1,
pool_type="max",
......@@ -2501,13 +2502,19 @@ def pool3d(input,
name=None,
exclusive=True):
"""
This function adds the operator for pooling in 3-dimensions, using the
pooling configurations mentioned in input parameters.
${comment}
Args:
input (Variable): ${input_comment}
pool_size (int): ${ksize_comment}
pool_type (str): ${pooling_type_comment}
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCDHW, where N is batch size, C is
the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width
of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
pool_type (string): ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
global_pooling (bool): ${global_pooling_comment}
......@@ -2520,6 +2527,19 @@ def pool3d(input,
Returns:
Variable: output of pool3d layer.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool3d = fluid.layers.pool3d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
"""
if pool_type not in ["max", "avg"]:
raise ValueError(
......@@ -2569,7 +2589,27 @@ def adaptive_pool2d(input,
require_index=False,
name=None):
"""
${comment}
**Adaptive Pool2d Operator**
The adaptive_pool2d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(pool_size) should contain two elements which
represent height and width, respectively. Also the H and W dimensions of output(Out)
is same as Parameter(pool_size).
For average adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Args:
input (Variable): The input tensor of pooling operator. The format of
......@@ -2579,8 +2619,8 @@ def adaptive_pool2d(input,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point along with outputs.
it cannot be set in average pooling type.
require_index (bool): If true, the index of max pooling point will be returned along
with outputs. It cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
......@@ -2661,18 +2701,42 @@ def adaptive_pool3d(input,
require_index=False,
name=None):
"""
${comment}
**Adaptive Pool3d Operator**
The adaptive_pool3d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(pool_size) should contain
three elements which represent height and width, respectively. Also the D, H and W
dimensions of output(Out) is same as Parameter(pool_size).
For average adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args:
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is
the number of channels, H is the height of the
feature, and W is the width of the feature.
input tensor is NCDHW, where N is batch size, C is
the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (Depth, Height, Width).
it must contain three integers, (Depth, Height, Width).
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point along with outputs.
it cannot be set in average pooling type.
require_index (bool): If true, the index of max pooling point will be returned along
with outputs. It cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
......@@ -2709,7 +2773,7 @@ def adaptive_pool3d(input,
name='data', shape=[3, 32, 32], dtype='float32')
pool_out, mask = fluid.layers.adaptive_pool3d(
input=data,
pool_size=[3, 3],
pool_size=[3, 3, 3],
pool_type='avg')
"""
if pool_type not in ["max", "avg"]:
......@@ -2930,6 +2994,7 @@ def batch_norm(input,
"momentum": momentum,
"epsilon": epsilon,
"is_test": is_test,
"data_layout": data_layout,
"use_mkldnn": False,
"fuse_with_relu": fuse_with_relu,
"use_global_stats": use_global_stats
......@@ -3235,7 +3300,7 @@ def group_norm(input,
# create output
mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
group_norm_out = helper.create_variable(dtype)
group_norm_out = helper.create_variable(dtype=dtype)
helper.append_op(
type="group_norm",
......
......@@ -14,7 +14,7 @@
from __future__ import print_function
import os
from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
from .layer_function_generator import generate_layer_fn, generate_activation_fn
from .. import core
from ..framework import convert_np_dtype_to_dtype_
......@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
__all__ += __activations_noattr__
for _OP in set(__activations_noattr__):
globals()[_OP] = generate_layer_fn_noattr(_OP)
globals()[_OP] = generate_activation_fn(_OP)
__all__ += ["uniform_random"]
......
......@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer):
Args:
learning_rate (float|Variable): global learning rate.
l1 (float):
l2 (float):
lr_power (float):
l1 (float): L1 regularization strength.
l2 (float): L2 regularization strength.
lr_power (float): Learning Rate Power.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix.
......
......@@ -148,7 +148,10 @@ class ParallelExecutor(object):
else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
build_strategy.enable_inplace = False if main._is_mem_optimized else True
if build_strategy.memory_optimize is True:
build_strategy.memory_optimize = False if main._is_mem_optimized else True
if build_strategy.enable_inplace is True:
build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
......
......@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
list(REMOVE_ITEM TEST_OPS test_imperative_optimizer)
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
......@@ -107,10 +108,18 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
if(NOT WIN32)
py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
endif()
if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# change the timeout from 600 to 1200, because in debug mode, this test need more time.
set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200)
endif()
endif()
if (WITH_NGRAPH)
add_subdirectory(ngraph)
endif()
......
......@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
build_strategy.memory_optimize = use_ir_memory_optimize
build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
# python memory optimization is conflict with inplace pass.
# Use ir graph memory optimization after inplace pass is the correct way.
build_strategy.enable_inplace = False if memory_opt else enable_inplace
......
......@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
regularization=fluid.regularizer.L2Decay(1e-6))
return optimizer
# NOTE(dzh):
# need to make it compatible with elewise fuse act
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
......@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda,
fuse_elewise_add_act_ops=False,
memory_opt=False,
use_ir_memory_optimize=False,
optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
model,
......@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda,
fuse_elewise_add_act_ops=True,
memory_opt=False,
use_ir_memory_optimize=False,
optimizer=_optimizer)
for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
......
......@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
......@@ -63,5 +64,54 @@ class TestCase2(TestSliceOp):
self.out = self.input[-3:3, 0:100, :, 2:-1]
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16(TestSliceOp):
def config(self):
self.dtype = "float16"
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
self.starts = [-3, 0, 2]
self.ends = [3, 100, -1]
self.axes = [0, 1, 3]
self.out = self.input[-3:3, 0:100, :, 2:-1]
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-5)
def test_check_grad_normal(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['Input'], 'Out', max_relative_error=0.006)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16_2(TestSliceOp):
def config(self):
self.dtype = "float16"
self.input = np.random.random([3, 4, 5]).astype(self.dtype)
self.starts = [0]
self.ends = [1]
self.axes = [1]
self.out = self.input[:, 0:1, :]
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-5)
def test_check_grad_normal(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['Input'],
'Out',
max_relative_error=0.006,
numeric_grad_delta=0.5)
if __name__ == '__main__':
unittest.main()
......@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
skip_dim0 = 0
slice_vars = self.param_var_mapping[orig_var_name]
orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:])
orig_dim1_flatten = 1
if len(slice_vars[0].shape) >= 2:
orig_dim1_flatten = reduce(lambda x, y: x * y,
slice_vars[0].shape[1:])
for slice_var in slice_vars[:block_idx]:
skip_dim0 += slice_var.shape[0]
......
requests==2.9.2
numpy>=1.12
protobuf==3.1
protobuf>=3.1.0
recordio>=0.1.0
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
rarfile
......
......@@ -100,6 +100,7 @@ packages=['paddle',
'paddle.utils',
'paddle.dataset',
'paddle.reader',
'paddle.distributed',
'paddle.fluid',
'paddle.fluid.imperative',
'paddle.fluid.proto',
......
......@@ -17,7 +17,6 @@ OPENSSL_ROOT=openssl-1.1.0i
OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
CURL_ROOT=curl-7.49.1
CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1
AUTOCONF_ROOT=autoconf-2.69
......@@ -107,11 +106,7 @@ curl-config --features
rm -rf /usr/local/ssl
# Install patchelf (latest with unreleased bug fixes)
curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
tar -xzf patchelf-0.9njs2.tar.gz
(cd patchelf-0.9njs2 && ./configure && make && make install)
rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
yum install -y patchelf
# Install latest pypi release of auditwheel
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import subprocess
import os
import sys
import time
import argparse
default_envs = {
"PADDLE_TRAINER_ENDPOINTS":
"127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"PATH": os.getenv("PATH"),
"LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
"PADDLE_TRAINERS_NUM": "8",
"NCCL_DEBUG": "INFO",
"GLOG_v": "0",
"NCCL_SOCKET_IFNAME": "eth0",
"NCCL_IB_GID_INDEX": "3",
"NCCL_IB_RETRY_CNT": "0",
}
GPUS = 8
def start_procs(gpus, cmd, log_dir):
procs = []
log_fns = []
os.system("mkdir -p %s" % log_dir)
# ======== update parent envs =======
for k, v in os.environ.items():
if k.startswith("FLAGS_") or k.startswith("NCCL_") or \
k.startswith("GLOG_"):
default_envs[k] = v
# ======== for dist training =======
node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
current_ip = os.getenv("POD_IP", "127.0.0.1")
trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
num_nodes = len(trainer_ips)
all_nodes_devices_endpoints = ""
for n in trainer_ips:
for i in range(gpus):
if all_nodes_devices_endpoints:
all_nodes_devices_endpoints += ","
all_nodes_devices_endpoints += "%s:617%d" % (n, i)
nranks = num_nodes * gpus
# ======== for dist training =======
for i in range(gpus):
curr_env = {}
curr_env.update(default_envs)
curr_env.update({
"FLAGS_selected_gpus": "%d" % i,
"PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
"PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
# nranks
"PADDLE_TRAINERS_NUM": "%d" % nranks,
"PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
})
print("starting process ", i, cmd, curr_env)
fn = open("%s/workerlog.%d" % (log_dir, i), "w")
log_fns.append(fn)
procs.append(
subprocess.Popen(
cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))
for i in range(gpus):
try:
procs[i].communicate()
procs[i].terminate()
log_fns[i].close()
except:
pass
def main():
parser = argparse.ArgumentParser(
description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
parser.add_argument(
'--gpus',
type=int,
default=8,
help='start number of processes for every gpu')
parser.add_argument(
'--cmd',
type=str,
default="",
help='command to run for each process, e.g. python train.py --lr 0.1')
parser.add_argument(
'--log_dir',
type=str,
default="mylog",
help='directory to put logs per process.')
args = parser.parse_args()
if args.cmd == "":
parser.print_help()
exit(0)
start_procs(args.gpus, args.cmd, args.log_dir)
if __name__ == "__main__":
main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册