提交 f79a3a83 编写于 作者: S sneaxiy

merge release/1.3

上级 d2d3f2b5
...@@ -231,7 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -231,7 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
) )
ENDFUNCTION() ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1) SET(PROTOBUF_VERSION 3.1.0)
IF(NOT PROTOBUF_FOUND) IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE) build_protobuf(extern_protobuf FALSE)
......
...@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag) ...@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag)
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
endif() endif()
endif() endif()
endif() endif()
endfunction() endfunction()
...@@ -147,6 +147,7 @@ set(GPU_COMMON_FLAGS ...@@ -147,6 +147,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array -Wno-error=array-bounds # Warnings in Eigen::array
) )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
endif(NOT WIN32) endif(NOT WIN32)
if (APPLE) if (APPLE)
......
...@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N ...@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)) paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
......
...@@ -50,12 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ ...@@ -50,12 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
else()
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
endif()
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
...@@ -67,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he ...@@ -67,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU) if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif() endif()
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph) cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......
...@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
new std::vector<OpDesc *>(main_program.Block(0).AllOps()); new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs, graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership all_op_descs); // take ownership
graph->Set<GraphNodePool>(kGraphNodePool,
new GraphNodePool); // take ownership
pass->Erase(kAllOpDescs); pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs); pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
...@@ -242,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -242,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
continue; continue;
} }
} }
VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph)); graph = pass->Apply(std::move(graph));
VLOG(3) << "Finish Apply Pass " << pass->Type();
} }
return graph; return graph;
} }
......
...@@ -49,7 +49,7 @@ DEFINE_bool( ...@@ -49,7 +49,7 @@ DEFINE_bool(
"If this option turns on, only these op in whitelist can be inplaced." "If this option turns on, only these op in whitelist can be inplaced."
"If it turns off, all of the running op can be candidate of inplaced op." "If it turns off, all of the running op can be candidate of inplaced op."
"Such as scale, elementwise_add" "Such as scale, elementwise_add"
"By default, it's turned on"); "By default, it's turned off");
DECLARE_string(memory_optimize_debug); DECLARE_string(memory_optimize_debug);
...@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var, ...@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
} }
} }
const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, const NodeSwapQueue InplacePass::TryInplaceModifyVar(
const std::string& cache_var, const std::string& var, const std::string& cache_var, const size_t& idx,
const size_t& idx, ir::Graph* graph) const {
ir::Graph* graph) const {
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
var_nodes_[var].at(0)->Var() != nullptr); var_nodes_[var].at(0)->Var() != nullptr);
std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
var_desc->SetName(cache_var); var_desc->SetName(cache_var);
SSANodePair swap_nodes; NodeSwapQueue swap_nodes;
for (size_t i = idx; i < view_.AllOps().size(); ++i) { for (size_t i = idx; i < view_.AllOps().size(); ++i) {
auto* op = view_.AllOps()[i]; auto* op = view_.AllOps()[i];
...@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var, ...@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
return swap_nodes; return swap_nodes;
} }
void InplacePass::CommitModify(const SSANodePair& swap_nodes, void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
ir::Graph* graph) const { ir::Graph* graph) const {
for (auto& pair : swap_nodes) { for (auto& pair : swap_nodes) {
auto *node = pair.first, *cache_node = pair.second; auto *node = pair.first, *cache_node = pair.second;
...@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes, ...@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
} }
} }
void InplacePass::WithdrawModify(const SSANodePair& nodes, void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
ir::Graph* graph) const { ir::Graph* graph) const {
for (auto& pair : nodes) { for (auto& pair : nodes) {
auto *node = pair.first, *cache_node = pair.second; auto *node = pair.first, *cache_node = pair.second;
...@@ -403,18 +402,20 @@ void GraphView::Build(ir::Graph* g) { ...@@ -403,18 +402,20 @@ void GraphView::Build(ir::Graph* g) {
// 2. track the nodes which used by parameter server. // 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer // these node can not be inplaced, otherwise trainer
// pserver can not find each other name. // pserver can not find each other name.
for (auto& node : g->Nodes()) { auto update_skip_set = [&](ir::Node* node) {
if (!node->IsOp()) continue; for (auto& in : node->inputs) {
if (node->Name() == "send") { if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
for (auto& in : node->inputs) {
dup_nodes_.emplace(in->Name());
}
} }
if (node->Name() == "recv") { for (auto& out : node->outputs) {
for (auto& out : node->outputs) { if (out->IsVar() && out->Var() != nullptr)
dup_nodes_.emplace(out->Name()); dup_nodes_.emplace(out->Name());
}
} }
};
for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue;
if (node->Name() == "send") update_skip_set(node);
if (node->Name() == "recv") update_skip_set(node);
if (node->Name() == "prefetch") update_skip_set(node);
} }
} }
......
...@@ -56,7 +56,8 @@ class GraphView { ...@@ -56,7 +56,8 @@ class GraphView {
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_; std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
}; };
typedef std::vector<std::pair<ir::Node*, ir::Node*>> SSANodePair; // swap pairs in sequence
typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;
class InplacePass : public ir::Pass { class InplacePass : public ir::Pass {
public: public:
InplacePass(); InplacePass();
...@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass { ...@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
void InitSSAGraphNodes() const; void InitSSAGraphNodes() const;
private: private:
const SSANodePair TryInplaceModifyVar(const std::string& var, const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
const std::string& cache_var, const std::string& cache_var,
const size_t& idx, const size_t& idx,
ir::Graph* graph) const; ir::Graph* graph) const;
void CommitModify(const SSANodePair&, ir::Graph* graph) const; void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;
void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const; void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;
void InplaceModifyDesc(const std::string& in_var, const std::string& out_var, void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
const size_t& idx) const; const size_t& idx) const;
......
...@@ -13,17 +13,114 @@ ...@@ -13,17 +13,114 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <deque>
#include <functional> #include <functional>
#include <iostream> #include <iterator>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif // PADDLE_WITH_CUDA
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
using paddle::framework::VarDesc;
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
std::deque<ir::Node*> ops;
FilterVariables(nodes, [&](ir::Node* op) {
if (op->IsOp() && op->Op() != nullptr) {
ops.emplace_back(op);
}
});
std::unordered_map<ir::Node*, size_t> op_deps;
std::list<ir::Node*> ready_ops;
std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
for (auto* op : ops) {
std::unordered_set<ir::Node*> preceding_op;
for (auto* in : op->inputs) {
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
preceding_op.emplace(in->inputs[0]);
pending_ops[in->inputs[0]].emplace(op);
}
op_deps[op] = preceding_op.size();
if (preceding_op.empty()) {
ready_ops.emplace_back(op);
}
}
// 3. generated op list based desc order and the topology order
std::vector<ir::Node*> ret;
std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
auto update_by_found_node = [&](ir::Node* found_node) {
for (auto* pending_op : pending_ops[found_node]) {
if (--op_deps[pending_op] == 0) {
ready_ops.emplace_back(pending_op);
}
}
ready_ops.remove(found_node);
ret.emplace_back(found_node);
};
while (!ready_ops.empty()) {
bool all_of_ready_op_unmatched = true;
for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
auto op_desc = *it;
ir::Node* found_node = nullptr;
for (auto* op : ready_ops) {
if (IsSameDesc(op->Op(), op_desc)) {
found_node = op;
break;
}
}
// 3.1 op desc deleted by other pass
if (found_node == nullptr) {
++it;
continue;
} else {
all_of_ready_op_unmatched = false;
it = op_descs_list.erase(it);
}
update_by_found_node(found_node);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std::list<ir::Node*> prev_ready_ops(ready_ops);
if (all_of_ready_op_unmatched) {
for (auto op : prev_ready_ops) {
update_by_found_node(op);
}
}
}
PADDLE_ENFORCE(std::all_of(
op_deps.begin(), op_deps.end(),
[&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
return ret;
}
size_t NodeSizeInBytes(const VarDesc& node) { size_t NodeSize(const VarDesc& node) {
auto shape = node.GetShape(); auto shape = node.GetShape();
int size = int size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()); std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
...@@ -31,9 +128,9 @@ size_t NodeSizeInBytes(const VarDesc& node) { ...@@ -31,9 +128,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
return type_size * std::abs(size); return type_size * std::abs(size);
} }
size_t NodeSizeInBytes(ir::Node* n) { size_t NodeSize(ir::Node* n) {
auto* desc = FindVarDescInBlock(n); auto* desc = FindVarDescInBlock(n);
return NodeSizeInBytes(*desc); return NodeSize(*desc);
} }
std::string DebugStringImpl(VarDesc* var) { std::string DebugStringImpl(VarDesc* var) {
...@@ -59,7 +156,6 @@ std::string DebugStringImpl(VarDesc* var) { ...@@ -59,7 +156,6 @@ std::string DebugStringImpl(VarDesc* var) {
std::string DebugString(ir::Node* var) { std::string DebugString(ir::Node* var) {
return DebugStringImpl(FindVarDescInBlock(var)); return DebugStringImpl(FindVarDescInBlock(var));
} }
// return DebugString(var->Var()); }
// NOTE(dzh): based ir node, if a large node has been reused // NOTE(dzh): based ir node, if a large node has been reused
// by a small size node, then next time it appear in pool, it will // by a small size node, then next time it appear in pool, it will
...@@ -76,22 +172,26 @@ struct NodeComparator { ...@@ -76,22 +172,26 @@ struct NodeComparator {
bool operator()(ir::Node* lhs, ir::Node* rhs) const { bool operator()(ir::Node* lhs, ir::Node* rhs) const {
auto* lhs_desc = FindVarDescInBlock(lhs); auto* lhs_desc = FindVarDescInBlock(lhs);
auto* rhs_desc = FindVarDescInBlock(rhs); auto* rhs_desc = FindVarDescInBlock(rhs);
// match data type
if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
return false;
}
// match shape
auto lhs_shape = lhs_desc->GetShape(); auto lhs_shape = lhs_desc->GetShape();
auto rhs_shape = rhs_desc->GetShape(); auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) { (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); return NodeSize(lhs) <= NodeSize(rhs);
} else { } else {
return false; return false;
} }
} }
}; };
void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { void OrderedSet::Insert(ir::Node* var) {
PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
PADDLE_ENFORCE(op->IsOp());
if (mark_table_.count(var->Name()) != 0) { if (mark_table_.count(var->Name()) != 0) {
mark_table_[var->Name()]->second.insert(op); mark_table_[var->Name()]->emplace_back(var);
return; return;
} }
...@@ -99,14 +199,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { ...@@ -99,14 +199,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
auto var_shape = var_desc->GetShape(); auto var_shape = var_desc->GetShape();
int batch_size = static_cast<int>(var_shape[0]); int batch_size = static_cast<int>(var_shape[0]);
NodeComparator compare_node; NodeComparator functor;
Iter it = nodes_.begin(); Iter it = nodes_.begin();
while (it != nodes_.end()) { while (it != nodes_.end()) {
auto* cache_desc = FindVarDescInBlock(it->first); auto& prev = it->front();
auto* cache_desc = FindVarDescInBlock(prev);
int cache_batch_size = cache_desc->GetShape()[0]; int cache_batch_size = cache_desc->GetShape()[0];
if ((cache_batch_size == -1 && batch_size == -1) || if ((cache_batch_size == -1 && batch_size == -1) ||
(cache_batch_size != -1 && batch_size != -1)) { (cache_batch_size != -1 && batch_size != -1)) {
if (compare_node(it->first, var)) { if (functor(prev, var)) {
++it; ++it;
} else { } else {
break; break;
...@@ -118,62 +219,127 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) { ...@@ -118,62 +219,127 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
} }
} }
it = it = nodes_.insert(it, {var});
nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
mark_table_[var->Name()] = it; mark_table_[var->Name()] = it;
} }
int OrderedNodeList::GetIndex(ir::Node* var) { int OrderedSet::GetNodeIndexInPool(ir::Node* var) {
return std::distance(nodes_.begin(), mark_table_[var->Name()]); return std::distance(nodes_.begin(), mark_table_[var->Name()]);
} }
ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const { ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
ir::Node* found_node = nullptr; ir::Node* found_node = nullptr;
NodeComparator compare_node; NodeComparator functor;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
if (compare_node(var, it->first)) { auto& candidate = it->front();
found_node = it->first; if (functor(var, candidate)) {
found_node = candidate;
break; break;
} }
} }
return found_node; return found_node;
} }
void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); } ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
ir::Node* found_node = nullptr;
NodeComparator functor;
auto it =
std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
if (v.front() == prev)
return true;
else
return false;
});
PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
for (it = std::next(it); it != nodes_.end(); ++it) {
auto& candidate = it->front();
if (functor(var, candidate)) {
found_node = candidate;
break;
}
}
return found_node;
}
bool OrderedSet::Has(ir::Node* var) const {
if (mark_table_.count(var->Name())) {
auto& node_in_samename = mark_table_.at(var->Name());
auto iter =
std::find_if(node_in_samename->begin(), node_in_samename->end(),
[&](ir::Node* n) { return n->Name() == var->Name(); });
return iter != node_in_samename->end();
}
return false;
}
void OrderedNodeList::Erase(const std::string& var) { void OrderedSet::Erase(const std::string& var) {
PADDLE_ENFORCE(mark_table_.count(var)); PADDLE_ENFORCE(mark_table_.count(var));
nodes_.erase(mark_table_[var]); nodes_.erase(mark_table_[var]);
mark_table_.erase(var); mark_table_.erase(var);
} }
std::string OrderedNodeList::ToString() const { void OrderedSet::Erase(ir::Node* var) {
PADDLE_ENFORCE(var != nullptr);
Erase(var->Name());
}
std::string OrderedSet::ToString() const {
std::stringstream ss; std::stringstream ss;
for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
ss << DebugString(it->first) << " "; for (auto& node : *it) {
ss << DebugString(node) << " ";
}
} }
return ss.str(); return ss.str();
} }
bool NodeCanReused(ir::Node* node) { bool NodeCanReused(ir::Node* node) {
// valid the node is a var node
if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
// auto* desc = node->Var();
bool flag = NodeCanReused(*node->Var()); bool flag = true;
// op output force generated in cpu, can not be reused.
for (auto* op : node->inputs) { for (auto* op : node->inputs) {
if (op->Op()->HasAttr("force_cpu")) { if (op->Op()->HasAttr("force_cpu")) {
// op output force generated in cpu, can not be reused.
flag &= framework::AttrReader(op->Op()->GetAttrMap()) flag &= framework::AttrReader(op->Op()->GetAttrMap())
.Get<bool>("force_cpu") == 0; .Get<bool>("force_cpu") == 0;
} }
} }
// var desc validation.
flag &= NodeCanReused(*node->Var());
return flag; return flag;
} }
int MinChunkSize() {
int size{0};
#ifdef PADDLE_WITH_CUDA
size = platform::GpuMinChunkSize();
#else
size = platform::CpuMinChunkSize();
#endif // PADDLE_WITH_CUDA
return size;
}
bool NodeCanReused(const VarDesc& node) { bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType(); auto type = node.GetType();
if (node.Persistable() || type != proto::VarType::LOD_TENSOR || // only these types holds bulk of gpu memory
node.GetShape().empty()) { if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false;
}
// persistable variable is parameter
if (node.Persistable()) {
return false;
}
// shape < min_chunk_size is meaningless.
// further more, fetched loss always has size = 1
// which should not be reused.
auto shape = node.GetShape();
int size = std::abs(
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
if (shape.empty() || size < MinChunkSize()) {
return false; return false;
} }
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
...@@ -193,6 +359,176 @@ bool OpHasSubBlock(OpDesc* desc) { ...@@ -193,6 +359,176 @@ bool OpHasSubBlock(OpDesc* desc) {
return false; return false;
} }
ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
ops_ = SortOpLikeDescOrder(graph);
ConnectNodes();
}
void ControlFlowGraph::BuildCFGGraph() {
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for (ir::Node* op : ops_) {
for (auto& input_var : op->inputs) {
if (!input_var->inputs.empty()) {
PADDLE_ENFORCE(
input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
auto* pred_op = input_var->inputs[0];
if (pred_op->Op() != nullptr) {
predecessors_[op].insert(pred_op);
successors_[pred_op].insert(op);
}
}
if (input_var->IsVar() && !input_var->IsCtrlVar()) {
uses_[op].insert(input_var->Name());
}
}
for (auto& output_var : op->outputs) {
// output var may be used by many op
for (auto* succ_op : output_var->outputs) {
if (succ_op->Op() != nullptr) {
successors_[op].insert(succ_op);
predecessors_[succ_op].insert(op);
}
}
if (output_var->IsVar() && !output_var->IsCtrlVar()) {
defs_[op].insert(output_var->Name());
}
}
}
}
void ControlFlowGraph::ConnectNodes() {
for (size_t i = 0; i < ops_.size(); ++i) {
auto& op = ops_[i];
try {
auto& next_op = ops_.at(i + 1);
successors_[op].insert(next_op);
predecessors_[next_op].insert(op);
} catch (...) {
// do nothing
}
FilterVariables(op->inputs,
[&](ir::Node* var) { uses_[op].emplace(var->Name()); });
FilterVariables(op->outputs,
[&](ir::Node* var) { defs_[op].emplace(var->Name()); });
}
}
void ControlFlowGraph::LiveVariableAnalysis() {
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
while (!work_list.empty()) {
ir::Node* op = work_list.front();
work_list.pop_front();
// get the live_in calculated before. Empty if first.
auto prev_live_in = std::move(live_in_[op]);
for (auto& s : successors_[op]) {
for (auto& var : live_in_[s]) {
live_out_[op].insert(var);
}
}
for (auto& var : uses_[op]) {
live_in_[op].insert(var);
}
for (auto& var : live_out_[op]) {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
live_in_[op].erase(var);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if (live_in_[op] != prev_live_in) {
for (auto& pre : predecessors_[op]) {
work_list.push_back(pre);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
if (uses_[op].find(old_node) != uses_[op].end()) {
uses_[op].erase(old_node);
uses_[op].insert(new_node);
}
if (defs_[op].find(old_node) != defs_[op].end()) {
defs_[op].erase(old_node);
defs_[op].insert(new_node);
}
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
ir::Node* op) const {
// in ssa-graph, different version nodes have same name,
// this function get the latest version var before target op
// It may return nullptr, such as data node.
ir::Node* found_node = nullptr;
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
PADDLE_ENFORCE((output != nullptr && output->IsVar()),
"Output is empty!");
if (output->Var() && output->Name() == name) {
found_node = output;
}
}
}
return found_node;
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#include <iostream> #include <iostream>
#include <iterator> #include <iterator>
#include <list> #include <list>
#include <map>
#include <set>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -27,41 +29,43 @@ namespace paddle { ...@@ -27,41 +29,43 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kFetchedVars[] = "fetched_vars"; constexpr char kAllOpDescs[] = "all_op_descs";
constexpr char kGraphNodePool[] = "graph_node_pool";
// NOTE(dzh): Variable and the operators use the var. std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// for early delete pass.
// Because analysis var pass build base on ir::Node, which maybe released
// or modified between passes, so we use OpDesc* to mark ops.
using GraphNodePool = std::vector<
std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). // NOTE(dzh): A ordered set for node reuse in memory optimize.
// in fluid, -1 means the batch_size is determined in runtime. // the orderedset sort node in ascend order(by node bytes size).
// the node batch_size equal -1 always ranking in the front than the node not. // in fluid, -1 means the batch_size, which is determined in runtime.
// So the reuse happens between nodes who's batch_size both are -1
// simultaneously or not.
//
// sort rule:
// rule 0 : smaller node ranking in front.
// rule 1 : batch_size equal -1 ranking in the front than the node not.
//
// For example, // For example,
// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
// O(1) insert, delete
class OrderedNodeList {
public:
using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
using Iter = typename std::list<NodePair>::iterator;
using ConstIter = typename std::list<NodePair>::const_iterator;
void Insert(ir::Node* var, ir::Node* op); class OrderedSet {
public:
// nodes with same name exists in pool.
using NodeVector = std::vector<ir::Node*>;
using Iter = typename std::list<NodeVector>::iterator;
using ConstIter = typename std::list<NodeVector>::const_iterator;
void Insert(ir::Node* var);
void Erase(ir::Node* var); void Erase(ir::Node* var);
void Erase(const std::string& var); void Erase(const std::string& var);
bool Has(ir::Node* var) const;
bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } void Clear() {
mark_table_.clear();
bool Has(const std::string& var) { return mark_table_.count(var); } nodes_.clear();
}
ir::Node* NodeMatch(ir::Node* var) const; // find the bestfit shape node block with var.
ir::Node* FindBestFitNode(ir::Node* var) const;
ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
// map store non-const iterator, can not promise const // map store non-const iterator, can not promise const
int GetIndex(ir::Node* var); int GetNodeIndexInPool(ir::Node* var);
// pool all node to string // pool all node to string
std::string ToString() const; std::string ToString() const;
...@@ -69,18 +73,54 @@ class OrderedNodeList { ...@@ -69,18 +73,54 @@ class OrderedNodeList {
Iter end() { return nodes_.end(); } Iter end() { return nodes_.end(); }
ConstIter begin() const { return nodes_.begin(); } ConstIter begin() const { return nodes_.begin(); }
ConstIter end() const { return nodes_.end(); } ConstIter end() const { return nodes_.end(); }
size_t size() const { return nodes_.size(); }
void Clear() { size_t size() const { return nodes_.size(); }
mark_table_.clear();
nodes_.clear();
}
private: private:
// for searching. // for searching.
std::unordered_map<std::string, Iter> mark_table_; std::unordered_map<std::string, Iter> mark_table_;
// node swap pairs. var -> ops dep var // node pool
std::list<NodePair> nodes_; std::list<NodeVector> nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// IR Graph
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successors ops use the output variables.
NodeListMap successors_;
// predecessors ops generated input variables.
NodeListMap predecessors_;
// variables lived before run current op.
VarSetMap live_in_;
// variables lived after run current op.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
}; };
// valid a tensor can be reuse or not // valid a tensor can be reuse or not
...@@ -93,15 +133,24 @@ bool NodeCanReused(const VarDesc& node); ...@@ -93,15 +133,24 @@ bool NodeCanReused(const VarDesc& node);
bool OpHasSubBlock(OpDesc* desc); bool OpHasSubBlock(OpDesc* desc);
// node memory size in bytes // node memory size in bytes
size_t NodeSizeInBytes(ir::Node* n); size_t NodeSize(ir::Node* n);
// node memory size in bytes // node memory size in bytes
size_t NodeSizeInBytes(const VarDesc&); size_t NodeSize(const VarDesc&);
std::string DebugString(ir::Node* var); std::string DebugString(ir::Node* var);
// NOTE(dzhwinter)
// after node reuse, the replaced node shape is
// different with its VarDesc. So need to find the
// correct VarDesc in Block.
VarDesc* FindVarDescInBlock(ir::Node* n); VarDesc* FindVarDescInBlock(ir::Node* n);
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
template <typename Container, typename Callback> template <typename Container, typename Callback>
class FilterVariableImpl { class FilterVariableImpl {
public: public:
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <iterator>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include <string> #include <string>
...@@ -22,13 +23,19 @@ ...@@ -22,13 +23,19 @@
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
TEST(OrderedNodeList, Normal) { TEST(OrderedSet, Normal) {
OrderedNodeList pool; OrderedSet pool;
std::vector<std::unique_ptr<ir::Node>> nodes; std::vector<std::unique_ptr<ir::Node>> nodes;
// clang-format off // clang-format off
...@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) { ...@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
nodes.emplace_back(std::move(node)); nodes.emplace_back(std::move(node));
} }
// Insert
for (auto& node : nodes) { for (auto& node : nodes) {
pool.Insert(node.get(), op.get()); pool.Insert(node.get());
}
// Has/size
ASSERT_EQ(pool.size(), shapes.size());
for (auto& node : nodes) {
ASSERT_TRUE(pool.Has(node.get()));
} }
// assert its order and interface. // assert its order and interface.
...@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) { ...@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
std::cout << pool.ToString() << std::endl; std::cout << pool.ToString() << std::endl;
ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1)); ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0);
{ {
auto v1 = block_desc->Var("11"); auto v1 = block_desc->Var("11");
v1->SetShape({-1, 256, 56, 56}); v1->SetShape({-1, 256, 56, 56});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1); std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
node1->inputs.emplace_back(op.get()); node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get()); auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(cache, nullptr); ASSERT_EQ(cache, nullptr);
} }
{ {
...@@ -81,16 +95,447 @@ TEST(OrderedNodeList, Normal) { ...@@ -81,16 +95,447 @@ TEST(OrderedNodeList, Normal) {
v2->SetShape({-1, 2, 5}); v2->SetShape({-1, 2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2); std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
node1->inputs.emplace_back(op.get()); node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get()); auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5]
} }
{ {
auto v3 = block_desc->Var("13"); auto v3 = block_desc->Var("13");
v3->SetShape({2, 5}); v3->SetShape({2, 5});
std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3); std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
node1->inputs.emplace_back(op.get()); node1->inputs.emplace_back(op.get());
auto* cache = pool.NodeMatch(node1.get()); auto* cache = pool.FindBestFitNode(node1.get());
ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2]
}
}
TEST(OrderedSet, FindBestFitNode) {
OrderedSet pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
ProgramDesc prog;
BlockDesc* block_desc = prog.MutableBlock(0);
auto* op_desc = block_desc->AppendOp();
op_desc->SetType("dummy");
std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
{
auto desc = block_desc->Var("a");
desc->SetShape({128, 128});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
{
auto desc = block_desc->Var("b");
desc->SetShape({128, 129});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
{
auto desc = block_desc->Var("c");
desc->SetShape({128, 128});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FindNextBestFitNode
auto* n = nodes[0].get();
auto* cache = pool.FindBestFitNode(n);
PADDLE_ENFORCE(cache->Name() == "a");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "c");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "b");
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
paddle::framework::AssignOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace paddle {
namespace framework {
namespace details {
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"c"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"d"});
op->SetOutput("Out", {"e"});
}
return prog;
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
// test assign op
ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
// test sum op
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
}
// 1. normal test
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->outputs.back()->Name() == "e") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
graph.RemoveNode(found_node);
graph.RemoveNode(e);
// other node keeps the same order
auto remain_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < remain_nodes.size(); ++i) {
auto node = remain_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
op_descs.insert(op_descs.begin() + 4, op);
auto nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
if (node->Name() == "sum") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* c = find_node_in_graph("c");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(c->outputs.begin(), c->outputs.end(), found_node);
ir::Node* pending_op = found_node->outputs[0]->outputs[0];
graph.RemoveNode(e);
graph.RemoveNode(pending_op);
graph.RemoveNode(found_node);
}
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.insert(op_descs.begin() + 2, op);
// check the order
auto mynodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < mynodes.size(); ++i) {
auto node = mynodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.emplace_back(op);
// replace op_desc inplace
auto nodes = graph.Nodes();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->Op() && node->Name() == "assign") {
if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
found_node = node;
break;
}
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(e->inputs.begin(), e->inputs.end(), found_node);
graph.RemoveNode(found_node);
}
op_descs.erase(op_descs.begin() + 3);
auto replace_op = prog.MutableBlock(0)->AppendOp();
replace_op->SetType("sum");
replace_op->SetInput("X", {"d", "d1"});
replace_op->SetOutput("Out", {"e"});
{
ir::Node* sum2 = graph.CreateOpNode(replace_op);
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
ir::Node* d1 = find_node_in_graph("d1");
sum2->inputs.emplace_back(d);
sum2->inputs.emplace_back(d1);
sum2->outputs.emplace_back(e);
e->inputs.emplace_back(sum2);
d->outputs.emplace_back(sum2);
d1->outputs.emplace_back(sum2);
}
op_descs.emplace_back(replace_op);
// compare op order
auto graph_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < graph_nodes.size(); ++i) {
auto node = graph_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
} }
} }
......
...@@ -43,16 +43,10 @@ namespace paddle { ...@@ -43,16 +43,10 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
auto nodes = graph->Nodes(); auto nodes = graph->Nodes();
auto subblock_vars = GetSubBlockVars(nodes); CollectSkipVarsSet(nodes);
skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
cfg_.reset(new details::ControlFlowGraph(*graph)); cfg_.reset(new details::ControlFlowGraph(*graph));
cfg_->LiveVariableAnalysis(); cfg_->LiveVariableAnalysis();
...@@ -75,82 +69,67 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -75,82 +69,67 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
} }
for (auto& var : op->outputs) { for (auto& var : op->outputs) {
if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
skip_set_.count(var->Name())) VLOG(3) << "Skip set contains variable of " << var->Name()
<< "disable reuse on it. skipped";
continue; continue;
ir::Node* cache = pool_.NodeMatch(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
} }
if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
ir::Node* cache = pool_.FindBestFitNode(var);
while (cache != nullptr && var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused. "
<< cache->Name() << " is re-filled to the pool after "
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
cache = pool_.FindNextBestFitNode(var, cache);
}
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (cache == nullptr) continue; if (cache != nullptr) {
if (var->Name() == cache->Name()) { int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << "The same cache variable is cascade reused." << var->Name() VLOG(3) << string::Sprintf(
<< " is re-filled to the pool after" "!!! %s, %s => %s, cache idx %d, pool size %d",
<< "the reused op is finished. Current op can not " std::to_string(reuse_id++), DebugString(var), DebugString(cache),
<< "replace it again. Skip this candidate."; node_idx_in_pool, static_cast<int>(pool_.size()));
continue; // NOTE(dzhwinter): update the ProgramDesc/IR Graph
// and the CFG Graph on the fly.
int node_idx_in_pool = pool_.GetIndex(cache); //
VLOG(3) << string::Sprintf( // IR Graph define the dependence relationship between nodes.
"!!! %s, %s => %s, cache idx %d, pool size %d", //
std::to_string(reuse_id++), DebugString(var), DebugString(cache), // ProgramDesc defines the input/output vars. Its used in
node_idx_in_pool, static_cast<int>(pool_.size())); // CreateOp, CreateVar when running happens.
// update CFG Graph on the fly. //
// reused var maybe re-fill into the pool // CFG Graph store the liveness information, when reuse happens
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); // we also need to update the variable liveness.
// NOTE(dzhwinter): we need to both update the ProgramDesc const std::string var_name = var->Name();
// and IR Graph. because op_desc/var_desc is used in CreateOp, const std::string cache_name = cache->Name();
// CreateVar when running happens. But IR Graph
// define the dependence relationship between nodes. cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
RenameVarInGraphDesc(var->Name(), cache->Name(), idx); RenameVarInGraphDesc(var_name, cache_name, idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
pool_.Erase(cache_name);
pool_.Erase(cache);
}
// fill the pool
std::unordered_set<std::string> unlived_vars;
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
unlived_vars.emplace(var);
} }
} }
for (auto var : unlived_vars) { }
ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); // fill the pool
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) { if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node, op); pool_.Insert(var_node);
} }
} }
} }
} }
graph->ResolveHazard(var_nodes_); graph->ResolveHazard(var_nodes_);
// For early delete pass. use GraphNodePool load the unlived vars.
// 1. find all deps op for each unlived var in memory pool.
for (auto& op : graph->Nodes()) {
for (auto& var : op->inputs) {
if (pool_.Has(var)) {
pool_.Insert(var, op);
}
}
}
// 2. convert ir node based memory pool to graph node
// because Node* maybe released bettwen passes.
auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
for (auto it = pool_.begin(); it != pool_.end(); ++it) {
std::unordered_set<OpDesc*> descs;
for (auto& op : it->second) {
PADDLE_ENFORCE(op->IsOp());
descs.insert(op->Op());
}
graph_pool.push_back(std::make_pair(it->first->Name(), descs));
}
return graph; return graph;
} }
...@@ -199,12 +178,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { ...@@ -199,12 +178,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
PADDLE_ENFORCE(sub_op != nullptr); PADDLE_ENFORCE(sub_op != nullptr);
for (auto* var : sub_op->outputs) { for (auto* var : sub_op->outputs) {
if (NodeCanReused(var)) { if (NodeCanReused(var)) {
ir::Node* cache = pool_.NodeMatch(var); ir::Node* cache = pool_.FindBestFitNode(var);
if (cache != nullptr) { if (cache != nullptr) {
if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
continue; continue;
} }
int node_idx_in_pool = pool_.GetIndex(cache); int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << string::Sprintf( VLOG(3) << string::Sprintf(
"!!! %s, %s => %s, cache idx %d, pool size %d", "!!! %s, %s => %s, cache idx %d, pool size %d",
std::to_string(sub_reuse_id++), DebugString(var), std::to_string(sub_reuse_id++), DebugString(var),
...@@ -224,20 +203,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { ...@@ -224,20 +203,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
} }
} }
std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars( void MemoryOptimizePass::CollectSkipVarsSet(
const std::unordered_set<ir::Node*>& nodes) const { const std::unordered_set<ir::Node*>& nodes) const {
std::unordered_set<std::string> vars; auto update_skip_set = [&](OpDesc* op_desc) {
auto inputs = op_desc->InputArgumentNames();
auto outputs = op_desc->OutputArgumentNames();
skip_set_.insert(inputs.begin(), inputs.end());
skip_set_.insert(outputs.begin(), outputs.end());
};
for (auto& op : nodes) { for (auto& op : nodes) {
if (!op->IsOp() || op->Op() == nullptr) continue; if (!op->IsOp() || op->Op() == nullptr) continue;
auto* op_desc = op->Op(); auto* op_desc = op->Op();
if (OpHasSubBlock(op_desc)) { // NOTE(dzhwinter):
auto inputs = op_desc->InputArgumentNames(); // current block can not reuse next level block vars.
auto outputs = op_desc->OutputArgumentNames(); if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
vars.insert(inputs.begin(), inputs.end()); // NOTE(dzhwinter):
vars.insert(outputs.begin(), outputs.end()); // distributed ops input/output name need to
} // keep same bettwen trainer/pserver
if (op_desc->Type() == "send") update_skip_set(op_desc);
if (op_desc->Type() == "recv") update_skip_set(op_desc);
if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
} }
return vars;
} }
void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
...@@ -291,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -291,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
// redirect the input to the latest version of cache_var // redirect the input to the latest version of cache_var
for (auto* node : op->inputs) { for (auto* node : op->inputs) {
if (node->Name() == var) { if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); ir::Node* cache_node = var_nodes_[cache_var].back();
var_nodes_[cache_var].emplace_back(cache_node);
// swap node to cache_node // swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(), cache_node->outputs.insert(cache_node->outputs.end(),
...@@ -301,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -301,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
auto* prev_op = node->inputs[0]; auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node); cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) { for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node); cache_node);
} }
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
} }
} }
...@@ -325,271 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -325,271 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node); cache_node);
} }
}
}
}
// release node of unused var in graph
for (auto* node : var_nodes_[var]) {
graph->RemoveNode(node);
}
var_nodes_.at(var).clear();
}
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs),
"Graph has no attribute of kAllOpDescs.");
// 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
// 2. topology sort order
auto nodes = graph.Nodes();
std::deque<ir::Node*> ops;
FilterVariables(nodes, [&](ir::Node* op) {
if (op->IsOp() && op->Op() != nullptr) {
ops.emplace_back(op);
}
});
std::unordered_map<ir::Node*, size_t> op_deps;
std::list<ir::Node*> ready_ops;
std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
for (auto* op : ops) {
std::unordered_set<ir::Node*> preceding_op;
for (auto* in : op->inputs) {
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
preceding_op.emplace(in->inputs[0]);
pending_ops[in->inputs[0]].emplace(op);
}
op_deps[op] = preceding_op.size();
if (preceding_op.empty()) {
ready_ops.emplace_back(op);
}
}
// 3. generated op list based desc order and the topology order
std::vector<ir::Node*> ret;
std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
auto update_by_found_node = [&](ir::Node* found_node) {
for (auto* pending_op : pending_ops[found_node]) {
if (--op_deps[pending_op] == 0) {
ready_ops.emplace_back(pending_op);
}
}
ready_ops.remove(found_node);
ret.emplace_back(found_node);
};
while (!ready_ops.empty()) {
bool all_of_ready_op_unmatched = true;
for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
auto op_desc = *it;
ir::Node* found_node = nullptr;
for (auto* op : ready_ops) {
if (IsSameDesc(op->Op(), op_desc)) {
found_node = op;
break;
}
}
// 3.1 op desc deleted by other pass
if (found_node == nullptr) {
++it;
continue;
} else {
all_of_ready_op_unmatched = false;
it = op_descs_list.erase(it);
}
update_by_found_node(found_node);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std::list<ir::Node*> prev_ready_ops(ready_ops);
if (all_of_ready_op_unmatched) {
for (auto op : prev_ready_ops) {
update_by_found_node(op);
}
}
}
PADDLE_ENFORCE(std::all_of(
op_deps.begin(), op_deps.end(),
[&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
return ret;
}
ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
ops_ = SortOpLikeDescOrder(graph);
ConnectNodes();
}
void ControlFlowGraph::BuildCFGGraph() {
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for (ir::Node* op : ops_) {
for (auto& input_var : op->inputs) {
if (!input_var->inputs.empty()) {
PADDLE_ENFORCE(
input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
auto* pred_op = input_var->inputs[0];
if (pred_op->Op() != nullptr) {
predecessors_[op].insert(pred_op);
successors_[pred_op].insert(op);
}
}
if (input_var->IsVar() && !input_var->IsCtrlVar()) {
uses_[op].insert(input_var->Name());
}
}
for (auto& output_var : op->outputs) {
// output var may be used by many op
for (auto* succ_op : output_var->outputs) {
if (succ_op->Op() != nullptr) {
successors_[op].insert(succ_op);
predecessors_[succ_op].insert(op);
}
}
if (output_var->IsVar() && !output_var->IsCtrlVar()) {
defs_[op].insert(output_var->Name());
}
}
}
}
void ControlFlowGraph::ConnectNodes() {
for (size_t i = 0; i < ops_.size(); ++i) {
auto& op = ops_[i];
try {
auto& next_op = ops_.at(i + 1);
successors_[op].insert(next_op);
predecessors_[next_op].insert(op);
} catch (...) {
// do nothing
}
FilterVariables(op->inputs,
[&](ir::Node* var) { uses_[op].emplace(var->Name()); });
FilterVariables(op->outputs,
[&](ir::Node* var) { defs_[op].emplace(var->Name()); });
}
}
void ControlFlowGraph::LiveVariableAnalysis() {
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
while (!work_list.empty()) {
ir::Node* op = work_list.front();
work_list.pop_front();
// get the live_in calculated before. Empty if first.
auto prev_live_in = std::move(live_in_[op]);
for (auto& s : successors_[op]) {
for (auto& var : live_in_[s]) {
live_out_[op].insert(var);
}
}
for (auto& var : uses_[op]) {
live_in_[op].insert(var);
}
for (auto& var : live_out_[op]) {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
live_in_[op].erase(var);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if (live_in_[op] != prev_live_in) {
for (auto& pre : predecessors_[op]) {
work_list.push_back(pre);
}
}
}
}
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node,
int begin_idx) {
// update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i];
if (uses_[op].find(old_node) != uses_[op].end()) {
uses_[op].erase(old_node);
uses_[op].insert(new_node);
}
if (defs_[op].find(old_node) != defs_[op].end()) {
defs_[op].erase(old_node);
defs_[op].insert(new_node);
}
if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node);
live_in_[op].insert(new_node);
}
if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node);
live_out_[op].insert(new_node);
}
}
}
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op);
PADDLE_ENFORCE(
it != live_in_.end(),
string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op);
PADDLE_ENFORCE(
it != live_out_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op);
PADDLE_ENFORCE(
it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
return it->second;
}
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, // erase unused node
ir::Node* op) const { auto& nodes = var_nodes_.at(var);
// in ssa-graph, different version nodes have same name, nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
// this function get the latest version var before target op graph->RemoveNode(node);
// It may return nullptr, such as data node.
ir::Node* found_node = nullptr;
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
if (output->Name() == name) {
found_node = output;
} }
} }
} }
return found_node;
} }
} // namespace details } // namespace details
...@@ -598,5 +330,4 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, ...@@ -598,5 +330,4 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
REGISTER_PASS(memory_optimize_pass, REGISTER_PASS(memory_optimize_pass,
paddle::framework::details::MemoryOptimizePass) paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs); .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
...@@ -32,20 +32,15 @@ ...@@ -32,20 +32,15 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
class ControlFlowGraph;
class MemoryOptimizePass : public ir::Pass { class MemoryOptimizePass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override; std::unique_ptr<ir::Graph> graph) const override;
private:
// fill the variable map(var_nodes) by version. // fill the variable map(var_nodes) by version.
void InitSSAGraphNodes() const; void InitSSAGraphNodes() const;
private:
// update program descs // update program descs
void RenameVarInGraphDesc(const std::string& var, void RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var, size_t idx) const; const std::string& cache_var, size_t idx) const;
...@@ -55,13 +50,14 @@ class MemoryOptimizePass : public ir::Pass { ...@@ -55,13 +50,14 @@ class MemoryOptimizePass : public ir::Pass {
ir::Graph* graph) const; ir::Graph* graph) const;
void SubGraphOptimize(OpDesc* op_desc) const; void SubGraphOptimize(OpDesc* op_desc) const;
// scan subblock and collect the output/input variables. // 1. scan op with subblock and collect the output/input vars.
std::unordered_set<std::string> GetSubBlockVars( // while, while_grad, conditional_block
const std::unordered_set<ir::Node*>&) const; // 2. scan distributed ops and collect the output/input vars
void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
private: private:
// Reuse Node Pool, Owned. // Reuse Node Pool, Owned.
mutable OrderedNodeList pool_; mutable OrderedSet pool_;
// controlflow Graph // controlflow Graph
mutable std::unique_ptr<ControlFlowGraph> cfg_; mutable std::unique_ptr<ControlFlowGraph> cfg_;
// skip set // skip set
...@@ -70,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass { ...@@ -70,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_; mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
}; };
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// For IR Graph in parallelexecutor
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successors ops use the output variables.
NodeListMap successors_;
// predecessors ops generated input variables.
NodeListMap predecessors_;
// variables lived before run current op.
VarSetMap live_in_;
// variables lived after run current op.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
paddle::framework::AssignOpMaker,
paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
paddle::framework::SumOpMaker,
paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace paddle {
namespace framework {
namespace details {
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
inline static ProgramDesc FillProgramDesc() {
ProgramDesc prog;
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"c"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d"});
}
{
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("assign");
op->SetInput("X", {"d"});
op->SetOutput("Out", {"e"});
}
return prog;
}
TEST(CFGGraph, IRGraph) {
// prepare ir graph
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis();
// test assign op
ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
// test sum op
ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
// test assign op
ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
}
// 1. normal test
TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps();
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->outputs.back()->Name() == "e") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
graph.RemoveNode(found_node);
graph.RemoveNode(e);
// other node keeps the same order
auto remain_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < remain_nodes.size(); ++i) {
auto node = remain_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
op_descs.insert(op_descs.begin() + 4, op);
auto nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < nodes.size(); ++i) {
auto node = nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
// remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr;
auto nodes = graph.Nodes();
for (auto node : nodes) {
if (node->Name() == "sum") {
found_node = node;
break;
}
}
PADDLE_ENFORCE(found_node != nullptr);
for (auto it = op_descs.begin(); it != op_descs.end();) {
if (IsSameDesc(*it, found_node->Op())) {
it = op_descs.erase(it);
} else {
++it;
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* c = find_node_in_graph("c");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(c->outputs.begin(), c->outputs.end(), found_node);
ir::Node* pending_op = found_node->outputs[0]->outputs[0];
graph.RemoveNode(e);
graph.RemoveNode(pending_op);
graph.RemoveNode(found_node);
}
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.insert(op_descs.begin() + 2, op);
// check the order
auto mynodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < mynodes.size(); ++i) {
auto node = mynodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc();
ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr;
for (auto n : graph.Nodes()) {
if (n->Name() == s) {
ret = n;
break;
}
}
PADDLE_ENFORCE(ret != nullptr);
return ret;
};
auto op_descs = prog.Block(0).AllOps();
// add node
auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
op->SetType("sum");
op->SetInput("X", {"b", "c"});
op->SetOutput("Out", {"d1"});
{
ir::Node* node = graph.CreateOpNode(op);
ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
ir::Node* b = find_node_in_graph("b");
ir::Node* c = find_node_in_graph("c");
node->outputs.emplace_back(d1);
node->inputs.emplace_back(b);
node->inputs.emplace_back(c);
d1->inputs.emplace_back(node);
b->outputs.emplace_back(node);
c->outputs.emplace_back(node);
}
op_descs.emplace_back(op);
// replace op_desc inplace
auto nodes = graph.Nodes();
ir::Node* found_node = nullptr;
for (auto node : nodes) {
if (node->IsOp() && node->Op() && node->Name() == "assign") {
if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
found_node = node;
break;
}
}
}
{
ir::Node* d = find_node_in_graph("d");
ir::Node* e = find_node_in_graph("e");
std::remove(d->outputs.begin(), d->outputs.end(), found_node);
std::remove(e->inputs.begin(), e->inputs.end(), found_node);
graph.RemoveNode(found_node);
}
op_descs.erase(op_descs.begin() + 3);
auto replace_op = prog.MutableBlock(0)->AppendOp();
replace_op->SetType("sum");
replace_op->SetInput("X", {"d", "d1"});
replace_op->SetOutput("Out", {"e"});
{
ir::Node* sum2 = graph.CreateOpNode(replace_op);
ir::Node* e = find_node_in_graph("e");
ir::Node* d = find_node_in_graph("d");
ir::Node* d1 = find_node_in_graph("d1");
sum2->inputs.emplace_back(d);
sum2->inputs.emplace_back(d1);
sum2->outputs.emplace_back(e);
e->inputs.emplace_back(sum2);
d->outputs.emplace_back(sum2);
d1->outputs.emplace_back(sum2);
}
op_descs.emplace_back(replace_op);
// compare op order
auto graph_nodes = SortOpLikeDescOrder(graph);
for (size_t i = 0; i < graph_nodes.size(); ++i) {
auto node = graph_nodes[i];
auto op_desc = op_descs[i];
ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
}
}
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle { namespace paddle {
......
...@@ -21,8 +21,6 @@ namespace paddle { ...@@ -21,8 +21,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
class SequentialExecutionPass : public ir::Pass { class SequentialExecutionPass : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
......
...@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference { ...@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const { bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
return in.Name() != out.Name() && details::NodeCanReused(in) && return in.Name() != out.Name() && details::NodeCanReused(in) &&
details::NodeCanReused(out) && details::NodeCanReused(out) &&
details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in); details::NodeSize(out) <= details::NodeSize(in);
} }
}; };
......
...@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) { ...@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
op->SetOutput("Out", {"test2_out"}); op->SetOutput("Out", {"test2_out"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64}); prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out"); prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) { ...@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out"); prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) { ...@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0"); prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -267,15 +267,16 @@ TEST(InferInplace, MultiGradInplaceInToOut) { ...@@ -267,15 +267,16 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0"); prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul); EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = { std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
......
...@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl( ...@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
->assert_is_op("scale") ->assert_is_op("scale")
->assert_op_attr<float>("scale", 1.) ->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.); ->assert_op_attr<float>("bias", 0.);
auto scale_out = detector.mutable_pattern() auto scale_out =
->NewNode("scale_out") detector.mutable_pattern()
->assert_is_op_output("scale"); ->NewNode("scale_out")
->assert_is_op_output("scale")
// scale's output var should has only one consumer, or it can't be
// removed.
->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
pre_op->LinksTo({scale_in}); pre_op->LinksTo({scale_in});
scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
......
...@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase { ...@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
std::unordered_set<const Node*> invalid_nodes; std::unordered_set<const Node*> invalid_nodes;
int valid_op = 0; int valid_op = 0;
for (auto* node : graph->Nodes()) { for (auto* node : graph->Nodes()) {
PADDLE_ENFORCE_NOT_NULL(node);
if (is_valid_node(node)) { if (is_valid_node(node)) {
invalid_nodes.insert(node); invalid_nodes.insert(node);
} else if (node->IsOp()) { } else if (node->IsOp()) {
......
...@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts( ...@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
graph = eager_deletion_pass->Apply(std::move(graph)); graph = eager_deletion_pass->Apply(std::move(graph));
VLOG(10) << "EagerDeletionPass Applied"; VLOG(10) << "EagerDeletionPass Applied";
if (build_strategy_.memory_early_delete_) {
auto early_delete_pass =
ir::PassRegistry::Instance().Get("memory_early_delete_pass");
early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
graph = early_delete_pass->Apply(std::move(graph));
}
VLOG(10) << "MemoryEarlyDeletePass Applied.";
} }
return graph; return graph;
...@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor( ...@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
graphs.push_back(std::move(graph)); graphs.push_back(std::move(graph));
#endif #endif
auto max_memory_size = GetEagerDeletionThreshold(); auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) { if (max_memory_size >= 0) {
for (size_t i = 0; i < graphs.size(); ++i) { for (size_t i = 0; i < graphs.size(); ++i) {
graphs[i] = member_->PrepareGCAndRefCnts( graphs[i] = member_->PrepareGCAndRefCnts(
...@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
USE_PASS(memory_early_delete_pass);
USE_PASS(reference_count_pass); USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass); USE_PASS(eager_deletion_pass);
...@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { ...@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
return node.inputs.size() == n; return node.inputs.size() == n;
} }
NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
// CHECK all the inputs' in-degree is 0
for (auto *node : source) {
PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
if (Agent(p).deleted()) {
visited.insert(p);
to_visit.erase(p);
}
inlink_visited.clear();
std::copy_if(p->inputs.begin(), p->inputs.end(),
std::back_inserter(inlink_visited),
[&](Node *x) -> bool { return visited.count(x) != 0; });
if (inlink_visited.size() == p->inputs.size()) {
sorted_.push_back(p);
for (auto *_ : p->outputs) {
if (!visited.count(_)) {
to_visit.insert(_);
}
}
to_visit.erase(p);
visited.insert(p);
}
}
}
}
NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
Node &NodesTSIterator::operator*() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return *sorted_[cursor_];
}
NodesTSIterator &NodesTSIterator::operator++() {
if (++cursor_ >= sorted_.size()) {
sorted_.clear();
cursor_ = 0;
}
return *this;
}
NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
bool NodesTSIterator::operator==(const NodesTSIterator &other) {
return sorted_ == other.sorted_ && cursor_ == other.cursor_;
}
Node *NodesTSIterator::operator->() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return sorted_[cursor_];
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -30,6 +30,7 @@ namespace inference { ...@@ -30,6 +30,7 @@ namespace inference {
namespace analysis { namespace analysis {
using framework::ir::Graph; using framework::ir::Graph;
using framework::ir::NodesTSIterator;
const char kIsFunctionNode[] = "__is_function_node__"; const char kIsFunctionNode[] = "__is_function_node__";
const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__"; const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
...@@ -132,32 +133,6 @@ struct Agent { ...@@ -132,32 +133,6 @@ struct Agent {
framework::ir::Node *x_; framework::ir::Node *x_;
}; };
// Topological sorting iterator on nodes.
struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
NodesTSIterator() = default;
explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0;
}
NodesTSIterator(const NodesTSIterator &other);
framework::ir::Node &operator*();
NodesTSIterator &operator++();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesTSIterator &operator=(const NodesTSIterator &other);
bool operator==(const NodesTSIterator &other);
bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
framework::ir::Node *operator->();
private:
std::vector<framework::ir::Node *> sorted_;
size_t cursor_{0};
};
// The nodes those have no input will be treated as start points. // The nodes those have no input will be treated as start points.
static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) { static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
std::vector<framework::ir::Node *> result; std::vector<framework::ir::Node *> result;
......
...@@ -16,6 +16,12 @@ ...@@ -16,6 +16,12 @@
/*! \file paddle_api.h /*! \file paddle_api.h
*/ */
/*! \mainpage Paddle Inference APIs
* \section intro_sec Introduction
* The Paddle inference library aims to offer an high performance inference SDK
* for Paddle users.
*/
#include <cassert> #include <cassert>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -34,26 +40,49 @@ enum PaddleDType { ...@@ -34,26 +40,49 @@ enum PaddleDType {
}; };
/** /**
*\brief Memory menager for PaddleTensor. * \brief Memory manager for `PaddleTensor`.
* *
*The PaddleBuf holds a buffer for data input or output. The memory can be * The PaddleBuf holds a buffer for data input or output. The memory can be
*allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
*should be reused for better performance. * should be reused for better performance.
* *
*For user allocated memory, the following API can be used: * For user allocated memory, the following API can be used:
*- PaddleBuf(void* data, size_t length) to set an external memory by * - PaddleBuf(void* data, size_t length) to set an external memory by
*specifying * specifying the memory address and length.
* the memory address and length. * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
*- Reset(void* data, size_t length) to reset the PaddleBuf with an external
*memory. *memory.
*ATTENTION, for user allocated memory, deallocation should be done by users * ATTENTION, for user allocated memory, deallocation should be done by users
*externally after the program finished. The PaddleBuf won't do any allocation *externally after the program finished. The PaddleBuf won't do any allocation
*or deallocation. *or deallocation.
* *
*To have the PaddleBuf allocate and manage the memory: * To have the PaddleBuf allocate and manage the memory:
*- PaddleBuf(size_t length) will allocate a memory of size `length`. * - PaddleBuf(size_t length) will allocate a memory of size `length`.
*- Resize(size_t length) resize the memory to no less than `length`, ATTENTION * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done. * if the allocated memory is larger than `length`, nothing will done.
*
* Usage:
*
* Let PaddleBuf manage the memory internally.
* \code{cpp}
* const int num_elements = 128;
* PaddleBuf buf(num_elements * sizeof(float));
* \endcode
*
* Or
* \code{cpp}
* PaddleBuf buf;
* buf.Resize(num_elements * sizeof(float));
* \endcode
* Works the exactly the same.
*
* One can also make the `PaddleBuf` use the external memory.
* \code{cpp}
* PaddleBuf buf;
* void* external_memory = new float[num_elements];
* buf.Reset(external_memory, num_elements*sizeof(float));
* ...
* delete[] external_memory; // manage the memory lifetime outside.
* \endcode
*/ */
class PaddleBuf { class PaddleBuf {
public: public:
...@@ -78,7 +107,7 @@ class PaddleBuf { ...@@ -78,7 +107,7 @@ class PaddleBuf {
/** Tell whether the buffer is empty. /** Tell whether the buffer is empty.
*/ */
bool empty() const { return length_ == 0; } bool empty() const { return length_ == 0; }
/** Get the memory address. /** Get the data's memory address.
*/ */
void* data() const { return data_; } void* data() const { return data_; }
/** Get the memory length. /** Get the memory length.
...@@ -110,7 +139,8 @@ struct PaddleTensor { ...@@ -110,7 +139,8 @@ struct PaddleTensor {
}; };
enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
/** Tensor without copy, currently only supports AnalysisPredictor.
/** Tensor without copy, currently only supports `AnalysisPredictor`.
*/ */
class ZeroCopyTensor { class ZeroCopyTensor {
public: public:
...@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
* *
* Usage: * Usage:
* *
* \code{.cpp}
* NativeConfig config; * NativeConfig config;
* ... // change the configs. * ... // change the configs.
* auto native_predictor = CreatePaddlePredictor(config); * auto native_predictor = CreatePaddlePredictor(config);
* \endcode
* *
* FOR EXTENSION DEVELOPER: * FOR EXTENSION DEVELOPER:
* Different predictors are designated by config type. Similar configs can be * Different predictors are designated by config type. Similar configs can be
......
...@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() { ...@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet"; LOG(ERROR) << "GPU not support MKLDNN yet";
} }
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"infer_clean_graph_pass", //
"identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif
});
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
analysis_passes_.push_back(pass); analysis_passes_.push_back(pass);
} }
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
"repeated_fc_relu_fuse_pass", //
"squared_mat_sub_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"identity_scale_op_clean_pass", //
});
use_gpu_ = false;
}
} // namespace paddle } // namespace paddle
...@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder { ...@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
*/ */
class CpuPassStrategy : public PassStrategy { class CpuPassStrategy : public PassStrategy {
public: public:
CpuPassStrategy() : PassStrategy({}) { CpuPassStrategy();
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
"repeated_fc_relu_fuse_pass", //
"squared_mat_sub_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"identity_scale_op_clean_pass", //
});
use_gpu_ = false;
}
explicit CpuPassStrategy(const CpuPassStrategy &other) explicit CpuPassStrategy(const CpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {} : PassStrategy(other.AllPasses()) {}
...@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy { ...@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
*/ */
class GpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy {
public: public:
GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy();
passes_.assign({
"infer_clean_graph_pass", //
"identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#endif
});
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true;
}
explicit GpuPassStrategy(const GpuPassStrategy &other) explicit GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) { : PassStrategy(other.AllPasses()) {
......
...@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
comment.type)); comment.type));
AddInput("Y", string::Sprintf("the right hand operand of %s operator", AddInput("Y", string::Sprintf("the right hand operand of %s operator",
comment.type)); comment.type));
AddAttr<int>(
"axis",
"The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1)
.EqualGreaterThan(-1);
AddAttr<bool>("force_cpu", AddAttr<bool>("force_cpu",
"Force fill output variable to cpu " "Force fill output variable to cpu "
"memory. Otherwise, fill output variable to the running " "memory. Otherwise, fill output variable to the running "
...@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is ...@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is
calculated by $%s$ calculated by $%s$
)DOC", )DOC",
comment.equation)); comment.equation));
AddAttr<int>(
"axis",
"The start dimension index for broadcasting Y onto X. [default -1]")
.SetDefault(-1)
.EqualGreaterThan(-1);
} }
}; };
......
...@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for #pragma omp parallel for
#endif #endif
for (int i = 0; i < fixed_ratios.size(); i++) { for (size_t i = 0; i < fixed_ratios.size(); i++) {
sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i])); sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
} }
...@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
} }
} }
if (clip) { if (clip) {
platform::Transform<platform::CPUDeviceContext> trans; T* dt = boxes->data<T>();
ClipFunctor<T> clip_func; std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
trans(ctx.template device_context<platform::CPUDeviceContext>(), return std::min<T>(std::max<T>(v, 0.), 1.);
boxes->data<T>(), boxes->data<T>() + boxes->numel(), });
boxes->data<T>(), clip_func);
} }
framework::Tensor var_t; framework::Tensor var_t;
var_t.mutable_data<T>( var_t.mutable_data<T>(
...@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif #endif
for (int i = 0; i < box_num; ++i) { for (int i = 0; i < box_num; ++i) {
for (int j = 0; j < variances.size(); ++j) { for (size_t j = 0; j < variances.size(); ++j) {
e_vars(i, j) = variances[j]; e_vars(i, j) = variances[j];
} }
} }
......
...@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior, ...@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
} }
} }
template <typename T>
struct ClipFunctor {
HOSTDEVICE inline T operator()(T in) const {
return std::min<T>(std::max<T>(in, 0.), 1.);
}
};
template <typename T> template <typename T>
class PriorBoxOpKernel : public framework::OpKernel<T> { class PriorBoxOpKernel : public framework::OpKernel<T> {
public: public:
...@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
boxes->mutable_data<T>(ctx.GetPlace()); boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace()); vars->mutable_data<T>(ctx.GetPlace());
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes); T* b_t = boxes->data<T>();
for (int h = 0; h < feature_height; ++h) { for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) { for (int w = 0; w < feature_width; ++w) {
T center_x = (w + offset) * step_width; T center_x = (w + offset) * step_width;
T center_y = (h + offset) * step_height; T center_y = (h + offset) * step_height;
T box_width, box_height; T box_width, box_height;
int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) { for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s]; auto min_size = min_sizes[s];
if (min_max_aspect_ratios_order) { if (min_max_aspect_ratios_order) {
box_width = box_height = min_size / 2.; box_width = box_height = min_size / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
auto max_size = max_sizes[s]; auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize) // square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.; box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
// priors with different aspect ratios // priors with different aspect ratios
for (size_t r = 0; r < aspect_ratios.size(); ++r) { for (size_t r = 0; r < aspect_ratios.size(); ++r) {
...@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
} }
box_width = min_size * sqrt(ar) / 2.; box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.; box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
} else { } else {
// priors with different aspect ratios // priors with different aspect ratios
...@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
float ar = aspect_ratios[r]; float ar = aspect_ratios[r];
box_width = min_size * sqrt(ar) / 2.; box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.; box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
auto max_size = max_sizes[s]; auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize) // square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.; box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
} }
} }
...@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
} }
if (clip) { if (clip) {
platform::Transform<platform::CPUDeviceContext> trans; T* dt = boxes->data<T>();
ClipFunctor<T> clip_func; std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
trans(ctx.template device_context<platform::CPUDeviceContext>(), return std::min<T>(std::max<T>(v, 0.), 1.);
boxes->data<T>(), boxes->data<T>() + boxes->numel(), });
boxes->data<T>(), clip_func);
} }
framework::Tensor var_t; framework::Tensor var_t;
......
...@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
"The ignore threshold to ignore confidence loss.") "The ignore threshold to ignore confidence loss.")
.SetDefault(0.7); .SetDefault(0.7);
AddComment(R"DOC( AddComment(R"DOC(
This operator generate yolov3 loss by given predict result and ground This operator generates yolov3 loss based on given predict result and ground
truth boxes. truth boxes.
The output of previous network is in shape [N, C, H, W], while H and W The output of previous network is in shape [N, C, H, W], while H and W
should be the same, specify the grid size, each grid point predict given should be the same, H and W specify the grid size, each grid point predict
number boxes, this given number is specified by anchors, it should be given number boxes, this given number, which following will be represented as S,
half anchors length, which following will be represented as S. In the is specified by the number of anchors, In the second dimension(the channel
second dimention(the channel dimention), C should be S * (class_num + 5), dimension), C should be equal to S * (class_num + 5), class_num is the object
class_num is the box categoriy number of source dataset(such as coco), category number of source dataset(such as 80 in coco dataset), so in the
so in the second dimention, stores 4 box location coordinates x, y, w, h second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
and confidence score of the box and class one-hot key of each anchor box. also includes confidence score of the box and class one-hot key of each anchor box.
While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions
correspnd to: should be as follows:
$$ $$
b_x = \sigma(t_x) + c_x b_x = \\sigma(t_x) + c_x
b_y = \sigma(t_y) + c_y $$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w} b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h} b_h = p_h e^{t_h}
$$ $$
While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ In the equation above, :math:`c_x, c_y` is the left top corner of current grid
is specified by anchors. and :math:`p_w, p_h` is specified by anchors.
As for confidence score, it is the logistic regression value of IoU between As for confidence score, it is the logistic regression value of IoU between
anchor boxes and ground truth boxes, the score of the anchor box which has anchor boxes and ground truth boxes, the score of the anchor box which has
the max IoU should be 1, and if the anchor box has IoU bigger then ignore the max IoU should be 1, and if the anchor box has IoU bigger than ignore
thresh, the confidence score loss of this anchor box will be ignored. thresh, the confidence score loss of this anchor box will be ignored.
Therefore, the yolov3 loss consist of three major parts, box location loss, Therefore, the yolov3 loss consist of three major parts, box location loss,
...@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
In order to trade off box coordinate losses between big boxes and small In order to trade off box coordinate losses between big boxes and small
boxes, box coordinate losses will be mutiplied by scale weight, which is boxes, box coordinate losses will be mutiplied by scale weight, which is
calculated as follow. calculated as follows.
$$ $$
weight_{box} = 2.0 - t_w * t_h weight_{box} = 2.0 - t_w * t_h
$$ $$
Final loss will be represented as follow. Final loss will be represented as follows.
$$ $$
loss = (loss_{xy} + loss_{wh}) * weight_{box} loss = (loss_{xy} + loss_{wh}) * weight_{box}
......
...@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { ...@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
} }
}; };
class ElementwiseGradOpInplace : public framework::InplaceInToOut {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
std::unordered_map<std::string, std::string> ret;
if (block->HasVar(framework::GradVarName("X")) &&
block->HasVar(framework::GradVarName("Out"))) {
ret[framework::GradVarName("Out")] = framework::GradVarName("X");
}
return ret;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut { ...@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
op_type##GradMaker, \ op_type##GradMaker, \
::paddle::operators::ElementwiseOpInplace); \ ::paddle::operators::ElementwiseOpInplace); \
REGISTER_OPERATOR(op_type##_grad, \ REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad) ::paddle::operators::ElementwiseOpExplicitGrad, \
::paddle::operators::ElementwiseGradOpInplace)
...@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr; lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
} }
// lstm_value.output_value not used in bp, set to nullptr
// lstm_grad.state_active_grad not used in bp, set to nullptr
lstm_value.output_value = nullptr;
lstm_grad.state_active_grad = nullptr;
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
math::LstmUnitGradFunctor<DeviceContext, T>::compute( math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
......
...@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
} }
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
// lstmp_value.output_value not used in bp, set to null
// lstmp_grad.state_active_grad not used in bp, set to null
lstmp_value.output_value = nullptr;
lstmp_grad.state_active_grad = nullptr;
math::LstmUnitGradFunctor<DeviceContext, T>::compute( math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act); gate_act, cell_act, cand_act);
......
...@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() { ...@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
"be ignored."); // TODO(Chengduo): Add checker. "be ignored."); // TODO(Chengduo): Add checker.
// (Currently, // (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<bool>("global_pooling", AddAttr<bool>(
"(bool, default false) Whether to use the global pooling. " "global_pooling",
"If global_pooling = true, ksize and paddings will be ignored.") "(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, kernel size and paddings will be ignored.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>("strides",
"(vector<int>, default {1, 1}), strides(height, " "(vector<int>, default {1, 1}), strides(height, "
...@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() { ...@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
"paddings", "paddings",
"(vector<int>, default {0,0}), paddings(height, width) of pooling " "(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator." "operator."
"If global_pooling = true, paddings and ksize will be ignored.") "If global_pooling = true, paddings and kernel size will be ignored.")
.SetDefault({0, 0}); .SetDefault({0, 0});
AddAttr<bool>( AddAttr<bool>(
"exclusive", "exclusive",
...@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() { ...@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
.SetDefault(false); .SetDefault(false);
AddAttr<bool>( AddAttr<bool>(
"ceil_mode", "ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate " "(bool, default false) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, " "output height and width. False is the default. If it is set to False, "
"the floor function will be used.") "the floor function will be used.")
.SetDefault(false); .SetDefault(false);
...@@ -259,31 +260,40 @@ Example: ...@@ -259,31 +260,40 @@ Example:
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$ $$
For exclusive = true: For exclusive = false:
$$ $$
hstart = i * strides[0] - paddings[0] hstart = i * strides[0] - paddings[0]
$$
$$
hend = hstart + ksize[0] hend = hstart + ksize[0]
$$
$$
wstart = j * strides[1] - paddings[1] wstart = j * strides[1] - paddings[1]
$$
$$
wend = wstart + ksize[1] wend = wstart + ksize[1]
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]} Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$ $$
For exclusive = false:
For exclusive = true:
$$ $$
hstart = max(0, i * strides[0] - paddings[0]) hstart = max(0, i * strides[0] - paddings[0])
$$
$$
hend = min(H, hstart + ksize[0]) hend = min(H, hstart + ksize[0])
$$
$$
wstart = max(0, j * strides[1] - paddings[1]) wstart = max(0, j * strides[1] - paddings[1])
$$
$$
wend = min(W, wstart + ksize[1]) wend = min(W, wstart + ksize[1])
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$ $$
For adaptive = true:
$$
hstart = floor(i * H_{in} / H_{out})
hend = ceil((i + 1) * H_{in} / H_{out})
wstart = floor(j * W_{in} / W_{out})
wend = ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC"); )DOC");
} }
...@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() { ...@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
AddAttr<bool>( AddAttr<bool>(
"global_pooling", "global_pooling",
"(bool, default false) Whether to use the global pooling. " "(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, ksize and paddings wille be ignored.") "If global_pooling = true, kernel size and paddings will be ignored.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"strides", "strides",
...@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() { ...@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
.SetDefault(false); .SetDefault(false);
AddAttr<bool>( AddAttr<bool>(
"ceil_mode", "ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate " "(bool, default false) Whether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, " "output height and width. False is the default. If it is set to False, "
"the floor function will be used.") "the floor function will be used.")
.SetDefault(false); .SetDefault(false);
...@@ -392,48 +402,68 @@ Example: ...@@ -392,48 +402,68 @@ Example:
Output: Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
For ceil_mode = false: For ceil_mode = false:
$$ $$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ $$
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$
$$ H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
For ceil_mode = true: For ceil_mode = true:
$$ $$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\ D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ $$
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$
$$ H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
For exclusive = true: $$
$$ $$
dstart = i * strides[0] - paddings[0] W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
dend = dstart + ksize[0] $$
hstart = j * strides[1] - paddings[1]
hend = hstart + ksize[1]
wstart = k * strides[2] - paddings[2]
wend = wstart + ksize[2]
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
For exclusive = false: For exclusive = false:
$$ $$
dstart = max(0, i * strides[0] - paddings[0]) dstart = i * strides[0] - paddings[0]
dend = min(D, dstart + ksize[0]) $$
hstart = max(0, j * strides[1] - paddings[1]) $$
hend = min(H, hstart + ksize[1]) dend = dstart + ksize[0]
wstart = max(0, k * strides[2] - paddings[2]) $$
wend = min(W, wstart + ksize[2]) $$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} hstart = j * strides[1] - paddings[1]
$$ $$
$$
For adaptive = true: hend = hstart + ksize[1]
$$ $$
dstart = floor(i * D_{in} / D_{out}) $$
dend = ceil((i + 1) * D_{in} / D_{out}) wstart = k * strides[2] - paddings[2]
hstart = floor(j * H_{in} / H_{out}) $$
hend = ceil((j + 1) * H_{in} / H_{out}) $$
wstart = floor(k * W_{in} / W_{out}) wend = wstart + ksize[2]
wend = ceil((k + 1) * W_{in} / W_{out}) $$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} $$
$$ Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
For exclusive = true:
$$
dstart = max(0, i * strides[0] - paddings[0])
$$
$$
dend = min(D, dstart + ksize[0])
$$
$$
hend = min(H, hstart + ksize[1])
$$
$$
wstart = max(0, k * strides[2] - paddings[2])
$$
$$
wend = min(W, wstart + ksize[2])
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
)DOC"); )DOC");
} }
......
...@@ -121,7 +121,7 @@ struct RandomCropFunctor { ...@@ -121,7 +121,7 @@ struct RandomCropFunctor {
HOSTDEVICE void operator()(size_t ins_idx) { HOSTDEVICE void operator()(size_t ins_idx) {
typename Random<DeviceContext>::Engine engine(seed_); typename Random<DeviceContext>::Engine engine(seed_);
engine.discard(ins_idx * (rank_ - num_batchsize_dims_)); engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
size_t offsets[9]; size_t offsets[9] = {};
for (int i = num_batchsize_dims_; i < rank_; ++i) { for (int i = num_batchsize_dims_; i < rank_; ++i) {
typename Random<DeviceContext>::template UniformIntDist<size_t> dist( typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
0, x_dims_[i] - out_dims_[i]); 0, x_dims_[i] - out_dims_[i]);
......
...@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <thrust/device_vector.h>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
template <size_t D>
__global__ void Padding(const paddle::platform::float16* d_out,
const int* out_dims, const int* in_dims,
const int* offsets, int64_t n,
paddle::platform::float16* d_in) {
int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
if (out_idx < n) {
int64_t out_idx_tmp = out_idx;
int coords[D] = {0};
for (int i = D - 1; i >= 0; --i) {
coords[i] = out_idx_tmp % out_dims[i];
out_idx_tmp /= out_dims[i];
coords[i] += offsets[i];
}
int64_t in_idx = 0;
for (int i = 0; i < D; ++i) {
in_idx = in_idx * in_dims[i] + coords[i];
}
d_in[in_idx] = d_out[out_idx];
}
}
template <>
class SliceGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>
: public framework::OpKernel<paddle::platform::float16> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
auto out_dims = d_out->dims();
auto in_dims = d_in->dims();
int rank = out_dims.size();
std::vector<int> offsets(rank, 0);
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
for (size_t i = 0; i < starts.size(); ++i) {
if (starts[i] < 0) {
starts[i] += in_dims[axes[i]];
}
offsets[axes[i]] = std::max(starts[i], 0);
}
math::SetConstant<paddle::platform::CUDADeviceContext,
paddle::platform::float16>
set_zero;
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
int64_t numel = d_out->numel();
dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
dim3 threads(PADDLE_CUDA_NUM_THREADS);
auto stream = ctx.cuda_device_context().stream();
auto out_shape = framework::vectorize2int(out_dims);
thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
auto in_shape = framework::vectorize2int(in_dims);
thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());
switch (rank) {
case 1:
Padding<1><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 2:
Padding<2><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 3:
Padding<3><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 4:
Padding<4><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 5:
Padding<5><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
case 6:
Padding<6><<<blocks, threads, 0, stream>>>(
d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
offsets_ptr, numel, d_in->data<paddle::platform::float16>());
break;
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>, slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, double>, ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int>, ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
slice_grad, slice_grad,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
...@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>(); context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
Tensor* logit_grad = Tensor* logit_grad =
context.Output<Tensor>(framework::GradVarName("Logits")); context.Output<Tensor>(framework::GradVarName("Logits"));
logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax")); framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
context.device_context(), logit_grad);
T* logit_grad_data = logit_grad->data<T>(); T* logit_grad_data = logit_grad->data<T>();
const int batch_size = logit_grad->dims()[0]; const int batch_size = logit_grad->dims()[0];
......
...@@ -398,9 +398,9 @@ PYBIND11_MODULE(core, m) { ...@@ -398,9 +398,9 @@ PYBIND11_MODULE(core, m) {
py::arg("recursive_sequence_lengths"), R"DOC( py::arg("recursive_sequence_lengths"), R"DOC(
Set LoD of the LoDTensor according to recursive sequence length. Set LoD of the LoDTensor according to recursive sequence length.
For example, if recursive_sequence_lengths=[2, 3], meaning that For example, if recursive_sequence_lengths=[[2, 3]], meaning that
there are two sequences with length 2 and 3 respectively, the there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [0, 2, 2+3], i.e, [0, 2, 5]. corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]].
Args: Args:
recursive_sequence_lengths (List[List[int]]): sequence lengths. recursive_sequence_lengths (List[List[int]]): sequence lengths.
...@@ -803,9 +803,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -803,9 +803,11 @@ All parameter, weight, gradient are variables in Paddle.
.def(py::init<const platform::Place &>()) .def(py::init<const platform::Place &>())
.def("close", &Executor::Close) .def("close", &Executor::Close)
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars) { int block_id, bool create_local_scope, bool create_vars,
const std::vector<std::string> &fetch_vars) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars); self.Run(prog, scope, block_id, create_local_scope, create_vars,
fetch_vars);
}); });
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
......
#!/bin/bash #!/bin/bash
## purple to echo
function purple(){
echo -e "\033[35m$1\033[0m"
}
## green to echo
function green(){
echo -e "\033[32m$1\033[0m"
}
## Error to warning with blink
function bred(){
echo -e "\033[31m\033[01m\033[05m$1\033[0m"
}
## Error to warning with blink
function byellow(){
echo -e "\033[33m\033[01m\033[05m$1\033[0m"
}
## Error
function red(){
echo -e "\033[31m\033[01m$1\033[0m"
}
## warning
function yellow(){
echo -e "\033[33m\033[01m$1\033[0m"
}
path='http://paddlepaddle.org/download?url=' path='http://paddlepaddle.org/download?url='
#release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1` #release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1`
release_version=1.2.0 release_version=1.2.0
...@@ -228,36 +260,128 @@ function checkLinuxPaddleVersion(){ ...@@ -228,36 +260,128 @@ function checkLinuxPaddleVersion(){
done done
} }
function checkLinuxPip(){ function checkPythonVirtualenv(){
while true while true
do do
echo "请输入您要使用的pip目录(您可以另起终端,并使用which pip来查看):" read -p "
read -p "" pip_path 是否使用python virtualenv虚环境安装(y/n)": check_virtualenv
if [ "$pip_path" == "" -o ! -f "$pip_path" ];then case $check_virtualenv in
echo "检测结果:pip不存在,请重新输入" y)
continue echo "为您使用python虚环境安装"
fi ;;
python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'` n)
if [ "$python_version" == "27" ];then break
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"` ;;
if [[ "$uncode" == "" ]];then *)
uncode= continue
else ;;
uncode=u esac
fi
fi virtualenv_path=`which virtualenv 2>&1`
if [ "$python_version" == "" ];then if [ "$virtualenv_path" == "" ];then
echo "检测结果:pip不存在,请重新输入" $python_path -m pip install virtualenv
else if [ "$?" != '0' ];then
version_list=`echo "${python_list[@]}" | grep "$python_version" ` echo "安装虚拟环境失败,请检查本地环境"
if [ "$version_list" != "" ];then fi
echo "检测结果:找到python${python_version}版本" fi
break
else while true
echo "检测结果:找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 " do
fi read -p "请输入虚拟环境名字:" virtualenv_name
fi if [ "$virtualenv_name" == "" ];then
echo "不能为空"
continue
fi
break
done
virtualenv -p $python_path ${virtualenv_name}
if [ "$?" != 0 ];then
echo "创建虚环境失败,请检查环境"
exit 2
fi
cd ${virtualenv_name}
source ./bin/activate
if [ "$?" == 0 ];then
use_virtualenv=
python_path=`which python`
break
else
echo "创建虚环境失败,请检查环境"
exit 2
fi
done
}
function checkLinuxPython(){
python_path=`which python 2>/dev/null`
while true
do
if [ "$python_path" == '' ];then
while true
do
read -p "没有找到默认的python版本,请输入要安装的python路径:" python_path
python_path=`$python_path -V`
if [ "$python_path" != "" ];then
break
else
echo "输入路径有误,未找到pyrhon"
fi
done done
fi
python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
pip_version=`$python_path -m pip -V|awk -F '[ .]' '{print $2}'`
while true
do
read -p "
找到python版本$python_version,使用请输入y,选择其他版本请输n(y/n):" check_python
case $check_python in
n)
read -p "请指定您的python路径:" new_python_path
python_V=`$new_python_path -V 2>/dev/null`
if [ "$python_V" != "" ];then
python_path=$new_python_path
python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
pip_version=`python -m pip -V|awk -F '[ .]' '{print $2}'`
echo "您的python版本为${python_version}"
break
else
echo 输入有误,未找到python路径
fi
;;
y)
break
;;
*)
echo "输入有误,请重新输入."
continue
;;
esac
done
if [ "$pip_version" -lt 9 ];then
echo "您的pip版本小于9.0.1 请升级pip (pip install --upgrade pip)"
exit 0
fi
if [ "$python_version" == "27" ];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
if [[ "$uncode" == "" ]];then
uncode=
else
uncode=u
fi
fi
version_list=`echo "${python_list[@]}" | grep "$python_version" `
if [ "$version_list" == "" ];then
echo "找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 "
else
break
fi
done
} }
function checkLinuxAVX(){ function checkLinuxAVX(){
...@@ -287,25 +411,36 @@ function PipLinuxInstall(){ ...@@ -287,25 +411,36 @@ function PipLinuxInstall(){
wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
if [[ "$paddle_version" == "2" ]];then if [[ "$paddle_version" == "2" ]];then
if [[ "$GPU" == "gpu" ]];then if [[ "$GPU" == "gpu" ]];then
if [[ ${AVX} == "avx" ]];then if [[ ${AVX} == "avx" ]];then
rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_release wget -q $wheel_gpu_release
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else else
echo "paddlepaddle whl包下载失败" echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
else else
rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_release_novax wget -q $wheel_gpu_release_novax
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else else
echo "paddlepaddle whl包下载失败" echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
fi fi
...@@ -313,9 +448,15 @@ function PipLinuxInstall(){ ...@@ -313,9 +448,15 @@ function PipLinuxInstall(){
rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
wget -q $wheel_cpu_release wget -q $wheel_cpu_release
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else else
echo "paddlepaddle whl包下载失败" echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
fi fi
...@@ -324,18 +465,30 @@ function PipLinuxInstall(){ ...@@ -324,18 +465,30 @@ function PipLinuxInstall(){
rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_develop wget -q $wheel_gpu_develop
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else else
echo "paddlepaddle whl包下载失败" echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
else else
rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
wget -q $wheel_cpu_develop wget -q $wheel_cpu_develop
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else else
echo "paddlepaddle whl包下载失败" echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
fi fi
...@@ -575,95 +728,122 @@ gpu_list=( ...@@ -575,95 +728,122 @@ gpu_list=(
echo echo
echo "Step 5. 检测pip版本" echo "Step 5. 检测pip版本"
echo echo
checkLinuxPip checkLinuxPython
echo echo
checkLinuxAVX checkLinuxAVX
echo
echo "Step 6.是否使用Python的虚拟环境"
use_virtualenv="--user"
checkPythonVirtualenv
echo "*********************2. 开始安装*****************************" echo "*********************2. 开始安装*****************************"
PipLinuxInstall PipLinuxInstall
if [ "$check_virtualenv" == 'y' ];then
echo "虚环境创建成功,请cd 进入${virtualenv_name}, 执行 source bin/activate 进入虚环境。退出虚环境执行 deactivate命令。
更多虚环境使用方法请参考virtualenv官网:https://virtualenv.pypa.io/en/latest/"
fi
}
function clearMacPythonEnv(){
python_version=""
python_brief_version=""
python_root=""
} }
function checkMacPython2(){ function checkMacPython2(){
while true while true
do do
read -p "
=> 未能在常规路径下找到Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15)
如希望自定义Python路径,请输入路径:" python_root
echo
python_version=`$python_root --version 2>&1 1>&1` python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then if [[ $? == "0" ]];then
: if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
clearMacPythonEnv
else
check_python=`echo $python_version | grep "Python 2"`
if [[ -n "$check_python" ]];then
while true
do
echo -e " => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: "
read -p "" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
use_python="y"
break
elif [[ "$use_python" == "n" ]];then
clearMacPythonEnv
break
else
red " 输入错误,请重新输入(y/n)"
fi
done
if [[ "$use_python" == "y" ]];then
return 0
fi
else
red " 您输入Python的不是Python2"
clearMacPythonEnv
fi
fi
else else
python_version="" clearMacPythonEnv
red " => 未能在常规路径下找到可用的Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15)"
read -p " 如希望自定义Python路径,请输入路径
如果希望重新选择Python版本,请回车:" python_root
echo
if [[ "$python_root" == "" ]];then
python_V=""
clearMacPythonEnv
return 1
fi
fi fi
check_python=`echo $python_version | grep "Python 2"`
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
python_version=""
elif [ -n "$check_python" ];then
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
use_python="y"
break
elif [ "$use_python" == "n" ];then
python_root=""
break
else
echo "输入错误,请重新输入(y/n)"
fi
done
if [ "$use_python" == "y" ];then
break
fi
else
echo "您输入Python的不是Python2"
python_version=""
fi
done done
} }
function checkMacPython3(){ function checkMacPython3(){
while true while true
do do
read -p " python_version=`$python_root --version 2>&1 1>&1`
=> 未能在常规路径下找到Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载Python3 if [[ $? == "0" ]];then
如希望自定义Python路径,请输入路径:" python_root if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
python_version=`$python_root --version 2>&1 1>&1` clearMacPythonEnv
if [ $? == "0" ];then else
: check_python=`echo $python_version | grep "Python 3"`
if [[ -n "$check_python" ]];then
while true
do
echo -e " => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: "
read -p "" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
use_python="y"
break
elif [[ "$use_python" == "n" ]];then
clearMacPythonEnv
break
else
red " 输入错误,请重新输入(y/n)"
fi
done
if [[ "$use_python" == "y" ]];then
return 0
fi
else
red " 您输入Python的不是Python3"
clearMacPythonEnv
fi
fi
else else
python_version="" clearMacPythonEnv
red " => 未能在常规路径下找到可用的Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python3(注意Python版本不能低于3.5.x)"
read -p " 如希望自定义Python路径,请输入路径
如果希望重新选择Python版本,请回车:" python_root
echo
if [[ "$python_root" == "" ]];then
python_V=""
clearMacPythonEnv
return 1
fi
fi fi
check_python=`echo $python_version | grep "Python 3"`
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
python_version=""
elif [ -n "$check_python" ] ;then
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
use_python="y"
break
elif [ "$use_python" == "n" ];then
python_root=""
break
else
echo "输入错误,请重新输入(y/n)"
fi
done
if [ "$use_python" == "y" ];then
break
fi
else
echo "您输入Python的不是Python3"
python_version=""
fi
done done
} }
...@@ -672,145 +852,160 @@ function checkMacPaddleVersion(){ ...@@ -672,145 +852,160 @@ function checkMacPaddleVersion(){
do do
read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..." read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..."
echo echo
read -p " yellow " 1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本"
1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本 yellow " 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version}"
2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version} read -p " => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version
if [[ "$paddle_version" == "1" ]]||[[ "$paddle_version" == "2" ]];then
=> 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version
if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then
echo echo
echo "您选择了数字【"$paddle_version" 】" yellow " 您选择了数字【"$paddle_version" 】"
echo echo
break break
else else
paddle_version="2" paddle_version="2"
echo echo
echo "您选择了数字【2】" yellow " 您选择了数字【2】"
echo echo
break break
fi fi
done done
} }
function initCheckMacPython2(){
echo
yellow " 您选择了Python "$python_V",正在寻找符合要求的Python 2版本"
echo
python_root=`which python2.7`
if [[ "$python_root" == "" ]];then
python_root=`which python`
fi
checkMacPython2
if [[ "$?" == "1" ]];then
return 1
else
return 0
fi
}
function checkMacPythonVersion(){ function initCheckMacPython3(){
while true echo
do yellow " 您选择了Python "$python_V",正在寻找符合您要求的Python 2版本"
read -n1 -p "Step 3. 选择Python版本,请按回车键继续..." echo
read -p " python_root=`which python3`
2. 使用python 2.x checkMacPython3
3. 使用python 3.x if [[ "$?" == "1" ]];then
return 1
else
return 0
fi
}
=> 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V function checkMacPip(){
echo if [[ "$python_V" == "2" ]]||[[ "$python_V" == "3" ]];then
if [ "$python_V" == "" ];then
python_V="2" python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
if [[ ${python_brief_version} == "" ]];then
red "您输入的python:${python_root} 对应的pip不可用,请检查此pip或重新选择其他python"
echo
return 1
fi fi
echo "您选择了数字【"$python_V"】,正在寻找符合您要求的Python版本,请按回车键继续..." pip_version=`$python_root -m pip -V |awk -F '[ .]' '{print $2}'`
echo if [[ 9 -le ${pip_version} ]];then
if [ "$python_V" == "2" ];then :
python_root=`which python2.7` else
if [ "$python_root" == "" ];then red "您的pip版本过低,请安装pip 9.0.1及以上的版本"
python_root=`which python` echo
fi return 1
python_version=`$python_root --version 2>&1 1>&1` fi
if [ $? == "0" ];then if [[ "$python_brief_version" == "" ]];then
: clearMacPythonEnv
else red "您的 $python_root 对应的pip存在问题,请按ctrl + c退出后重新安装pip,或切换其他python版本"
python_version="" echo
fi return 1
if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then else
checkMacPython2 if [[ $python_brief_version == "27" ]];then
fi uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
while true if [[ $uncode == "" ]];then
do uncode="mu"
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
break
elif [ "$use_python" == "n" ];then
python_root=""
checkMacPython2
break
else else
echo "输入错误,请重新输入(y/n)" uncode="m"
fi fi
done fi
version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
elif [ "$python_V" == "3" ];then if [[ "$version_list" != "" ]];then
python_root=`which python3` return 0
python_version=`$python_root --version 2>&1 1>&1` else
if [ $? == "0" ];then red "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出"
:
else
python_version=""
fi
if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
checkMacPython3
fi
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python
echo echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'` clearMacPythonEnv
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then return 1
break fi
elif [ "$use_python" == "n" ];then
checkMacPython3
break
else
echo "输入错误,请重新输入(y/n)"
fi
done
else
:
fi
fi
fi
}
if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then function checkMacPythonVersion(){
python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'` while true
if [[ $python_brief_version == "27" ]];then do
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"` read -n1 -p "Step 3. 选择Python版本,请按回车键继续..."
if [[ $uncode == "" ]];then echo
uncode="mu" yellow " 2. 使用python 2.x"
else yellow " 3. 使用python 3.x"
uncode="m" read -p " => 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V
fi if [[ "$python_V" == "" ]];then
fi python_V="2"
version_list=`echo "${python_list[@]}" | grep "$python_brief_version" ` fi
if [ "$version_list" != "" ];then if [[ "$python_V" == "2" ]];then
break initCheckMacPython2
if [[ "$?" == "0" ]];then
checkMacPip
if [[ "$?" == "0" ]];then
return 0
else
:
fi
else else
echo "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出" :
fi fi
else elif [[ "$python_V" == "3" ]];then
echo "输入错误,请重新输入" initCheckMacPython3
fi if [[ "$?" == "0" ]];then
checkMacPip
if [[ "$?" == "0" ]];then
return 0
else
:
fi
else
:
fi
else
red "输入错误,请重新输入"
fi
done done
} }
function checkMacAVX(){ function checkMacAVX(){
read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集,请按回车键继续..." read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集,请按回车键继续..."
echo
if [[ $AVX != "" ]];then if [[ $AVX != "" ]];then
AVX="avx" AVX="avx"
echo "检测结果:支持" echo ""
green " 检测结果:支持"
echo ""
return 0
else else
read -n1 -p "检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..." red " 检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
exit echo
return 1
fi fi
echo
} }
function checkMacGPU(){ function checkMacGPU(){
read -n1 -p "Step 5. 选择CPU/GPU版本,请按回车键继续..." read -n1 -p "Step 5. 选择CPU/GPU版本,请按回车键继续..."
echo echo
if [[ $GPU != "" ]];then if [[ $GPU != "" ]];then
echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle" yellow " MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
else else
echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle" yellow " MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
GPU=cpu GPU=cpu
fi fi
echo echo
...@@ -822,38 +1017,44 @@ function macos() { ...@@ -822,38 +1017,44 @@ function macos() {
while true while true
do do
checkMacPaddleVersion checkMacPaddleVersion
checkMacPythonVersion checkMacPythonVersion
checkMacAVX checkMacAVX
checkMacGPU checkMacGPU
echo "*********************2. 开始安装*****************************" green "*********************2. 开始安装*****************************"
echo echo
read -n1 -p "即将为您下载并安装PaddlePaddle,请按回车键继续..." yellow "即将为您下载并安装PaddlePaddle,请按回车键继续..."
read -n1 -p ""
echo echo
if [[ $paddle_version == "2" ]];then if [[ $paddle_version == "2" ]];then
$python_root -m pip install paddlepaddle $python_root -m pip install paddlepaddle
if [ $? == "0" ];then if [[ $? == "0" ]];then
echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" green "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break break
else else
rm $whl_cpu_release rm $whl_cpu_release
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo"" echo""
echo "==========================================================================================" echo "=========================================================================================="
echo"" echo""
exit 1 exit 1
fi fi
else else
if [ -f $whl_cpu_develop ];then if [[ -f $whl_cpu_develop ]];then
$python_root -m pip install $whl_cpu_develop $python_root -m pip install $whl_cpu_develop
if [ $? == "0" ];then if [[ $? == "0" ]];then
rm -rf $whl_cpu_develop rm -rf $whl_cpu_develop
echo "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" # TODO add install success check here
green "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break break
else else
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo"" echo""
echo "==========================================================================================" echo "=========================================================================================="
echo"" echo""
...@@ -861,15 +1062,15 @@ function macos() { ...@@ -861,15 +1062,15 @@ function macos() {
fi fi
else else
wget ${path}$whl_cpu_develop -O $whl_cpu_develop wget ${path}$whl_cpu_develop -O $whl_cpu_develop
if [ $? == "0" ];then if [[ $? == "0" ]];then
$python_root -m pip install $whl_cpu_develop $python_root -m pip install $whl_cpu_develop
if [ $? == "0" ];then if [[ $? == "0" ]];then
rm $wheel_cpu_develop rm $wheel_cpu_develop
echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" green "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break break
else else
rm $whl_cpu_release rm $whl_cpu_release
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo"" echo""
echo "==========================================================================================" echo "=========================================================================================="
echo"" echo""
...@@ -877,7 +1078,7 @@ function macos() { ...@@ -877,7 +1078,7 @@ function macos() {
fi fi
else else
rm $whl_cpu_develop rm $whl_cpu_develop
echo "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues" red "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
echo"" echo""
echo "==========================================================================================" echo "=========================================================================================="
echo"" echo""
...@@ -890,33 +1091,35 @@ function macos() { ...@@ -890,33 +1091,35 @@ function macos() {
function main() { function main() {
echo "*********************************" echo "*********************************"
echo "欢迎使用PaddlePaddle快速安装脚本" green "欢迎使用PaddlePaddle快速安装脚本"
echo "*********************************" echo "*********************************"
echo echo
echo "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑" yellow "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑"
echo echo
echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括 1)安装前的准备和 2)开始安装 两部分" echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括"
yellow "1)安装前的准备"
yellow "2)开始安装"
echo echo
read -n1 -p "请按回车键进行下一步..." read -n1 -p "请按回车键进行下一步..."
echo echo
echo echo
echo "*********************1. 安装前的准备*****************************" green "*********************1. 安装前的准备*****************************"
echo echo
echo "Step 1. 正在检测您的操作系统信息..." echo "Step 1. 正在检测您的操作系统信息..."
echo echo
SYSTEM=`uname -s` SYSTEM=`uname -s`
if [ "$SYSTEM" == "Darwin" ];then if [[ "$SYSTEM" == "Darwin" ]];then
echo "您的系统为:MAC OSX" yellow " 您的系统为:MAC OSX"
echo echo
macos macos
else else
echo "您的系统为:Linux" yellow " 您的系统为:Linux"
echo echo
OS=`cat /etc/issue|awk 'NR==1 {print $1}'` OS=`cat /etc/issue|awk 'NR==1 {print $1}'`
if [ $OS == "\S" ] || [ "$OS" == "CentOS" ] || [ $OS == "Ubuntu" ];then if [[ $OS == "\S" ]] || [[ "$OS" == "CentOS" ]] || [[ $OS == "Ubuntu" ]];then
linux linux
else else
echo "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档" red "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档"
fi fi
fi fi
} }
......
...@@ -25,4 +25,5 @@ import paddle.reader ...@@ -25,4 +25,5 @@ import paddle.reader
import paddle.dataset import paddle.dataset
import paddle.batch import paddle.batch
import paddle.compat import paddle.compat
import paddle.distributed
batch = batch.batch batch = batch.batch
...@@ -161,7 +161,6 @@ def __bootstrap__(): ...@@ -161,7 +161,6 @@ def __bootstrap__():
'times_excess_than_required_tmp_allocation', 'times_excess_than_required_tmp_allocation',
'enable_inplace_whitelist' 'enable_inplace_whitelist'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
......
...@@ -177,7 +177,10 @@ class CompiledProgram(object): ...@@ -177,7 +177,10 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True if self._build_strategy.memory_optimize is True:
self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is True:
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
if self._build_strategy.num_trainers > 1 and trainers_endpoints: if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len( assert self._build_strategy.num_trainers == len(
......
...@@ -63,10 +63,10 @@ Notes: ...@@ -63,10 +63,10 @@ Notes:
## 4. How to reproduce the results ## 4. How to reproduce the results
* Small dataset * Small dataset
```bash ```bash
python python/paddle/fluid/contrib/tests/test_calibration.py FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
``` ```
* Full dataset * Full dataset
```bash ```bash
DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
``` ```
...@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL) ...@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
endif() endif()
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) if(src MATCHES "test_calibration")
py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach() endforeach()
...@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
def run_program(self, model_path, generate_int8=False, algo='direct'): def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224] image_shape = [3, 224, 224]
os.environ['FLAGS_use_mkldnn'] = 'True'
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
...@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
label = label.reshape([-1, 1]) label = label.reshape([-1, 1])
running_program = calibrator.sampling_program.clone( running_program = calibrator.sampling_program.clone(
) if generate_int8 else infer_program.clone() ) if generate_int8 else infer_program.clone()
for op in running_program.current_block().ops:
if op.has_attr("use_mkldnn"):
op._set_attr("use_mkldnn", True)
t1 = time.time() t1 = time.time()
_, acc1, _ = exe.run( _, acc1, _ = exe.run(
......
...@@ -261,45 +261,42 @@ def _as_lodtensor(data, place): ...@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
class Executor(object): class Executor(object):
""" """
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
ParallelExecutor. Python executor takes a program, adds feed operators and fetch operators to this program according
Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want to get after program run. Note: the executor will run all the variables(or names) that user wants to get after program runs. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list. operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished. variables. The contents in local scope may be discarded after every minibatch forward/backward
But the global scope variables will be persistent through different runs. finished. But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
Example: Example:
.. code-block:: python
# First create the Executor. .. code-block:: python
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) # First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
# Run the startup program once and only once. exe = fluid.Executor(place)
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program()) # Run the startup program once and only once.
# Not need to optimize/compile the startup program.
# Run the main program directly without compile. exe.run(fluid.default_startup_program())
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict, # Run the main program directly without compile.
fetch_list=[loss.name]) loss, = exe.run(fluid.default_main_program(),
# Or, compiled the program and run. See `CompiledProgram` for more detail. feed=feed_dict,
compiled_prog = compiler.CompiledProgram( fetch_list=[loss.name])
fluid.default_main_program()).with_data_parallel( # Or, compiled the program and run. See `CompiledProgram` for more detail.
loss_name=loss.name) compiled_prog = compiler.CompiledProgram(
loss, = exe.run(compiled_prog, fluid.default_main_program()).with_data_parallel(
feed=feed_dict, loss_name=loss.name)
fetch_list=[loss.name]) loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args: Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
""" """
def __init__(self, place): def __init__(self, place):
...@@ -382,6 +379,12 @@ class Executor(object): ...@@ -382,6 +379,12 @@ class Executor(object):
] ]
return outs return outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def close(self): def close(self):
""" """
Close this executor. Close this executor.
...@@ -389,9 +392,6 @@ class Executor(object): ...@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method. You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to For the distributed training, this method would free the resource on PServers related to
the current Trainer. the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example: Example:
>>> cpu = core.CPUPlace() >>> cpu = core.CPUPlace()
......
...@@ -555,7 +555,8 @@ class OpProtoHolder(object): ...@@ -555,7 +555,8 @@ class OpProtoHolder(object):
return { return {
core.op_proto_and_checker_maker.kOpRoleAttrName(), core.op_proto_and_checker_maker.kOpRoleAttrName(),
core.op_proto_and_checker_maker.kOpRoleVarAttrName(), core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
core.op_proto_and_checker_maker.kOpNameScopeAttrName() core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
} }
......
...@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None): ...@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
dtype=slice_var.dtype, dtype=slice_var.dtype,
persistable=True) persistable=True)
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:]) dim1_flatten = 1
if len(slice.shape) >= 2:
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
start = int(offset / dim1_flatten) start = int(offset / dim1_flatten)
end = int(offset / dim1_flatten + slice.shape[0]) end = int(offset / dim1_flatten + slice.shape[0])
...@@ -892,7 +895,7 @@ def save_inference_model(dirname, ...@@ -892,7 +895,7 @@ def save_inference_model(dirname,
True is supported. True is supported.
Returns: Returns:
None target_var_name_list(list): The fetch variables' name list
Raises: Raises:
ValueError: If `feed_var_names` is not a list of basestring. ValueError: If `feed_var_names` is not a list of basestring.
...@@ -945,11 +948,13 @@ def save_inference_model(dirname, ...@@ -945,11 +948,13 @@ def save_inference_model(dirname,
# TODO(Superjomn) add an IR pass to remove 1-scale op. # TODO(Superjomn) add an IR pass to remove 1-scale op.
with program_guard(main_program): with program_guard(main_program):
uniq_target_vars = [] uniq_target_vars = []
for var in target_vars: for i, var in enumerate(target_vars):
if isinstance(var, Variable): if isinstance(var, Variable):
var1 = layers.scale(var, 1.) var = layers.scale(
uniq_target_vars.append(var1) var, 1., name="save_infer_model/scale_{}".format(i))
uniq_target_vars.append(var)
target_vars = uniq_target_vars target_vars = uniq_target_vars
target_var_name_list = [var.name for var in target_vars]
# when a pserver and a trainer running on the same machine, mkdir may conflict # when a pserver and a trainer running on the same machine, mkdir may conflict
try: try:
...@@ -1006,6 +1011,7 @@ def save_inference_model(dirname, ...@@ -1006,6 +1011,7 @@ def save_inference_model(dirname,
params_filename = os.path.basename(params_filename) params_filename = os.path.basename(params_filename)
save_persistables(executor, dirname, main_program, params_filename) save_persistables(executor, dirname, main_program, params_filename)
return target_var_name_list
def load_inference_model(dirname, def load_inference_model(dirname,
......
...@@ -506,9 +506,9 @@ class While(object): ...@@ -506,9 +506,9 @@ class While(object):
while loop control flow. while loop control flow.
Args: Args:
cond (Variable): condition used to compare. cond(Variable): condition used to compare.
is_test(bool): A flag indicating whether execution is in test phase. is_test(bool): A flag indicating whether execution is in test phase.
name (str): The name of this layer. name(str): The name of this layer.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -589,7 +589,8 @@ class While(object): ...@@ -589,7 +589,8 @@ class While(object):
def lod_rank_table(x, level=0): def lod_rank_table(x, level=0):
"""LoD Rank Table Operator. Given an input variable **x** and a level number """
LoD Rank Table Operator. Given an input variable **x** and a level number
of LoD, this layer creates a LodRankTable object. A LoDRankTable object of LoD, this layer creates a LodRankTable object. A LoDRankTable object
contains a list of bi-element tuples. Each tuple consists of an index and contains a list of bi-element tuples. Each tuple consists of an index and
a length, both of which are int type. Refering to specified level of LoD, a length, both of which are int type. Refering to specified level of LoD,
...@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored): ...@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
return cond return cond
def equal(x, y, cond=None, **ignored): def equal(x, y, cond=None):
""" """
**equal**
This layer returns the truth value of :math:`x == y` elementwise. This layer returns the truth value of :math:`x == y` elementwise.
Args: Args:
...@@ -1458,7 +1457,6 @@ class DynamicRNN(object): ...@@ -1458,7 +1457,6 @@ class DynamicRNN(object):
Returns: Returns:
The current timestep in the input sequence. The current timestep in the input sequence.
""" """
self._assert_in_rnn_block_("step_input") self._assert_in_rnn_block_("step_input")
if not isinstance(x, Variable): if not isinstance(x, Variable):
...@@ -1535,8 +1533,7 @@ class DynamicRNN(object): ...@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
@signature_safe_contextmanager @signature_safe_contextmanager
def block(self): def block(self):
""" """
The block for user to define operators in RNN. See the class docstring The block for user to define operators in RNN.
for more details.
""" """
if self.status != DynamicRNN.BEFORE_RNN: if self.status != DynamicRNN.BEFORE_RNN:
raise ValueError("rnn.block() can only be invoke once") raise ValueError("rnn.block() can only be invoke once")
...@@ -1640,8 +1637,7 @@ class DynamicRNN(object): ...@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
dtype(str|numpy.dtype): The data type of the initialized memory. dtype(str|numpy.dtype): The data type of the initialized memory.
Returns: Returns:
the memory variable. The memory variable.
""" """
self._assert_in_rnn_block_('memory') self._assert_in_rnn_block_('memory')
self._init_zero_idx_() self._init_zero_idx_()
...@@ -1740,7 +1736,7 @@ class DynamicRNN(object): ...@@ -1740,7 +1736,7 @@ class DynamicRNN(object):
def output(self, *outputs): def output(self, *outputs):
""" """
mark the RNN output variables. Mark the RNN output variables.
Args: Args:
outputs: The output variables. outputs: The output variables.
......
...@@ -545,15 +545,16 @@ def yolov3_loss(x, ...@@ -545,15 +545,16 @@ def yolov3_loss(x,
TypeError: Attr ignore_thresh of yolov3_loss must be a float number TypeError: Attr ignore_thresh of yolov3_loss must be a float number
Examples: Examples:
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchors = [0, 1, 2] anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
ignore_thresh=0.5, downsample_ratio=32) anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32)
""" """
helper = LayerHelper('yolov3_loss', **locals()) helper = LayerHelper('yolov3_loss', **locals())
......
...@@ -56,7 +56,10 @@ def data(name, ...@@ -56,7 +56,10 @@ def data(name,
Args: Args:
name(str): The name/alias of the function name(str): The name/alias of the function
shape(list): Tuple declaring the shape. shape(list): Tuple declaring the shape. If :code:`append_batch_size` is
True and there is no -1 inside :code:`shape`, it should be
considered as the shape of the each sample. Otherwise, it
should be considered as the shape of the batched data.
append_batch_size(bool): append_batch_size(bool):
1. If true, it prepends -1 to the shape. 1. If true, it prepends -1 to the shape.
For example if shape=[1], the resulting shape is [-1, 1]. For example if shape=[1], the resulting shape is [-1, 1].
......
...@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype ...@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
__all__ = [ __all__ = [
'deprecated', 'generate_layer_fn', 'generate_layer_fn_noattr', 'autodoc', 'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
'templatedoc' 'templatedoc'
] ]
...@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None): ...@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
buf.write('\n') buf.write('\n')
skip_attrs = OpProtoHolder.generated_op_attr_names() skip_attrs = OpProtoHolder.generated_op_attr_names()
# attr use_mkldnn and is_test also should not be visible to users.
skip_attrs.add("use_mkldnn")
skip_attrs.add("is_test")
for each_attr in op_proto.attrs: for each_attr in op_proto.attrs:
if each_attr.name in skip_attrs: if each_attr.name in skip_attrs:
...@@ -226,7 +229,7 @@ def generate_layer_fn(op_type): ...@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
return func return func
def generate_layer_fn_noattr(op_type): def generate_activation_fn(op_type):
"""Register the Python layer for an Operator without Attribute. """Register the Python layer for an Operator without Attribute.
Args: Args:
...@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type): ...@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):
func.__name__ = op_type func.__name__ = op_type
func.__doc__ = _generate_doc_string_(op_proto) func.__doc__ = _generate_doc_string_(op_proto)
return func return func
......
...@@ -2441,7 +2441,7 @@ def pool2d(input, ...@@ -2441,7 +2441,7 @@ def pool2d(input,
data = fluid.layers.data( data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32') name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.pool2d( pool2d = fluid.layers.pool2d(
input=data, input=data,
pool_size=2, pool_size=2,
pool_type='max', pool_type='max',
...@@ -2490,6 +2490,7 @@ def pool2d(input, ...@@ -2490,6 +2490,7 @@ def pool2d(input,
return pool_out return pool_out
@templatedoc()
def pool3d(input, def pool3d(input,
pool_size=-1, pool_size=-1,
pool_type="max", pool_type="max",
...@@ -2501,13 +2502,19 @@ def pool3d(input, ...@@ -2501,13 +2502,19 @@ def pool3d(input,
name=None, name=None,
exclusive=True): exclusive=True):
""" """
This function adds the operator for pooling in 3-dimensions, using the ${comment}
pooling configurations mentioned in input parameters.
Args: Args:
input (Variable): ${input_comment} input (Variable): The input tensor of pooling operator. The format of
pool_size (int): ${ksize_comment} input tensor is NCDHW, where N is batch size, C is
pool_type (str): ${pooling_type_comment} the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width
of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
pool_type (string): ${pooling_type_comment}
pool_stride (int): stride of the pooling layer. pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size. pool_padding (int): padding size.
global_pooling (bool): ${global_pooling_comment} global_pooling (bool): ${global_pooling_comment}
...@@ -2520,6 +2527,19 @@ def pool3d(input, ...@@ -2520,6 +2527,19 @@ def pool3d(input,
Returns: Returns:
Variable: output of pool3d layer. Variable: output of pool3d layer.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool3d = fluid.layers.pool3d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
""" """
if pool_type not in ["max", "avg"]: if pool_type not in ["max", "avg"]:
raise ValueError( raise ValueError(
...@@ -2569,7 +2589,27 @@ def adaptive_pool2d(input, ...@@ -2569,7 +2589,27 @@ def adaptive_pool2d(input,
require_index=False, require_index=False,
name=None): name=None):
""" """
${comment} **Adaptive Pool2d Operator**
The adaptive_pool2d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(pool_size) should contain two elements which
represent height and width, respectively. Also the H and W dimensions of output(Out)
is same as Parameter(pool_size).
For average adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Args: Args:
input (Variable): The input tensor of pooling operator. The format of input (Variable): The input tensor of pooling operator. The format of
...@@ -2579,8 +2619,8 @@ def adaptive_pool2d(input, ...@@ -2579,8 +2619,8 @@ def adaptive_pool2d(input,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width). it must contain two integers, (pool_size_Height, pool_size_Width).
pool_type: ${pooling_type_comment} pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point along with outputs. require_index (bool): If true, the index of max pooling point will be returned along
it cannot be set in average pooling type. with outputs. It cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically. layer will be named automatically.
...@@ -2661,18 +2701,42 @@ def adaptive_pool3d(input, ...@@ -2661,18 +2701,42 @@ def adaptive_pool3d(input,
require_index=False, require_index=False,
name=None): name=None):
""" """
${comment} **Adaptive Pool3d Operator**
The adaptive_pool3d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(pool_size) should contain
three elements which represent height and width, respectively. Also the D, H and W
dimensions of output(Out) is same as Parameter(pool_size).
For average adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args: Args:
input (Variable): The input tensor of pooling operator. The format of input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is input tensor is NCDHW, where N is batch size, C is
the number of channels, H is the height of the the number of channels, D is the depth of the feature,
feature, and W is the width of the feature. H is the height of the feature, and W is the width of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (Depth, Height, Width). it must contain three integers, (Depth, Height, Width).
pool_type: ${pooling_type_comment} pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point along with outputs. require_index (bool): If true, the index of max pooling point will be returned along
it cannot be set in average pooling type. with outputs. It cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically. layer will be named automatically.
...@@ -2709,7 +2773,7 @@ def adaptive_pool3d(input, ...@@ -2709,7 +2773,7 @@ def adaptive_pool3d(input,
name='data', shape=[3, 32, 32], dtype='float32') name='data', shape=[3, 32, 32], dtype='float32')
pool_out, mask = fluid.layers.adaptive_pool3d( pool_out, mask = fluid.layers.adaptive_pool3d(
input=data, input=data,
pool_size=[3, 3], pool_size=[3, 3, 3],
pool_type='avg') pool_type='avg')
""" """
if pool_type not in ["max", "avg"]: if pool_type not in ["max", "avg"]:
...@@ -2930,6 +2994,7 @@ def batch_norm(input, ...@@ -2930,6 +2994,7 @@ def batch_norm(input,
"momentum": momentum, "momentum": momentum,
"epsilon": epsilon, "epsilon": epsilon,
"is_test": is_test, "is_test": is_test,
"data_layout": data_layout,
"use_mkldnn": False, "use_mkldnn": False,
"fuse_with_relu": fuse_with_relu, "fuse_with_relu": fuse_with_relu,
"use_global_stats": use_global_stats "use_global_stats": use_global_stats
...@@ -3235,7 +3300,7 @@ def group_norm(input, ...@@ -3235,7 +3300,7 @@ def group_norm(input,
# create output # create output
mean_out = helper.create_variable(dtype=dtype, stop_gradient=True) mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_variable(dtype=dtype, stop_gradient=True) variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
group_norm_out = helper.create_variable(dtype) group_norm_out = helper.create_variable(dtype=dtype)
helper.append_op( helper.append_op(
type="group_norm", type="group_norm",
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
from __future__ import print_function from __future__ import print_function
import os import os
from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .layer_function_generator import generate_layer_fn, generate_activation_fn
from .. import core from .. import core
from ..framework import convert_np_dtype_to_dtype_ from ..framework import convert_np_dtype_to_dtype_
...@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') ...@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
__all__ += __activations_noattr__ __all__ += __activations_noattr__
for _OP in set(__activations_noattr__): for _OP in set(__activations_noattr__):
globals()[_OP] = generate_layer_fn_noattr(_OP) globals()[_OP] = generate_activation_fn(_OP)
__all__ += ["uniform_random"] __all__ += ["uniform_random"]
......
...@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer): ...@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer):
Args: Args:
learning_rate (float|Variable): global learning rate. learning_rate (float|Variable): global learning rate.
l1 (float): l1 (float): L1 regularization strength.
l2 (float): l2 (float): L2 regularization strength.
lr_power (float): lr_power (float): Learning Rate Power.
regularization: A Regularizer, such as regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer. fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix. name: A optional name prefix.
......
...@@ -148,7 +148,10 @@ class ParallelExecutor(object): ...@@ -148,7 +148,10 @@ class ParallelExecutor(object):
else framework.default_main_program() else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
build_strategy.enable_inplace = False if main._is_mem_optimized else True if build_strategy.memory_optimize is True:
build_strategy.memory_optimize = False if main._is_mem_optimized else True
if build_strategy.enable_inplace is True:
build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope() scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from, if share_vars_from and not isinstance(share_vars_from,
......
...@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) ...@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
list(REMOVE_ITEM TEST_OPS test_imperative_optimizer) list(REMOVE_ITEM TEST_OPS test_imperative_optimizer)
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
foreach(TEST_OP ${TEST_OPS}) foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP}) py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP) endforeach(TEST_OP)
...@@ -107,10 +108,18 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE ...@@ -107,10 +108,18 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
if(NOT WIN32)
py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
endif()
if(NOT APPLE) if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# change the timeout from 600 to 1200, because in debug mode, this test need more time.
set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200)
endif()
endif() endif()
if (WITH_NGRAPH) if (WITH_NGRAPH)
add_subdirectory(ngraph) add_subdirectory(ngraph)
endif() endif()
......
...@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
build_strategy.memory_optimize = use_ir_memory_optimize build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
# python memory optimization is conflict with inplace pass. # python memory optimization is conflict with inplace pass.
# Use ir graph memory optimization after inplace pass is the correct way. # Use ir graph memory optimization after inplace pass is the correct way.
build_strategy.enable_inplace = False if memory_opt else enable_inplace build_strategy.enable_inplace = False if memory_opt else enable_inplace
......
...@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
regularization=fluid.regularizer.L2Decay(1e-6)) regularization=fluid.regularizer.L2Decay(1e-6))
return optimizer return optimizer
# NOTE(dzh):
# need to make it compatible with elewise fuse act
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={"image": img,
...@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda, use_cuda=use_cuda,
fuse_elewise_add_act_ops=False, fuse_elewise_add_act_ops=False,
memory_opt=False, memory_opt=False,
use_ir_memory_optimize=False,
optimizer=_optimizer) optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
model, model,
...@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda, use_cuda=use_cuda,
fuse_elewise_add_act_ops=True, fuse_elewise_add_act_ops=True,
memory_opt=False, memory_opt=False,
use_ir_memory_optimize=False,
optimizer=_optimizer) optimizer=_optimizer)
for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
...@@ -63,5 +64,54 @@ class TestCase2(TestSliceOp): ...@@ -63,5 +64,54 @@ class TestCase2(TestSliceOp):
self.out = self.input[-3:3, 0:100, :, 2:-1] self.out = self.input[-3:3, 0:100, :, 2:-1]
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16(TestSliceOp):
def config(self):
self.dtype = "float16"
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
self.starts = [-3, 0, 2]
self.ends = [3, 100, -1]
self.axes = [0, 1, 3]
self.out = self.input[-3:3, 0:100, :, 2:-1]
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-5)
def test_check_grad_normal(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['Input'], 'Out', max_relative_error=0.006)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestFP16_2(TestSliceOp):
def config(self):
self.dtype = "float16"
self.input = np.random.random([3, 4, 5]).astype(self.dtype)
self.starts = [0]
self.ends = [1]
self.axes = [1]
self.out = self.input[:, 0:1, :]
def test_check_output(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-5)
def test_check_grad_normal(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['Input'],
'Out',
max_relative_error=0.006,
numeric_grad_delta=0.5)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object): ...@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
skip_dim0 = 0 skip_dim0 = 0
slice_vars = self.param_var_mapping[orig_var_name] slice_vars = self.param_var_mapping[orig_var_name]
orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:]) orig_dim1_flatten = 1
if len(slice_vars[0].shape) >= 2:
orig_dim1_flatten = reduce(lambda x, y: x * y,
slice_vars[0].shape[1:])
for slice_var in slice_vars[:block_idx]: for slice_var in slice_vars[:block_idx]:
skip_dim0 += slice_var.shape[0] skip_dim0 += slice_var.shape[0]
......
requests==2.9.2 requests==2.9.2
numpy>=1.12 numpy>=1.12
protobuf==3.1 protobuf>=3.1.0
recordio>=0.1.0 recordio>=0.1.0
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
rarfile rarfile
......
...@@ -100,6 +100,7 @@ packages=['paddle', ...@@ -100,6 +100,7 @@ packages=['paddle',
'paddle.utils', 'paddle.utils',
'paddle.dataset', 'paddle.dataset',
'paddle.reader', 'paddle.reader',
'paddle.distributed',
'paddle.fluid', 'paddle.fluid',
'paddle.fluid.imperative', 'paddle.fluid.imperative',
'paddle.fluid.proto', 'paddle.fluid.proto',
......
...@@ -17,7 +17,6 @@ OPENSSL_ROOT=openssl-1.1.0i ...@@ -17,7 +17,6 @@ OPENSSL_ROOT=openssl-1.1.0i
OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99 OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
CURL_ROOT=curl-7.49.1 CURL_ROOT=curl-7.49.1
CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1 CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1
AUTOCONF_ROOT=autoconf-2.69 AUTOCONF_ROOT=autoconf-2.69
...@@ -107,11 +106,7 @@ curl-config --features ...@@ -107,11 +106,7 @@ curl-config --features
rm -rf /usr/local/ssl rm -rf /usr/local/ssl
# Install patchelf (latest with unreleased bug fixes) # Install patchelf (latest with unreleased bug fixes)
curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz yum install -y patchelf
check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
tar -xzf patchelf-0.9njs2.tar.gz
(cd patchelf-0.9njs2 && ./configure && make && make install)
rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
# Install latest pypi release of auditwheel # Install latest pypi release of auditwheel
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import subprocess
import os
import sys
import time
import argparse
default_envs = {
"PADDLE_TRAINER_ENDPOINTS":
"127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"PATH": os.getenv("PATH"),
"LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
"PADDLE_TRAINERS_NUM": "8",
"NCCL_DEBUG": "INFO",
"GLOG_v": "0",
"NCCL_SOCKET_IFNAME": "eth0",
"NCCL_IB_GID_INDEX": "3",
"NCCL_IB_RETRY_CNT": "0",
}
GPUS = 8
def start_procs(gpus, cmd, log_dir):
procs = []
log_fns = []
os.system("mkdir -p %s" % log_dir)
# ======== update parent envs =======
for k, v in os.environ.items():
if k.startswith("FLAGS_") or k.startswith("NCCL_") or \
k.startswith("GLOG_"):
default_envs[k] = v
# ======== for dist training =======
node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
current_ip = os.getenv("POD_IP", "127.0.0.1")
trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
num_nodes = len(trainer_ips)
all_nodes_devices_endpoints = ""
for n in trainer_ips:
for i in range(gpus):
if all_nodes_devices_endpoints:
all_nodes_devices_endpoints += ","
all_nodes_devices_endpoints += "%s:617%d" % (n, i)
nranks = num_nodes * gpus
# ======== for dist training =======
for i in range(gpus):
curr_env = {}
curr_env.update(default_envs)
curr_env.update({
"FLAGS_selected_gpus": "%d" % i,
"PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
"PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
# nranks
"PADDLE_TRAINERS_NUM": "%d" % nranks,
"PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
})
print("starting process ", i, cmd, curr_env)
fn = open("%s/workerlog.%d" % (log_dir, i), "w")
log_fns.append(fn)
procs.append(
subprocess.Popen(
cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))
for i in range(gpus):
try:
procs[i].communicate()
procs[i].terminate()
log_fns[i].close()
except:
pass
def main():
parser = argparse.ArgumentParser(
description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
parser.add_argument(
'--gpus',
type=int,
default=8,
help='start number of processes for every gpu')
parser.add_argument(
'--cmd',
type=str,
default="",
help='command to run for each process, e.g. python train.py --lr 0.1')
parser.add_argument(
'--log_dir',
type=str,
default="mylog",
help='directory to put logs per process.')
args = parser.parse_args()
if args.cmd == "":
parser.print_help()
exit(0)
start_procs(args.gpus, args.cmd, args.log_dir)
if __name__ == "__main__":
main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册