提交 d79d2f68 编写于 作者: C chengduozh

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_shape_api_doc

test=develop
...@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ ...@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
else()
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
endif()
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
......
...@@ -240,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -240,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
continue; continue;
} }
} }
VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph)); graph = pass->Apply(std::move(graph));
VLOG(3) << "Finish Apply Pass " << pass->Type();
} }
return graph; return graph;
} }
......
...@@ -49,7 +49,7 @@ DEFINE_bool( ...@@ -49,7 +49,7 @@ DEFINE_bool(
"If this option turns on, only these op in whitelist can be inplaced." "If this option turns on, only these op in whitelist can be inplaced."
"If it turns off, all of the running op can be candidate of inplaced op." "If it turns off, all of the running op can be candidate of inplaced op."
"Such as scale, elementwise_add" "Such as scale, elementwise_add"
"By default, it's turned on"); "By default, it's turned off");
DECLARE_string(memory_optimize_debug); DECLARE_string(memory_optimize_debug);
......
...@@ -13,13 +13,19 @@ ...@@ -13,13 +13,19 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <deque> #include <deque>
#include <functional> #include <functional>
#include <iostream> #include <iterator>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif // PADDLE_WITH_CUDA
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -166,6 +172,11 @@ struct NodeComparator { ...@@ -166,6 +172,11 @@ struct NodeComparator {
bool operator()(ir::Node* lhs, ir::Node* rhs) const { bool operator()(ir::Node* lhs, ir::Node* rhs) const {
auto* lhs_desc = FindVarDescInBlock(lhs); auto* lhs_desc = FindVarDescInBlock(lhs);
auto* rhs_desc = FindVarDescInBlock(rhs); auto* rhs_desc = FindVarDescInBlock(rhs);
// match data type
if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
return false;
}
// match shape
auto lhs_shape = lhs_desc->GetShape(); auto lhs_shape = lhs_desc->GetShape();
auto rhs_shape = rhs_desc->GetShape(); auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
...@@ -230,6 +241,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { ...@@ -230,6 +241,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
return found_node; return found_node;
} }
// Continue a best-fit search past a previously returned candidate.
//
// Locates the pool entry whose front node is `prev`, then scans the entries
// that follow it and returns the front node of the first one NodeComparator
// accepts for `var`. Returns nullptr when no later entry matches. Enforces
// that `prev` is the front node of some entry in the pool.
ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
  auto prev_pos = std::find_if(
      nodes_.begin(), nodes_.end(),
      [prev](const NodeVector& entry) { return entry.front() == prev; });
  PADDLE_ENFORCE(prev_pos != nodes_.end(), "Not found previous in node list!");

  NodeComparator matches;
  // Start strictly after `prev` so the same candidate is never handed back.
  for (auto cur = std::next(prev_pos); cur != nodes_.end(); ++cur) {
    auto* candidate = cur->front();
    if (matches(var, candidate)) {
      return candidate;
    }
  }
  return nullptr;
}
bool OrderedSet::Has(ir::Node* var) const { bool OrderedSet::Has(ir::Node* var) const {
if (mark_table_.count(var->Name())) { if (mark_table_.count(var->Name())) {
auto& node_in_samename = mark_table_.at(var->Name()); auto& node_in_samename = mark_table_.at(var->Name());
...@@ -241,10 +273,15 @@ bool OrderedSet::Has(ir::Node* var) const { ...@@ -241,10 +273,15 @@ bool OrderedSet::Has(ir::Node* var) const {
return false; return false;
} }
// Remove the pool entry registered under the variable name `var`.
// Enforces that the name is present in mark_table_, then drops both the
// entry in nodes_ and its bookkeeping record in mark_table_.
void OrderedSet::Erase(const std::string& var) {
  auto record = mark_table_.find(var);
  PADDLE_ENFORCE(record != mark_table_.end());
  nodes_.erase(record->second);
  mark_table_.erase(record);
}
void OrderedSet::Erase(ir::Node* var) { void OrderedSet::Erase(ir::Node* var) {
PADDLE_ENFORCE(mark_table_.count(var->Name())); PADDLE_ENFORCE(var != nullptr);
nodes_.erase(mark_table_[var->Name()]); Erase(var->Name());
mark_table_.erase(var->Name());
} }
std::string OrderedSet::ToString() const { std::string OrderedSet::ToString() const {
...@@ -274,14 +311,35 @@ bool NodeCanReused(ir::Node* node) { ...@@ -274,14 +311,35 @@ bool NodeCanReused(ir::Node* node) {
return flag; return flag;
} }
// Return the platform's minimum chunk size as an int: the GPU value when
// built with PADDLE_WITH_CUDA, the CPU value otherwise.
int MinChunkSize() {
#ifdef PADDLE_WITH_CUDA
  const int chunk_size = platform::GpuMinChunkSize();
#else
  const int chunk_size = platform::CpuMinChunkSize();
#endif  // PADDLE_WITH_CUDA
  return chunk_size;
}
bool NodeCanReused(const VarDesc& node) { bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType(); auto type = node.GetType();
// only these types holds bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR || if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS || type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) { type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false; return false;
} }
if (node.Persistable() || node.GetShape().empty()) { // persistable variable is parameter
if (node.Persistable()) {
return false;
}
// shape < min_chunk_size is meaningless.
// further more, fetched loss always has size = 1
// which should not be reused.
auto shape = node.GetShape();
int size = std::abs(
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
if (shape.empty() || size < MinChunkSize()) {
return false; return false;
} }
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
...@@ -461,7 +519,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, ...@@ -461,7 +519,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
for (auto* node : ops_) { for (auto* node : ops_) {
if (node == op) break; if (node == op) break;
for (auto& output : node->outputs) { for (auto& output : node->outputs) {
if (output->Name() == name) { PADDLE_ENFORCE((output != nullptr && output->IsVar()),
"Output is empty!");
if (output->Var() && output->Name() == name) {
found_node = output; found_node = output;
} }
} }
......
...@@ -55,6 +55,7 @@ class OrderedSet { ...@@ -55,6 +55,7 @@ class OrderedSet {
void Insert(ir::Node* var); void Insert(ir::Node* var);
void Erase(ir::Node* var); void Erase(ir::Node* var);
void Erase(const std::string& var);
bool Has(ir::Node* var) const; bool Has(ir::Node* var) const;
void Clear() { void Clear() {
mark_table_.clear(); mark_table_.clear();
...@@ -62,6 +63,7 @@ class OrderedSet { ...@@ -62,6 +63,7 @@ class OrderedSet {
} }
// find the bestfit shape node block with var. // find the bestfit shape node block with var.
ir::Node* FindBestFitNode(ir::Node* var) const; ir::Node* FindBestFitNode(ir::Node* var) const;
ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
// map store non-const iterator, can not promise const // map store non-const iterator, can not promise const
int GetNodeIndexInPool(ir::Node* var); int GetNodeIndexInPool(ir::Node* var);
// pool all node to string // pool all node to string
......
...@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) { ...@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) {
ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2]
} }
} }
// Checks that FindBestFitNode followed by repeated FindNextBestFitNode calls
// walks through the pool's candidates for one variable, one per call.
//
// Three variables are inserted: "a" and "c" with shape {128, 128} and "b"
// with shape {128, 129}. Searching for "a" is expected to yield "a" first,
// then "c", then "b".
TEST(OrderedSet, FindBestFitNode) {
  OrderedSet pool;
  std::vector<std::unique_ptr<ir::Node>> nodes;
  ProgramDesc prog;
  BlockDesc* block_desc = prog.MutableBlock(0);
  auto* op_desc = block_desc->AppendOp();
  op_desc->SetType("dummy");
  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);

  {
    auto desc = block_desc->Var("a");
    desc->SetShape({128, 128});
    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
    node->inputs.emplace_back(op.get());
    nodes.emplace_back(std::move(node));
  }
  {
    auto desc = block_desc->Var("b");
    desc->SetShape({128, 129});
    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
    node->inputs.emplace_back(op.get());
    nodes.emplace_back(std::move(node));
  }
  {
    auto desc = block_desc->Var("c");
    desc->SetShape({128, 128});
    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
    node->inputs.emplace_back(op.get());
    nodes.emplace_back(std::move(node));
  }

  for (auto& node : nodes) {
    pool.Insert(node.get());
  }

  // Use gtest assertions (as the other OrderedSet test does) and make sure a
  // candidate was actually returned before dereferencing it, so a failed
  // lookup is reported as a test failure instead of a null dereference.
  auto* n = nodes[0].get();
  auto* cache = pool.FindBestFitNode(n);
  ASSERT_TRUE(cache != nullptr);
  ASSERT_EQ(cache->Name(), "a");

  cache = pool.FindNextBestFitNode(n, cache);
  ASSERT_TRUE(cache != nullptr);
  ASSERT_EQ(cache->Name(), "c");

  cache = pool.FindNextBestFitNode(n, cache);
  ASSERT_TRUE(cache != nullptr);
  ASSERT_EQ(cache->Name(), "b");
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -69,55 +69,59 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -69,55 +69,59 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
} }
for (auto& var : op->outputs) { for (auto& var : op->outputs) {
if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
skip_set_.count(var->Name())) VLOG(3) << "Skip set contains variable of " << var->Name()
<< "disable reuse on it. skipped";
continue; continue;
ir::Node* cache = pool_.FindBestFitNode(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
} }
if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
ir::Node* cache = pool_.FindBestFitNode(var);
while (cache != nullptr && var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused. "
<< cache->Name() << " is re-filled to the pool after "
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
cache = pool_.FindNextBestFitNode(var, cache);
}
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (cache == nullptr) continue; if (cache != nullptr) {
if (var->Name() == cache->Name()) { int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << "The same cache variable is cascade reused." << var->Name() VLOG(3) << string::Sprintf(
<< " is re-filled to the pool after" "!!! %s, %s => %s, cache idx %d, pool size %d",
<< "the reused op is finished. Current op can not " std::to_string(reuse_id++), DebugString(var), DebugString(cache),
<< "replace it again. Skip this candidate."; node_idx_in_pool, static_cast<int>(pool_.size()));
continue; // NOTE(dzhwinter): update the ProgramDesc/IR Graph
// and the CFG Graph on the fly.
int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); //
VLOG(3) << string::Sprintf( // IR Graph define the dependence relationship between nodes.
"!!! %s, %s => %s, cache idx %d, pool size %d", //
std::to_string(reuse_id++), DebugString(var), DebugString(cache), // ProgramDesc defines the input/output vars. Its used in
node_idx_in_pool, static_cast<int>(pool_.size())); // CreateOp, CreateVar when running happens.
//
// update CFG Graph on the fly. // CFG Graph store the liveness information, when reuse happens
// reused var maybe re-fill into the pool // we also need to update the variable liveness.
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); const std::string var_name = var->Name();
// NOTE(dzhwinter): we need to both update the ProgramDesc const std::string cache_name = cache->Name();
// and IR Graph. because op_desc/var_desc is used in CreateOp,
// CreateVar when running happens. But IR Graph
// define the dependence relationship between nodes.
RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
pool_.Erase(cache);
}
// fill the pool cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
std::unordered_set<std::string> unlived_vars; RenameVarInGraphDesc(var_name, cache_name, idx);
for (auto var : cfg_->LiveIn(op)) { RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
if (cfg_->LiveOut(op).count(var) == 0) { pool_.Erase(cache_name);
unlived_vars.emplace(var);
} }
} }
for (auto var : unlived_vars) { }
// fill the pool
for (auto var : cfg_->LiveIn(op)) {
if (cfg_->LiveOut(op).count(var) == 0) {
ir::Node* var_node = cfg_->GetNodeByName(var, op); ir::Node* var_node = cfg_->GetNodeByName(var, op);
if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) { if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node); pool_.Insert(var_node);
} }
...@@ -273,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -273,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
// redirect the input to the latest version of cache_var // redirect the input to the latest version of cache_var
for (auto* node : op->inputs) { for (auto* node : op->inputs) {
if (node->Name() == var) { if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); ir::Node* cache_node = var_nodes_[cache_var].back();
var_nodes_[cache_var].emplace_back(cache_node);
// swap node to cache_node // swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(), cache_node->outputs.insert(cache_node->outputs.end(),
...@@ -283,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -283,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
auto* prev_op = node->inputs[0]; auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node); cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) { for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node); cache_node);
} }
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
} }
} }
...@@ -307,15 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -307,15 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node); cache_node);
} }
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
} }
} }
} }
// release node of unused var in graph
for (auto* node : var_nodes_[var]) {
graph->RemoveNode(node);
}
var_nodes_.at(var).clear();
} }
} // namespace details } // namespace details
......
...@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) { ...@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
op->SetOutput("Out", {"test2_out"}); op->SetOutput("Out", {"test2_out"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64}); prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out"); prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) { ...@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out"); prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) { ...@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0"); prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) { ...@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0"); prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
......
...@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl( ...@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
->assert_is_op("scale") ->assert_is_op("scale")
->assert_op_attr<float>("scale", 1.) ->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.); ->assert_op_attr<float>("bias", 0.);
auto scale_out = detector.mutable_pattern() auto scale_out =
->NewNode("scale_out") detector.mutable_pattern()
->assert_is_op_output("scale"); ->NewNode("scale_out")
->assert_is_op_output("scale")
// scale's output var should has only one consumer, or it can't be
// removed.
->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
pre_op->LinksTo({scale_in}); pre_op->LinksTo({scale_in});
scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
......
...@@ -207,7 +207,7 @@ framework::LoDTensor& VarBase::GradValue() { ...@@ -207,7 +207,7 @@ framework::LoDTensor& VarBase::GradValue() {
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (grad_op_descs_.empty() && backward_id_ <= 0) { if (grad_op_descs_.empty() && backward_id_ <= 0) {
LOG(WARNING) << "op with no grad: " << op_desc_->Type(); VLOG(3) << "op with no grad: " << op_desc_->Type();
return {}; return {};
} }
......
...@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { ...@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
return node.inputs.size() == n; return node.inputs.size() == n;
} }
// Build the iteration order starting from the nodes in `source`.
//
// Enforces that `source` is non-empty and that every start node has
// in-degree 0. Nodes are then processed in rounds: a pending node is appended
// to sorted_ once all of its inputs have been visited, and its not-yet-visited
// outputs become pending for a later round.
NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
  PADDLE_ENFORCE(!source.empty(),
                 "Start points of topological sorting should not be empty!");
  // CHECK all the inputs' in-degree is 0
  for (auto *start : source) {
    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*start, 0));
  }

  std::unordered_set<Node *> visited;
  std::unordered_set<Node *> pending{source.begin(), source.end()};

  while (!pending.empty()) {
    // Iterate over a snapshot because `pending` is modified inside the loop.
    const std::vector<Node *> snapshot(pending.begin(), pending.end());
    for (auto *cur : snapshot) {
      if (Agent(cur).deleted()) {
        visited.insert(cur);
        pending.erase(cur);
        // NOTE: there is no early exit here; a deleted node still goes
        // through the readiness check below.
      }

      const bool inputs_visited =
          std::all_of(cur->inputs.begin(), cur->inputs.end(),
                      [&visited](Node *in) { return visited.count(in) != 0; });
      if (!inputs_visited) {
        continue;
      }

      sorted_.push_back(cur);
      for (auto *succ : cur->outputs) {
        if (visited.count(succ) == 0) {
          pending.insert(succ);
        }
      }
      pending.erase(cur);
      visited.insert(cur);
    }
  }
}
// Copy constructor: duplicates the sorted node sequence and the current
// cursor position of `other`.
NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
// Dereference: returns the node at the current cursor position.
// Enforces that the cursor is inside the sorted sequence.
Node &NodesTSIterator::operator*() {
  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
  Node *current = sorted_[cursor_];
  return *current;
}
// Pre-increment: moves the cursor to the next node. When the cursor runs past
// the last node, the sequence is cleared and the cursor reset to 0, which is
// the same state a default-constructed iterator has.
NodesTSIterator &NodesTSIterator::operator++() {
  ++cursor_;
  const bool exhausted = cursor_ >= sorted_.size();
  if (exhausted) {
    sorted_.clear();
    cursor_ = 0;
  }
  return *this;
}
// Copy assignment: takes over the cursor position and the sorted node
// sequence of `other`, then returns this iterator.
NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
// Two iterators are equal when they hold the same node sequence and point at
// the same position in it.
bool NodesTSIterator::operator==(const NodesTSIterator &other) {
  if (!(sorted_ == other.sorted_)) {
    return false;
  }
  return cursor_ == other.cursor_;
}
// Member access: returns a pointer to the node at the current cursor
// position. Enforces that the cursor is inside the sorted sequence.
Node *NodesTSIterator::operator->() {
  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
  Node *current = sorted_[cursor_];
  return current;
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -30,6 +30,7 @@ namespace inference { ...@@ -30,6 +30,7 @@ namespace inference {
namespace analysis { namespace analysis {
using framework::ir::Graph; using framework::ir::Graph;
using framework::ir::NodesTSIterator;
const char kIsFunctionNode[] = "__is_function_node__"; const char kIsFunctionNode[] = "__is_function_node__";
const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__"; const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
...@@ -132,32 +133,6 @@ struct Agent { ...@@ -132,32 +133,6 @@ struct Agent {
framework::ir::Node *x_; framework::ir::Node *x_;
}; };
// Topological sorting iterator on nodes.
// The visiting order is stored in `sorted_`; `cursor_` is the index of the
// node the iterator currently points at.
struct NodesTSIterator
    : public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
  // --- construction, copy and move ---
  NodesTSIterator() = default;
  explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
  NodesTSIterator(const NodesTSIterator &other);
  NodesTSIterator(NodesTSIterator &&other)
      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
    // The node list has been moved out of `other`; reset its cursor too.
    other.cursor_ = 0;
  }
  NodesTSIterator &operator=(const NodesTSIterator &other);

  // --- element access and advancing ---
  framework::ir::Node &operator*();
  framework::ir::Node *operator->();
  NodesTSIterator &operator++();

  // --- comparison ---
  // TODO(Superjomn) current implementation just compare the first
  // element, need to compare the graph and all the elements in the queue and
  // set.
  bool operator==(const NodesTSIterator &other);
  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }

 private:
  std::vector<framework::ir::Node *> sorted_;
  size_t cursor_{0};
};
// The nodes those have no input will be treated as start points. // The nodes those have no input will be treated as start points.
static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) { static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
std::vector<framework::ir::Node *> result; std::vector<framework::ir::Node *> result;
......
...@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for #pragma omp parallel for
#endif #endif
for (int i = 0; i < fixed_ratios.size(); i++) { for (size_t i = 0; i < fixed_ratios.size(); i++) {
sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i])); sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
} }
...@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
} }
} }
if (clip) { if (clip) {
platform::Transform<platform::CPUDeviceContext> trans; T* dt = boxes->data<T>();
ClipFunctor<T> clip_func; std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
trans(ctx.template device_context<platform::CPUDeviceContext>(), return std::min<T>(std::max<T>(v, 0.), 1.);
boxes->data<T>(), boxes->data<T>() + boxes->numel(), });
boxes->data<T>(), clip_func);
} }
framework::Tensor var_t; framework::Tensor var_t;
var_t.mutable_data<T>( var_t.mutable_data<T>(
...@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif #endif
for (int i = 0; i < box_num; ++i) { for (int i = 0; i < box_num; ++i) {
for (int j = 0; j < variances.size(); ++j) { for (size_t j = 0; j < variances.size(); ++j) {
e_vars(i, j) = variances[j]; e_vars(i, j) = variances[j];
} }
} }
......
...@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior, ...@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
} }
} }
// Functor that clamps a value into the closed range [0, 1]: values below 0
// become 0, values above 1 become 1.
template <typename T>
struct ClipFunctor {
  HOSTDEVICE inline T operator()(T in) const {
    const T floored = std::max<T>(in, 0.);
    return std::min<T>(floored, 1.);
  }
};
template <typename T> template <typename T>
class PriorBoxOpKernel : public framework::OpKernel<T> { class PriorBoxOpKernel : public framework::OpKernel<T> {
public: public:
...@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
boxes->mutable_data<T>(ctx.GetPlace()); boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace()); vars->mutable_data<T>(ctx.GetPlace());
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes); T* b_t = boxes->data<T>();
for (int h = 0; h < feature_height; ++h) { for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) { for (int w = 0; w < feature_width; ++w) {
T center_x = (w + offset) * step_width; T center_x = (w + offset) * step_width;
T center_y = (h + offset) * step_height; T center_y = (h + offset) * step_height;
T box_width, box_height; T box_width, box_height;
int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) { for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s]; auto min_size = min_sizes[s];
if (min_max_aspect_ratios_order) { if (min_max_aspect_ratios_order) {
box_width = box_height = min_size / 2.; box_width = box_height = min_size / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
auto max_size = max_sizes[s]; auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize) // square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.; box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
// priors with different aspect ratios // priors with different aspect ratios
for (size_t r = 0; r < aspect_ratios.size(); ++r) { for (size_t r = 0; r < aspect_ratios.size(); ++r) {
...@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
} }
box_width = min_size * sqrt(ar) / 2.; box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.; box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
} else { } else {
// priors with different aspect ratios // priors with different aspect ratios
...@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
float ar = aspect_ratios[r]; float ar = aspect_ratios[r];
box_width = min_size * sqrt(ar) / 2.; box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.; box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
auto max_size = max_sizes[s]; auto max_size = max_sizes[s];
// square prior with size sqrt(minSize * maxSize) // square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.; box_width = box_height = sqrt(min_size * max_size) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; b_t[0] = (center_x - box_width) / img_width;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; b_t[1] = (center_y - box_height) / img_height;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; b_t[2] = (center_x + box_width) / img_width;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; b_t[3] = (center_y + box_height) / img_height;
idx++; b_t += 4;
} }
} }
} }
...@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
} }
if (clip) { if (clip) {
platform::Transform<platform::CPUDeviceContext> trans; T* dt = boxes->data<T>();
ClipFunctor<T> clip_func; std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
trans(ctx.template device_context<platform::CPUDeviceContext>(), return std::min<T>(std::max<T>(v, 0.), 1.);
boxes->data<T>(), boxes->data<T>() + boxes->numel(), });
boxes->data<T>(), clip_func);
} }
framework::Tensor var_t; framework::Tensor var_t;
......
...@@ -170,13 +170,48 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { ...@@ -170,13 +170,48 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
} }
}; };
// In-place hint for the group_norm op: pairs the input slot "X" with the
// output slot "Y".
// NOTE(review): presumably this allows the in-place pass to let Y reuse X's
// buffer -- confirm against framework::InplaceInToOut.
class GroupNormInplaceInToOut : public framework::InplaceInToOut {
 public:
  using InplaceInToOut::InplaceInToOut;

 protected:
  // Returns the fixed {"X" -> "Y"} pairing. Neither `op_desc` nor `block` is
  // consulted.
  std::unordered_map<std::string, std::string> Apply(
      const framework::OpDesc &op_desc,
      framework::BlockDesc *block) const override {
    std::unordered_map<std::string, std::string> in_to_out;
    in_to_out.emplace("X", "Y");
    return in_to_out;
  }
};
// In-place hint for the group_norm_grad op: pairs the gradient-of-"Y" input
// slot with the gradient-of-"X" output slot.
// NOTE(review): presumably this allows the in-place pass to let dX reuse dY's
// buffer -- confirm against framework::InplaceInToOut.
class GroupNormGradInplaceInToOut : public framework::InplaceInToOut {
 public:
  using InplaceInToOut::InplaceInToOut;

 protected:
  // Returns the fixed {GradVarName("Y") -> GradVarName("X")} pairing. Neither
  // `op_desc` nor `block` is consulted.
  std::unordered_map<std::string, std::string> Apply(
      const framework::OpDesc &op_desc,
      framework::BlockDesc *block) const override {
    // Resolve the key first, then the value, matching the original
    // left-to-right order.
    const std::string grad_in = framework::GradVarName("Y");
    const std::string grad_out = framework::GradVarName("X");
    std::unordered_map<std::string, std::string> in_to_out;
    in_to_out.emplace(grad_in, grad_out);
    return in_to_out;
  }
};
// Var-type inference for group_norm: reports that output "Y" takes the same
// type information as input "X" (see
// framework::PassInDtypeAndVarTypeToOutput for what is propagated).
class GroupNormOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  // Returns the fixed {"X" -> "Y"} pairing.
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    std::unordered_map<std::string, std::string> same_type;
    same_type.emplace("X", "Y");
    return same_type;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker, REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
ops::GroupNormGradMaker); ops::GroupNormOpInferVarType, ops::GroupNormGradMaker,
REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp); ops::GroupNormInplaceInToOut);
REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp,
ops::GroupNormGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>, group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>); ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
......
...@@ -339,6 +339,71 @@ void BenchSoftmaxKernel() { ...@@ -339,6 +339,71 @@ void BenchSoftmaxKernel() {
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchLayerNormKernel() {
const T epsilon = 9.99999975e-06;
for (int n : {1, 2, 10}) {
for (int x_dim_0 : {1, 9, 17, 50}) {
int left = n * x_dim_0;
for (int x_dim_1 : TestSizes()) {
int right = x_dim_1;
int sz = left * right;
Tensor x, mean, var, scale, bias, out;
x.Resize({n, x_dim_0, x_dim_1});
out.Resize({n, x_dim_0, x_dim_1});
mean.Resize({n, x_dim_0});
var.Resize({n, x_dim_0});
scale.Resize({x_dim_1});
bias.Resize({x_dim_1});
RandomVec<T>(sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(left, mean.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(left, var.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(right, scale.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(right, bias.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* scale_data = scale.data<T>();
const T* bias_data = bias.data<T>();
T* x_data = x.data<T>();
T* mean_data = mean.data<T>();
T* var_data = var.data<T>();
T* out_data = out.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>(
right, x_data, out_data, mean_data, var_data, scale_data, bias_data,
left, epsilon, right);
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchCRFDecodingKernel() {
constexpr int state_trans_base_idx = 2;
for (int seq_len : {1, 11, 17, 50}) {
for (int tag_num : TestSizes()) {
int x_sz = seq_len * tag_num;
int w_sz = (tag_num + state_trans_base_idx) * tag_num;
Tensor x, w, alpha, track;
x.Resize({seq_len, tag_num});
w.Resize({tag_num + state_trans_base_idx, tag_num});
alpha.Resize({seq_len, tag_num});
track.Resize({seq_len, tag_num});
RandomVec<T>(x_sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
RandomVec<T>(w_sz, w.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>();
const T* w_data = w.data<T>();
T* alpha_data = alpha.mutable_data<T>(PlaceType());
int* track_data = track.mutable_data<int>(PlaceType());
BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>(
tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
}
}
}
using T = float; using T = float;
using CPUPlace = paddle::platform::CPUPlace; using CPUPlace = paddle::platform::CPUPlace;
...@@ -382,6 +447,16 @@ BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); } ...@@ -382,6 +447,16 @@ BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
// softmax // softmax
BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); } BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
// layernorm: benchmark entry for jit::kLayerNorm, instantiated with the
// file-level aliases T and CPUPlace.
BENCH_FP32_CPU(kLayerNorm) {
BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
}
// crfdecoding: benchmark entry for jit::kCRFDecoding, instantiated with the
// file-level aliases T and CPUPlace.
BENCH_FP32_CPU(kCRFDecoding) {
BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
}
// Benchmark all jit kernels including jitcode, mkl and refer. // Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...] // To use this tool, run command: ./benchmark [options...]
// Options: // Options:
......
...@@ -292,6 +292,63 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>, ...@@ -292,6 +292,63 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
} }
}; };
// Checks one layer-norm implementation (`tgt`) against a precomputed
// reference output: validates the buffer sizes, runs `tgt` into a scratch
// output and compares it element-wise with `outref`.
template <typename T>
struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
                         std::vector<T>, std::vector<T>, std::vector<T>,
                         std::vector<T>, std::vector<T>, int, float, int> {
  void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
                  std::vector<T>& x, std::vector<T>& outref,  // NOLINT
                  std::vector<T>& mean, std::vector<T>& var,  // NOLINT
                  const std::vector<T>& scale, const std::vector<T>& bias,
                  int left, const float epsilon, int right) {
    EXPECT_TRUE(tgt != nullptr);
    // x/out hold left * right elements, mean/var one value per row, and
    // scale/bias one value per column.
    EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
    EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
    EXPECT_EQ(mean.size(), static_cast<size_t>(left));
    EXPECT_EQ(var.size(), static_cast<size_t>(left));
    EXPECT_EQ(scale.size(), static_cast<size_t>(right));
    EXPECT_EQ(bias.size(), static_cast<size_t>(right));

    // Run the implementation under test into its own output buffer.
    std::vector<T> outtgt(outref.size());
    tgt(x.data(), outtgt.data(), mean.data(), var.data(), scale.data(),
        bias.data(), left, epsilon, right);

    ExpectEQ<T>(outtgt.data(), outref.data(), left * right);
  }
};
// Checks one CRF-decoding implementation (`tgt`) against precomputed
// reference outputs (`alpharef`, `trackref`): validates the buffer sizes,
// runs `tgt` into scratch outputs and compares both results element-wise.
template <typename T>
struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
                         std::vector<T>, std::vector<T>, std::vector<int>,
                         int> {
  void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
                  const int seq_len, const std::vector<T>& x,
                  const std::vector<T>& w, std::vector<T>& alpharef,  // NOLINT
                  std::vector<int>& trackref, int tag_num) {          // NOLINT
    // The weight matrix has this many rows in addition to the tag_num rows.
    constexpr int state_trans_base_idx = 2;
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
    EXPECT_EQ(w.size(),
              static_cast<size_t>((tag_num + state_trans_base_idx) * tag_num));
    EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
    EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));

    // Scratch outputs; both vectors start value-initialised (all zeros).
    std::vector<T> alphatgt(alpharef.size());
    std::vector<int> tracktgt(trackref.size());

    // Overwrite the first tag_num reference track entries with the (still
    // zero) scratch entries before `tgt` runs.
    // NOTE(review): presumably the first row of `track` is not produced by
    // the kernels and is being neutralised for the comparison -- confirm.
    memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));

    // x and w are const vectors, so data() already yields const T*.
    tgt(seq_len, x.data(), w.data(), alphatgt.data(), tracktgt.data(),
        tag_num);

    ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
    ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
  }
};
template <jit::KernelType KT, typename KernelTuples, typename PlaceType, template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
typename... Args> typename... Args>
void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
...@@ -640,6 +697,71 @@ void TestNCHW16CMulNCKernel() { ...@@ -640,6 +697,71 @@ void TestNCHW16CMulNCKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestLayerNormKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
const T epsilon = 9.99999975e-06;
for (int n : {1, 2, 10}) {
for (int x_dim_0 : {1, 9, 17, 50}) {
int left = n * x_dim_0;
for (int x_dim_1 : TestSizes()) {
int right = x_dim_1;
auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
EXPECT_TRUE(ref != nullptr);
int sz = left * right;
std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
outref(sz);
RandomVec<T>(sz, x.data(), -2.f, 2.f);
RandomVec<T>(left, mean.data(), -2.f, 2.f);
RandomVec<T>(left, var.data(), -2.f, 2.f);
RandomVec<T>(right, scale.data(), -2.f, 2.f);
RandomVec<T>(right, bias.data(), -2.f, 2.f);
const T* scale_data = scale.data();
const T* bias_data = bias.data();
T* x_data = x.data();
T* mean_data = mean.data();
T* var_data = var.data();
T* outref_data = outref.data();
ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
left, epsilon, right);
TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
std::vector<T>, std::vector<T>, std::vector<T>,
std::vector<T>, std::vector<T>, int, float>(
right, x, outref, mean, var, scale, bias, left, epsilon, right);
}
}
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestCRFDecodingKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
constexpr int state_trans_base_idx = 2;
for (int seq_len : {1, 11, 17, 50}) {
for (int tag_num : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
EXPECT_TRUE(ref != nullptr);
int x_sz = seq_len * tag_num;
int w_sz = (tag_num + state_trans_base_idx) * tag_num;
std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
std::vector<int> trackref(x_sz);
RandomVec<T>(x_sz, x.data(), -2.f, 2.f);
RandomVec<T>(w_sz, w.data(), -2.f, 2.f);
ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
trackref.data(), tag_num);
TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
std::vector<T>, std::vector<T>, std::vector<T>,
std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
trackref, tag_num);
}
}
}
// XYZNTuple // XYZNTuple
TEST(JITKernel, kVMul) { TEST(JITKernel, kVMul) {
TestXYZNKernel<jit::kVMul, float, CPUPlace>(); TestXYZNKernel<jit::kVMul, float, CPUPlace>();
...@@ -761,7 +883,16 @@ TEST(JITKernel, kNCHW16CMulNC) { ...@@ -761,7 +883,16 @@ TEST(JITKernel, kNCHW16CMulNC) {
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>(); TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
} }
// TODO(yihua/TJ): add crf decoding and layer norm unit tests TEST(JITKernel, kLayerNorm) {
TestLayerNormKernel<jit::kLayerNorm, float, paddle::platform::CPUPlace>();
TestLayerNormKernel<jit::kLayerNorm, double, paddle::platform::CPUPlace>();
}
// Runs the CRF-decoding kernel check (TestCRFDecodingKernel) for both float
// and double on CPUPlace.
TEST(JITKernel, kCRFDecoding) {
TestCRFDecodingKernel<jit::kCRFDecoding, float, paddle::platform::CPUPlace>();
TestCRFDecodingKernel<jit::kCRFDecoding, double,
paddle::platform::CPUPlace>();
}
TEST(JITKernel, pool) { TEST(JITKernel, pool) {
// TODO(TJ): add some test // TODO(TJ): add some test
......
...@@ -64,7 +64,7 @@ class LoadCombineOp : public framework::OperatorBase { ...@@ -64,7 +64,7 @@ class LoadCombineOp : public framework::OperatorBase {
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); auto *tensor = out_var->GetMutable<framework::LoDTensor>();
// Error checking // Error checking
PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more"); PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
// Get data from fin to tensor // Get data from fin to tensor
DeserializeFromStream(*buffer, tensor, dev_ctx); DeserializeFromStream(*buffer, tensor, dev_ctx);
...@@ -90,6 +90,10 @@ class LoadCombineOp : public framework::OperatorBase { ...@@ -90,6 +90,10 @@ class LoadCombineOp : public framework::OperatorBase {
tensor->ShareDataWith(fp16_tensor); tensor->ShareDataWith(fp16_tensor);
} }
} }
buffer->peek();
PADDLE_ENFORCE(buffer->eof(),
"You are not allowed to load partial data via "
"load_combine_op, use load_op instead.");
} }
}; };
......
...@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr; lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
} }
// lstm_value.output_value not used in bp, set to nullptr
// lstm_grad.state_active_grad not used in bp, set to nullptr
lstm_value.output_value = nullptr;
lstm_grad.state_active_grad = nullptr;
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
math::LstmUnitGradFunctor<DeviceContext, T>::compute( math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
......
...@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
} }
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
// lstmp_value.output_value not used in bp, set to null
// lstmp_grad.state_active_grad not used in bp, set to null
lstmp_value.output_value = nullptr;
lstmp_grad.state_active_grad = nullptr;
math::LstmUnitGradFunctor<DeviceContext, T>::compute( math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act); gate_act, cell_act, cand_act);
......
...@@ -109,23 +109,23 @@ from future subsequences in a computationally efficient manner to improve ...@@ -109,23 +109,23 @@ from future subsequences in a computationally efficient manner to improve
unidirectional recurrent neural networks. The row convolution operator is unidirectional recurrent neural networks. The row convolution operator is
different from the 1D sequence convolution, and is computed as follows: different from the 1D sequence convolution, and is computed as follows:
Given an input sequence $in$ of length $t$ and input dimension $d$, Given an input sequence $X$ of length $t$ and input dimension $D$,
and a filter ($W$) of size $context \times d$, and a filter ($W$) of size $context \times D$,
the output sequence is convolved as: the output sequence is convolved as:
$$ $$
out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :} out_{i} = \\sum_{j=i}^{i + context - 1} X_{j} \\cdot W_{j-i}
$$ $$
In the above equation: In the above equation:
* $Out_{i}$: The i-th row of output variable with shape [1, D]. * $Out_{i}$: The i-th row of output variable with shape [1, D].
* $\\tau$: Future context size. * $context$: Future context size.
* $X_{j}$: The j-th row of input variable with shape [1, D]. * $X_{j}$: The j-th row of input variable with shape [1, D].
* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D]. * $W_{j-i}$: The (j-i)-th row of parameters with shape [1, D].
More details about row_conv please refer to More details about row_conv please refer to
the design document the design document
......
...@@ -233,9 +233,11 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) { ...@@ -233,9 +233,11 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
#endif // __APPLE__ and windows #endif // __APPLE__ and windows
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#define PADDLE_THROW(...) \ #define PADDLE_THROW(...) \
throw ::paddle::platform::EnforceNotMet( \ do { \
::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__) throw ::paddle::platform::EnforceNotMet( \
::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
} while (0)
#define PADDLE_ENFORCE(COND, ...) \ #define PADDLE_ENFORCE(COND, ...) \
do { \ do { \
...@@ -270,23 +272,25 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) { ...@@ -270,23 +272,25 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
* extra messages is also supported, for example: * extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
*/ */
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \ do { \
if (UNLIKELY(nullptr == (__VAL))) { \ if (UNLIKELY(nullptr == (__VAL))) { \
PADDLE_THROW(#__VAL " should not be null\n%s", \ PADDLE_THROW(#__VAL " should not be null\n%s", \
paddle::string::Sprintf("" __VA_ARGS__)); \ ::paddle::string::Sprintf(__VA_ARGS__)); \
} \ } \
} while (0) } while (0)
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \ do { \
if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \ auto __cond1__ = (__VAL0); \
auto __cond2__ = (__VAL1); \
if (UNLIKELY(!((__cond1__)__CMP(__cond2__)))) { \
PADDLE_THROW("Enforce failed. Expected %s " #__CMP \ PADDLE_THROW("Enforce failed. Expected %s " #__CMP \
" %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \ " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
#__VAL0, #__VAL1, #__VAL0, \ #__VAL0, #__VAL1, #__VAL0, \
paddle::string::to_string(__VAL0), #__VAL1, \ ::paddle::string::to_string(__cond1__), #__VAL1, \
paddle::string::to_string(__VAL1), \ ::paddle::string::to_string(__cond2__), \
paddle::string::Sprintf("" __VA_ARGS__)); \ ::paddle::string::Sprintf(__VA_ARGS__)); \
} \ } \
} while (0) } while (0)
......
...@@ -13,10 +13,12 @@ ...@@ -13,10 +13,12 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/ir.h"
#include <algorithm>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
...@@ -27,6 +29,10 @@ namespace py = pybind11; ...@@ -27,6 +29,10 @@ namespace py = pybind11;
using paddle::framework::ir::Graph; using paddle::framework::ir::Graph;
using paddle::framework::ir::Node; using paddle::framework::ir::Node;
using paddle::framework::ir::GraphSafeRemoveNodes; using paddle::framework::ir::GraphSafeRemoveNodes;
using paddle::framework::ir::HasCircle;
using paddle::framework::ir::GraphNum;
using paddle::framework::ir::TopologySortOperations;
using paddle::framework::ir::BuildOperationAdjList;
using paddle::framework::OpDesc; using paddle::framework::OpDesc;
using paddle::framework::ProgramDesc; using paddle::framework::ProgramDesc;
using paddle::framework::VarDesc; using paddle::framework::VarDesc;
...@@ -36,6 +42,12 @@ namespace paddle { ...@@ -36,6 +42,12 @@ namespace paddle {
namespace pybind { namespace pybind {
void BindGraph(py::module *m) { void BindGraph(py::module *m) {
m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes); m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
m->def("has_circle", HasCircle);
m->def("graph_num", GraphNum);
m->def("topology_sort", TopologySortOperations,
return_value_policy::reference);
m->def("build_adjacency_list", BuildOperationAdjList,
return_value_policy::reference);
py::class_<Graph, std::shared_ptr<Graph>>( py::class_<Graph, std::shared_ptr<Graph>>(
*m, "Graph", *m, "Graph",
"The graph is a Directed Acyclic Single Static Assignment Graph, see " "The graph is a Directed Acyclic Single Static Assignment Graph, see "
...@@ -46,7 +58,6 @@ void BindGraph(py::module *m) { ...@@ -46,7 +58,6 @@ void BindGraph(py::module *m) {
.def("get_float", &Graph::Get<float>) .def("get_float", &Graph::Get<float>)
.def("get_double", &Graph::Get<double>) .def("get_double", &Graph::Get<double>)
.def("get_string", &Graph::Get<std::string>) .def("get_string", &Graph::Get<std::string>)
.def("get_program", &Graph::Get<ProgramDesc>)
.def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>) .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>)
.def("set", [](Graph &self, const std::string &attr_name, .def("set", [](Graph &self, const std::string &attr_name,
int attr) { return self.Set(attr_name, new int(attr)); }) int attr) { return self.Set(attr_name, new int(attr)); })
...@@ -63,11 +74,6 @@ void BindGraph(py::module *m) { ...@@ -63,11 +74,6 @@ void BindGraph(py::module *m) {
[](Graph &self, const std::string &attr_name, double attr) { [](Graph &self, const std::string &attr_name, double attr) {
return self.Set(attr_name, new double(attr)); return self.Set(attr_name, new double(attr));
}) })
.def("set",
[](Graph &self, const std::string &attr_name,
const ProgramDesc &attr) {
return self.Set(attr_name, new ProgramDesc(attr));
})
.def("set", .def("set",
[](Graph &self, const std::string &attr_name, [](Graph &self, const std::string &attr_name,
const std::unordered_set<const Node *> &attr) { const std::unordered_set<const Node *> &attr) {
...@@ -108,42 +114,42 @@ void BindNode(py::module *m) { ...@@ -108,42 +114,42 @@ void BindNode(py::module *m) {
.def("is_op", &Node::IsOp) .def("is_op", &Node::IsOp)
.def("is_var", &Node::IsVar) .def("is_var", &Node::IsVar)
.def("is_ctrl_var", &Node::IsCtrlVar) .def("is_ctrl_var", &Node::IsCtrlVar)
.def("clear_inputs", [](Node &self) { self.inputs.clear(); })
.def("inputs_remove", .def("inputs_remove",
[](Node &self, int node_id) { [](Node &self, int node_id) {
for (auto it = self.inputs.begin(); it != self.inputs.end(); auto pos = std::find_if(
it++) { self.inputs.begin(), self.inputs.end(),
if ((*it)->id() == node_id) { [&node_id](const Node *n) { return n->id() == node_id; });
self.inputs.erase(it); if (pos != self.inputs.end()) {
} self.inputs.erase(pos);
} }
}) })
.def("inputs_remove", .def("inputs_remove",
[](Node &self, Node &node) { [](Node &self, Node &node) {
for (auto it = self.inputs.begin(); it != self.inputs.end(); auto pos =
it++) { std::find(self.inputs.begin(), self.inputs.end(), &node);
if (*it == &node) { if (pos != self.inputs.end()) {
self.inputs.erase(it); self.inputs.erase(pos);
}
} }
}) })
.def("inputs_append", .def("inputs_append",
[](Node &self, Node &node) { self.inputs.push_back(&node); }) [](Node &self, Node &node) { self.inputs.push_back(&node); })
.def("clear_outputs", [](Node &self) { self.outputs.clear(); })
.def("outputs_remove", .def("outputs_remove",
[](Node &self, int node_id) { [](Node &self, int node_id) {
for (auto it = self.outputs.begin(); it != self.outputs.end(); auto pos = std::find_if(
it++) { self.outputs.begin(), self.outputs.end(),
if ((*it)->id() == node_id) { [&node_id](const Node *n) { return n->id() == node_id; });
self.outputs.erase(it); if (pos != self.outputs.end()) {
} self.outputs.erase(pos);
} }
}) })
.def("outputs_remove", .def("outputs_remove",
[](Node &self, Node &node) { [](Node &self, Node &node) {
for (auto it = self.outputs.begin(); it != self.outputs.end(); auto pos =
it++) { std::find(self.outputs.begin(), self.outputs.end(), &node);
if (*it == &node) { if (pos != self.outputs.end()) {
self.outputs.erase(it); self.outputs.erase(pos);
}
} }
}) })
.def("outputs_append", .def("outputs_append",
......
...@@ -829,8 +829,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -829,8 +829,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("disable_profiler", platform::DisableProfiler); m.def("disable_profiler", platform::DisableProfiler);
m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
m.def("get_pass", [](const py::bytes &binary_str) { m.def("get_pass", [](const std::string &pass_type) {
std::string pass_type(binary_str);
auto pass = framework::ir::PassRegistry::Instance().Get(pass_type); auto pass = framework::ir::PassRegistry::Instance().Get(pass_type);
return std::shared_ptr<framework::ir::Pass>(std::move(pass)); return std::shared_ptr<framework::ir::Pass>(std::move(pass));
}); });
...@@ -838,10 +837,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -838,10 +837,9 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass"); py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init()) pass.def(py::init())
.def("has", &ir::Pass::Has) .def("has", &ir::Pass::Has)
.def("set", .def("set_not_owned",
[](ir::Pass &self, const std::string &attr_name, [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) {
const ProgramDesc &attr) { self.SetNotOwned<ProgramDesc>(attr_name, &attr);
return self.Set(attr_name, new ProgramDesc(attr));
}) })
.def( .def(
"set", "set",
...@@ -850,7 +848,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -850,7 +848,6 @@ All parameter, weight, gradient are variables in Paddle.
}) })
.def("set", [](ir::Pass &self, const std::string &name, .def("set", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); }) int val) { self.Set<const int>(name, new int(val)); })
.def("get_program", &ir::Pass::Get<ProgramDesc>)
.def("type", &ir::Pass::Type) .def("type", &ir::Pass::Type)
.def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) { .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
std::unique_ptr<ir::Graph> origin_graph(graph.get()); std::unique_ptr<ir::Graph> origin_graph(graph.get());
......
...@@ -64,6 +64,7 @@ if (WITH_TESTING) ...@@ -64,6 +64,7 @@ if (WITH_TESTING)
add_subdirectory(paddle/dataset/tests) add_subdirectory(paddle/dataset/tests)
add_subdirectory(paddle/fluid/tests) add_subdirectory(paddle/fluid/tests)
add_subdirectory(paddle/fluid/contrib/tests) add_subdirectory(paddle/fluid/contrib/tests)
add_subdirectory(paddle/fluid/contrib/slim/tests)
endif() endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels DESTINATION opt/paddle/share/wheels
......
...@@ -177,7 +177,10 @@ class CompiledProgram(object): ...@@ -177,7 +177,10 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True if self._build_strategy.memory_optimize is None:
self._build_strategy.memory_optimize = False if main._is_mem_optimized else True
if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False if main._is_mem_optimized else True
if self._build_strategy.num_trainers > 1 and trainers_endpoints: if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len( assert self._build_strategy.num_trainers == len(
......
...@@ -63,10 +63,10 @@ Notes: ...@@ -63,10 +63,10 @@ Notes:
## 4. How to reproduce the results ## 4. How to reproduce the results
* Small dataset * Small dataset
```bash ```bash
python python/paddle/fluid/contrib/tests/test_calibration.py FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
``` ```
* Full dataset * Full dataset
```bash ```bash
DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
``` ```
# Collect every test_*.py in this directory, with paths relative to it.
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
# Remove ".py" from the collected names so each entry can be used as a name.
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# Hand each script to py_test (a macro defined elsewhere in the project;
# presumably it registers the script as a test -- confirm).
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
version: 1.0 version: 1.0
include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"] include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"]
pruners: pruners:
pruner_1: pruner_1:
class: 'RatioPruner' class: 'RatioPruner'
......
...@@ -18,7 +18,7 @@ import unittest ...@@ -18,7 +18,7 @@ import unittest
class TestFactory(unittest.TestCase): class TestFactory(unittest.TestCase):
def test_parse(self): def test_parse(self):
factory = ConfigFactory('./unitest/configs/config.yaml') factory = ConfigFactory('./configs/config.yaml')
pruner = factory.instance('pruner_1') pruner = factory.instance('pruner_1')
self.assertEquals(pruner.ratios['conv1_1.w'], 0.3) self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
......
# copyright (c) 2018 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import six
from paddle.fluid.framework import IrGraph
from paddle.fluid import core
def residual_block(num):
    """Build a stack of residual blocks and return its mean cross-entropy loss.

    Args:
        num: how many residual blocks to chain after the input layer.

    Returns:
        The variable produced by ``fluid.layers.mean`` over the cross-entropy
        between the final fully-connected output and the ``label`` input.
    """

    def conv_bn_layer(input,
                      ch_out,
                      filter_size,
                      stride,
                      padding,
                      act='relu',
                      bias_attr=False):
        # The convolution itself has no activation; `act` is applied by the
        # batch norm that follows it.
        conv_out = fluid.layers.conv2d(
            input=input,
            num_filters=ch_out,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            act=None,
            bias_attr=bias_attr)
        return fluid.layers.batch_norm(input=conv_out, act=act)

    image = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    features = image
    for _ in six.moves.xrange(num):
        # Main branch (3x3 conv with bias) and shortcut branch (1x1 conv),
        # summed and passed through relu.
        main_branch = conv_bn_layer(
            features, 16, 3, 1, 1, act=None, bias_attr=True)
        shortcut = conv_bn_layer(features, 16, 1, 1, 0, act=None)
        features = fluid.layers.elementwise_add(
            x=main_branch, y=shortcut, act='relu')

    logits = fluid.layers.fc(input=features, size=10)
    per_sample_loss = fluid.layers.cross_entropy(input=logits, label=label)
    return fluid.layers.mean(per_sample_loss)
class TestGraph(unittest.TestCase):
    """Checks the structural query helpers of IrGraph on a residual network."""

    def test_graph_functions(self):
        """Build a 2-block residual training program and inspect its graph."""
        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(main_prog, startup_prog):
            loss = residual_block(2)
            optimizer = fluid.optimizer.Adam(learning_rate=0.001)
            optimizer.minimize(loss)

        ir_graph = IrGraph(core.Graph(main_prog.desc), for_test=False)

        # Every operator whose name contains 'conv2d' is highlighted in the
        # drawing and removed from the graph at the end of the test.
        conv_ops = {op for op in ir_graph.all_ops() if 'conv2d' in op.name()}
        ir_graph.draw('.', 'residual', conv_ops)

        self.assertFalse(ir_graph.has_circle())
        self.assertEqual(ir_graph.graph_num(), 1)

        sorted_nodes = ir_graph.topology_sort()
        self.assertEqual(len(sorted_nodes), len(ir_graph.all_ops()))

        adjacency = ir_graph.build_adjacency_list()
        self.assertEqual(len(adjacency), len(ir_graph.all_ops()))

        count_before = len(ir_graph.all_nodes())
        ir_graph.safe_remove_nodes(conv_ops)
        self.assertEqual(
            len(ir_graph.all_nodes()), count_before - len(conv_ops))
# Run the unittest runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -17,9 +17,12 @@ import random ...@@ -17,9 +17,12 @@ import random
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import six import six
from paddle.fluid.framework import Program import paddle
from paddle.fluid.framework import IrGraph from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
from paddle.fluid import core from paddle.fluid import core
...@@ -65,6 +68,28 @@ def residual_block(num): ...@@ -65,6 +68,28 @@ def residual_block(num):
return loss return loss
def conv_net(img, label):
    """Build a two-stage conv/pool classifier and return its mean loss.

    Args:
        img: input variable fed to the first conv/pool stage.
        label: label variable used by the cross-entropy loss.

    Returns:
        The variable produced by ``fluid.layers.mean`` over the cross-entropy
        between the softmax prediction and ``label``.
    """
    # Stage 1: 20 filters of size 5, 2x2 pooling with stride 2, then batch norm.
    stage1 = fluid.nets.simple_img_conv_pool(
        input=img,
        num_filters=20,
        filter_size=5,
        pool_size=2,
        pool_stride=2,
        act="relu")
    stage1 = fluid.layers.batch_norm(stage1)

    # Stage 2: same layout with 50 filters.
    stage2 = fluid.nets.simple_img_conv_pool(
        input=stage1,
        num_filters=50,
        filter_size=5,
        pool_size=2,
        pool_stride=2,
        act="relu")

    probs = fluid.layers.fc(input=stage2, size=10, act='softmax')
    per_sample_loss = fluid.layers.cross_entropy(input=probs, label=label)
    return fluid.layers.mean(per_sample_loss)
class TestQuantizationTransformPass(unittest.TestCase): class TestQuantizationTransformPass(unittest.TestCase):
def setUp(self): def setUp(self):
self.quantizable_op_and_inputs = { self.quantizable_op_and_inputs = {
...@@ -171,5 +196,177 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -171,5 +196,177 @@ class TestQuantizationTransformPass(unittest.TestCase):
self.residual_block_quant('range_abs_max') self.residual_block_quant('range_abs_max')
class TestQuantizationFreezePass(unittest.TestCase):
    """Runs the transform, freeze, int8-convert and mobile passes on a small conv net.

    Each test method calls `freeze_graph` with one device / activation
    quantization type combination.
    """

    def freeze_graph(self, use_cuda, seed, quant_type):
        """Train a quantized conv net briefly, then apply the inference passes.

        Args:
            use_cuda: if true run on ``fluid.CUDAPlace(0)``, otherwise on
                ``fluid.CPUPlace()``.
            seed: random seed assigned to the programs built here.
            quant_type: value passed as ``activation_quantize_type`` to
                ``QuantizationTransformPass``; also used in output file names.
        """

        def build_program(main, startup, is_test):
            """Build `conv_net` in `main`; add an Adam optimizer unless `is_test`."""
            main.random_seed = seed
            startup.random_seed = seed
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(
                        name='image', shape=[1, 28, 28], dtype='float32')
                    label = fluid.layers.data(
                        name='label', shape=[1], dtype='int64')
                    loss = conv_net(img, label)
                    if not is_test:
                        opt = fluid.optimizer.Adam(learning_rate=0.001)
                        opt.minimize(loss)
            return [img, label], loss

        random.seed(0)
        np.random.seed(0)

        # The train and test programs are built separately but share `startup`.
        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)
        # Apply the same transform pass instance to both graphs.
        transform_pass = QuantizationTransformPass(
            scope=scope, program_exe=exe, activation_quantize_type=quant_type)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)
        dev_name = '_gpu_' if use_cuda else '_cpu_'
        # Draw each graph, marking the ops whose name contains 'quantize'.
        marked_nodes = set()
        for op in main_graph.all_ops():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
        marked_nodes = set()
        for op in test_graph.all_ops():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)

        quantized_main_program = main_graph.to_program()
        quantized_test_program = test_graph.to_program()
        iters = 5
        batch_size = 8

        #train_exe = fluid.ParallelExecutor(
        #    main_program=quantized_main_program,
        #    use_cuda=bool(use_cuda),
        #    loss_name=loss.name,
        #    scope=scope)
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=500),
            batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                # A new generator is created on every iteration, so each step
                # consumes the first batch of a fresh train_reader() pass.
                data = next(train_reader())
                # loss_v is only referenced by the commented-out print below.
                loss_v = exe.run(program=quantized_main_program,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                #loss_v = train_exe.run(feed=feeder.feed(data),
                #                       fetch_list=[loss.name])
                #print('{}: {}'.format('loss' + dev_name + quant_type, loss_v))

        test_data = next(test_reader())
        with fluid.program_guard(quantized_test_program):
            w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
                                             quantized_test_program)
        # Testing
        with fluid.scope_guard(scope):
            test_loss1, w_quant = exe.run(program=quantized_test_program,
                                          feed=feeder.feed(test_data),
                                          fetch_list=[loss, w_var])

        # Freeze graph for inference, but the weight of fc/conv is still float type.
        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
        freeze_pass.apply(test_graph)
        marked_nodes = set()
        for op in test_graph.all_ops():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
                        marked_nodes)

        server_program = test_graph.to_program()
        with fluid.scope_guard(scope):
            test_loss2, = exe.run(program=server_program,
                                  feed=feeder.feed(test_data),
                                  fetch_list=[loss])
        # The frozen program must reproduce the pre-freeze test loss within 5e-3.
        self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
        #print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1))
        #print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2))
        w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
        # Maybe failed, this is due to the calculation precision
        # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
        #print('{}: {}'.format('w_freeze' + dev_name + quant_type,
        #                      np.sum(w_freeze)))
        #print('{}: {}'.format('w_quant' + dev_name + quant_type,
        #                      np.sum(w_quant)))

        # Convert parameter to 8-bit.
        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
        convert_int8_pass.apply(test_graph)
        marked_nodes = set()
        for op in test_graph.all_ops():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_int8' + dev_name + quant_type, marked_nodes)
        server_program_int8 = test_graph.to_program()
        # Save the 8-bit parameter and model file.
        with fluid.scope_guard(scope):
            fluid.io.save_inference_model('server_int8' + dev_name + quant_type,
                                          ['image', 'label'], [loss], exe,
                                          server_program_int8)
            # Test whether the 8-bit parameter and model file can be loaded successfully.
            [infer, feed, fetch] = fluid.io.load_inference_model(
                'server_int8' + dev_name + quant_type, exe)
        # Check the loaded 8-bit weight.
        w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
        self.assertEqual(w_8bit.dtype, np.int8)
        self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
        #print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit)))
        #print('{}: {}'.format('w_freeze' + dev_name + quant_type,
        #                      np.sum(w_freeze)))

        # Apply the mobile transform to the same test graph and save the result.
        mobile_pass = TransformForMobilePass()
        mobile_pass.apply(test_graph)
        marked_nodes = set()
        for op in test_graph.all_ops():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
                        marked_nodes)

        mobile_program = test_graph.to_program()
        with fluid.scope_guard(scope):
            fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type,
                                          ['image', 'label'], [loss], exe,
                                          mobile_program)

    def test_freeze_graph_cuda_dynamic(self):
        """'abs_max' on GPU; does nothing when Paddle is not compiled with CUDA."""
        if fluid.core.is_compiled_with_cuda():
            with fluid.unique_name.guard():
                self.freeze_graph(True, seed=1, quant_type='abs_max')

    def test_freeze_graph_cpu_dynamic(self):
        """'abs_max' on CPU."""
        with fluid.unique_name.guard():
            self.freeze_graph(False, seed=2, quant_type='abs_max')

    def test_freeze_graph_cuda_static(self):
        """'range_abs_max' on GPU; does nothing when Paddle is not compiled with CUDA."""
        if fluid.core.is_compiled_with_cuda():
            with fluid.unique_name.guard():
                self.freeze_graph(True, seed=1, quant_type='range_abs_max')

    def test_freeze_graph_cpu_static(self):
        """'range_abs_max' on CPU."""
        with fluid.unique_name.guard():
            self.freeze_graph(False, seed=2, quant_type='range_abs_max')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL) ...@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
endif() endif()
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) if(src MATCHES "test_calibration")
py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach() endforeach()
...@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
def run_program(self, model_path, generate_int8=False, algo='direct'): def run_program(self, model_path, generate_int8=False, algo='direct'):
image_shape = [3, 224, 224] image_shape = [3, 224, 224]
os.environ['FLAGS_use_mkldnn'] = 'True'
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
...@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
label = label.reshape([-1, 1]) label = label.reshape([-1, 1])
running_program = calibrator.sampling_program.clone( running_program = calibrator.sampling_program.clone(
) if generate_int8 else infer_program.clone() ) if generate_int8 else infer_program.clone()
for op in running_program.current_block().ops:
if op.has_attr("use_mkldnn"):
op._set_attr("use_mkldnn", True)
t1 = time.time() t1 = time.time()
_, acc1, _ = exe.run( _, acc1, _ = exe.run(
......
...@@ -204,9 +204,11 @@ class TestQuantizeTranspiler(unittest.TestCase): ...@@ -204,9 +204,11 @@ class TestQuantizeTranspiler(unittest.TestCase):
build_program(test_program, startup, True) build_program(test_program, startup, True)
test_program = test_program.clone(for_test=True) test_program = test_program.clone(for_test=True)
quant_transpiler = QuantizeTranspiler() quant_type = 'range_abs_max' # 'range_abs_max' or 'abs_max'
quant_transpiler.training_transpile(main) quant_transpiler = QuantizeTranspiler(
quant_transpiler.training_transpile(test_program) activation_quantize_type=quant_type)
quant_transpiler.training_transpile(main, startup)
quant_transpiler.training_transpile(test_program, startup)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
......
...@@ -16,6 +16,8 @@ from __future__ import print_function ...@@ -16,6 +16,8 @@ from __future__ import print_function
import collections import collections
from collections import defaultdict from collections import defaultdict
from collections import Iterable
import contextlib
from .wrapped_decorator import signature_safe_contextmanager from .wrapped_decorator import signature_safe_contextmanager
import os import os
import re import re
...@@ -1529,12 +1531,16 @@ class Block(object): ...@@ -1529,12 +1531,16 @@ class Block(object):
class IrGraph(object): class IrGraph(object):
""" """
IrGraph uses core.Graph as the delegation to accomplish the manipulation. Python IrGraph. Beneath it is a core.Graph, which is used for
create a c++ Ir Pass Graph. An IrGraph is just a graph view of
a Program. In an IrGraph, both Variables and Operators are graph
nodes.
""" """
def __init__(self, graph, for_test=False): def __init__(self, graph, for_test=False):
""" """
Construct the IrGraph using core.Graph. Construct an IrGraph using core.Graph.
Args: Args:
graph(core.Graph): C++ Graph. graph(core.Graph): C++ Graph.
for_test(bool): True for the test graph and false for the train graph. for_test(bool): True for the test graph and false for the train graph.
...@@ -1545,23 +1551,81 @@ class IrGraph(object): ...@@ -1545,23 +1551,81 @@ class IrGraph(object):
self._for_test = for_test self._for_test = for_test
def is_test(self): def is_test(self):
"""
If the graph is used for testing, the function returns true. Otherwise, returns false.
"""
return self._for_test return self._for_test
def all_parameters(self): def all_nodes(self):
param_nodes = set() """
for node in self.graph.nodes(): Return all nodes included in the graph as a set.
if node.is_var() and node.var() is not None and node.var( """
).persistable(): return {node for node in self.graph.nodes()}
param_nodes.add(node)
return param_nodes
def all_vars(self): def all_vars(self):
"""
Return all variable nodes included in the graph as a set.
"""
return {node for node in self.graph.nodes() if node.is_var()} return {node for node in self.graph.nodes() if node.is_var()}
def all_persistable_vars(self):
    """
    Collect the persistable variable nodes of the graph.

    A node qualifies when it is a variable node, carries a variable
    description, and that description reports itself as persistable.

    Returns:
        set(core.Node): the persistable variable nodes.
    """
    return {
        node
        for node in self.graph.nodes()
        if node.is_var() and node.var() is not None and
        node.var().persistable()
    }
def all_ops(self): def all_ops(self):
"""
Return all operator nodes included in the graph as a set.
"""
return {node for node in self.graph.nodes() if node.is_op()} return {node for node in self.graph.nodes() if node.is_op()}
def var_node(self, name):
    """
    Get a variable node by name from the graph.

    Args:
        name(str): the name of the variable node.

    Raises:
        TypeError: If `name` is not a string.
        ValueError: If this graph doesn't have a variable node with the
            given name.

    Returns:
        core.Node: the variable node with the given name.
    """
    if not isinstance(name, six.string_types):
        raise TypeError(
            "var require string as parameter, but get %s instead." %
            (type(name)))
    target_var_node = None
    # The scan deliberately does not stop at the first hit: if several
    # variable nodes share the name, the last one visited is returned,
    # which keeps the original lookup behaviour.
    for var_node in self.all_vars():
        if var_node.name() == name:
            target_var_node = var_node
    if target_var_node is None:
        raise ValueError("var_node %s not in this graph" % name)
    return target_var_node
def create_param_node(self, name, var_type, shape, var_dtype): def create_param_node(self, name, var_type, shape, var_dtype):
"""
Create a persistable variable node in the graph. In IrGraph,
it can not distinguish between persistable variables and parameters.
Args:
name(str): the name of the persistable variable node.
var_type(core.VarDesc.VarType): the type of the persistable variable node.
shape(list): the shape of the persistable variable node.
var_dtype(core.VarDesc.VarType): the data type of the persistable variable node.
Returns:
core.Node: the created persistable variable node.
"""
var_desc = core.VarDesc(name) var_desc = core.VarDesc(name)
var_desc.set_type(var_type) var_desc.set_type(var_type)
var_desc.set_shape(shape) var_desc.set_shape(shape)
...@@ -1570,6 +1634,20 @@ class IrGraph(object): ...@@ -1570,6 +1634,20 @@ class IrGraph(object):
return self.graph.create_var_node(var_desc) return self.graph.create_var_node(var_desc)
def create_var_node(self, name, var_type, shape, var_dtype): def create_var_node(self, name, var_type, shape, var_dtype):
"""
Create a variable node in the graph. The created variable node is
not persistable.
Args:
name(str): the name of the variable node.
var_type(core.VarDesc.VarType): the type of the variable node.
shape(list): the shape of the variable node.
var_dtype(core.VarDesc.VarType): the data type of the variable node.
Returns:
core.Node: the created variable node.
"""
var_desc = core.VarDesc(name) var_desc = core.VarDesc(name)
var_desc.set_type(var_type) var_desc.set_type(var_type)
var_desc.set_shape(shape) var_desc.set_shape(shape)
...@@ -1577,19 +1655,41 @@ class IrGraph(object): ...@@ -1577,19 +1655,41 @@ class IrGraph(object):
return self.graph.create_var_node(var_desc) return self.graph.create_var_node(var_desc)
def create_var_node_from_desc(self, var_desc): def create_var_node_from_desc(self, var_desc):
"""
Create a variable node by using an existing VarDesc in the graph.
Depend on the giving VarDesc, the created variable node may be persistable.
Args:
var_desc(core.VarDesc): the giving variable description.
Returns:
core.Node: the created variable node.
"""
return self.graph.create_var_node(var_desc) return self.graph.create_var_node(var_desc)
def create_op_node(self, op_type, attrs, inputs, outputs): def create_op_node(self, op_type, attrs, inputs, outputs):
"""
Create a operator node in the graph.
Args:
op_type(str): the type of the operator node.
attrs(dict): the attributes of the operator node.
inputs(dict): the inputs of the operator node.
outputs(dict): the outputs of the operator node.
Returns:
core.Node: the created operator node.
"""
op_desc = core.OpDesc() op_desc = core.OpDesc()
op_desc.set_type(op_type) op_desc.set_type(op_type)
for attr, value in attrs.iteritems(): for attr, value in six.iteritems(attrs):
self._update_desc_attr(op_desc, attr, value) self._update_desc_attr(op_desc, attr, value)
for input_name, var_nodes in inputs.iteritems(): for input_name, var_nodes in six.iteritems(inputs):
if not isinstance(var_nodes, list): if not isinstance(var_nodes, list):
var_nodes = [var_nodes] var_nodes = [var_nodes]
op_desc.set_input(input_name, op_desc.set_input(input_name,
[var_node.name() for var_node in var_nodes]) [var_node.name() for var_node in var_nodes])
for output_name, var_nodes in outputs.iteritems(): for output_name, var_nodes in six.iteritems(outputs):
if not isinstance(var_nodes, list): if not isinstance(var_nodes, list):
var_nodes = [var_nodes] var_nodes = [var_nodes]
op_desc.set_output(output_name, op_desc.set_output(output_name,
...@@ -1597,11 +1697,29 @@ class IrGraph(object): ...@@ -1597,11 +1697,29 @@ class IrGraph(object):
return self.graph.create_op_node(op_desc) return self.graph.create_op_node(op_desc)
def create_op_node_from_desc(self, op_desc): def create_op_node_from_desc(self, op_desc):
"""
Create a operator node by using an existing OpDesc in the graph.
Args:
op_desc(core.OpDesc): the giving operator description.
Returns:
core.Node: the created operator node.
"""
return self.graph.create_op_node(op_desc) return self.graph.create_op_node(op_desc)
def update_input_link(self, old_input_node, new_input_node, op_node): def update_input_link(self, old_input_node, new_input_node, op_node):
assert old_input_node in self.graph.nodes() and new_input_node in self.graph.nodes() and \ """
op_node in self.graph.nodes(), 'Th three arguments must be in the graph nodes.' Update the input's link of a operator node.
Args:
old_input_node(core.Node): the old input node of the giving op_node.
new_input_node(core.Node): the new input node of the giving op_node.
op_node(core.Node): the operator node that is needed to update input's link.
"""
assert old_input_node in self.graph.nodes() and new_input_node in \
self.graph.nodes() and op_node in self.graph.nodes(), \
'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
old_input_node.outputs_remove(op_node) old_input_node.outputs_remove(op_node)
op_node.inputs_remove(old_input_node) op_node.inputs_remove(old_input_node)
new_input_node.outputs_append(op_node) new_input_node.outputs_append(op_node)
...@@ -1609,17 +1727,85 @@ class IrGraph(object): ...@@ -1609,17 +1727,85 @@ class IrGraph(object):
op_node.op()._rename_input(old_input_node.name(), new_input_node.name()) op_node.op()._rename_input(old_input_node.name(), new_input_node.name())
def link_to(self, node_in, node_out): def link_to(self, node_in, node_out):
"""
Connect two nodes.
Args:
node_in(core.Node): the input node.
node_out(core.Node): the output node.
"""
assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \ assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
'Th two arguments must be in the graph nodes.' 'The two arguments(node_in&node_out) must be in the graph nodes.'
node_in.outputs_append(node_out) node_in.outputs_append(node_out)
node_out.inputs_append(node_in) node_out.inputs_append(node_in)
def safe_remove_nodes(self, remove_nodes): def safe_remove_nodes(self, remove_nodes):
"""
Remove nodes safely since links connected to these removed nodes are
also removed.
Args:
remove_nodes(set): the nodes prepared to be removed.
"""
if not isinstance(remove_nodes, set): if not isinstance(remove_nodes, set):
remove_nodes = set(remove_nodes) if isinstance(remove_nodes, Iterable):
remove_nodes = set(remove_nodes)
else:
remove_nodes = {remove_nodes}
core.graph_safe_remove_nodes(self.graph, remove_nodes) core.graph_safe_remove_nodes(self.graph, remove_nodes)
def draw(self, save_path, name, marked_nodes=None): def has_circle(self):
"""
Check if the graph has a circle.
Returns:
bool: True if the graph has a circle else False.
"""
return core.has_circle(self.graph)
def graph_num(self):
    """
    Report how many unconnected sub-graphs this graph consists of.

    The count is delegated to `core.graph_num`.

    Returns:
        int: the number of unconnected graphs.
    """
    sub_graph_count = core.graph_num(self.graph)
    return sub_graph_count
def topology_sort(self):
    """
    Sort the graph topologically by delegating to `core.topology_sort`.

    Notes: the `graph` cannot contain a circle.

    Returns:
        set(core.Node): nodes in topology order.
    """
    ordered_nodes = core.topology_sort(self.graph)
    return ordered_nodes
def build_adjacency_list(self):
    """
    Produce the adjacency list of operations for the `graph` by
    delegating to `core.build_adjacency_list`.

    Returns:
        dict{core.Node: set(core.Node)}: the adjacency list.
    """
    adjacency = core.build_adjacency_list(self.graph)
    return adjacency
def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True):
"""
Draw the graph. If `dot` command is installed, the drawn graph
will be saved as pdf file type, otherwise dot file type is used.
Args:
save_path(str): the save path of drawn graph.
name(str): the name of drawn graph.
marked_nodes(set(core.Node)): nodes that are needed to be marked.
Default value is None.
remove_ctr_var(bool): If it is set True, all control variable nodes
in the graph will be removed. Default value is True.
"""
def _convert_to_pdf(dot_file_path): def _convert_to_pdf(dot_file_path):
pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \ exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
...@@ -1629,15 +1815,17 @@ class IrGraph(object): ...@@ -1629,15 +1815,17 @@ class IrGraph(object):
print('The {} is saved as the dot filetype.'.format( print('The {} is saved as the dot filetype.'.format(
dot_file_path)) dot_file_path))
remove_ctr_vars = set() if remove_ctr_var:
remove_ctr_vars = set()
for node in self.graph.nodes():
if node.is_ctrl_var():
remove_ctr_vars.add(node)
self.safe_remove_nodes(remove_ctr_vars)
ops_num = 0 ops_num = 0
for node in self.graph.nodes(): for node in self.graph.nodes():
if node.is_ctrl_var(): if node.is_op():
remove_ctr_vars.add(node)
elif node.is_op():
ops_num += 1 ops_num += 1
print('Total ops num = {}.'.format(ops_num)) print('Total ops num = {}.'.format(ops_num))
self.safe_remove_nodes(remove_ctr_vars)
if marked_nodes is not None: if marked_nodes is not None:
if not isinstance(marked_nodes, set): if not isinstance(marked_nodes, set):
marked_nodes = set(marked_nodes) marked_nodes = set(marked_nodes)
...@@ -1652,10 +1840,20 @@ class IrGraph(object): ...@@ -1652,10 +1840,20 @@ class IrGraph(object):
_convert_to_pdf(viz_dot_path) _convert_to_pdf(viz_dot_path)
def to_program(self): def to_program(self):
"""
Convert the graph into a Program.
Notes: When the graph includes backward operator nodes, the
conversion process may be failed. Usually, this function is
only used to convert a test graph.
Returns:
Program: a program converted from the graph.
"""
convert_pass = core.get_pass('graph_to_program_pass') convert_pass = core.get_pass('graph_to_program_pass')
convert_pass.set('program', Program().desc) desc = core.ProgramDesc()
convert_pass.set_not_owned('program', desc)
convert_pass.apply(self.graph) convert_pass.apply(self.graph)
desc = convert_pass.get_program('program')
program = Program._construct_from_desc(desc) program = Program._construct_from_desc(desc)
return program return program
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import collections
import contextlib import contextlib
import sys import sys
import numpy as np import numpy as np
...@@ -30,31 +31,45 @@ class Layer(core.Layer): ...@@ -30,31 +31,45 @@ class Layer(core.Layer):
def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None): def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
self._built = False self._built = False
self._dtype = dtype self._dtype = dtype
self._parameters = collections.OrderedDict()
self._sub_layers = collections.OrderedDict()
def parameters(self, include_sublayers=True):
    """Gathers the Parameters of this layer and, optionally, its sub-layers.

    This layer's own parameters come first, followed by the parameters
    of each registered sub-layer in registration order.

    Args:
        include_sublayers: If true, also include the parameters from
            sublayers.

    Returns a list of Parameters.
    """
    collected = list(self._parameters.values())
    if not include_sublayers:
        return collected
    for sub_layer in self._sub_layers.values():
        collected.extend(sub_layer.parameters(include_sublayers))
    return collected
def sublayers(self, include_sublayers=True):
"""Returns a list of sub layers.
def parameters(self): Args:
params = [] include_sublayers: If true, also include the layers from sublayers.
for key in self.__dict__.keys():
value = self.__dict__[key] Returns a list of sub layers.
if isinstance(value, framework.Parameter): """
params.append(value) ret = [l for l in self._sub_layers.values()]
elif isinstance(value, core.Layer): if include_sublayers:
params.extend(value.parameters()) for l in self._sub_layers.values():
elif isinstance(value, collections.Container): for sub_l in l.sublayers(include_sublayers):
if len(value) == 0: ret.append(sub_l)
continue return ret
if isinstance(value[0], framework.Parameter):
params.extend(value)
elif isinstance(value[0], core.Layer):
for v in value:
params.extend(v.parameters())
return params
def clear_gradients(self): def clear_gradients(self):
for p in self.parameters(): for p in self.parameters():
p._clear_gradient() p._clear_gradient()
def _build_once(self, inputs): def _build_once(self, *args):
pass pass
def __call__(self, *inputs): def __call__(self, *inputs):
...@@ -71,6 +86,66 @@ class Layer(core.Layer): ...@@ -71,6 +86,66 @@ class Layer(core.Layer):
def backward(self, *inputs): def backward(self, *inputs):
raise ValueError("Layer shouldn't implement backward") raise ValueError("Layer shouldn't implement backward")
def add_sublayer(self, name, sublayer):
    """Registers a sub Layer instance under the given name.

    The registered sublayer can then be accessed as an attribute,
    i.e. like self.name.

    Args:
        name: name of this sublayer.
        sublayer: an instance of Layer.
    Returns:
        the sublayer passed in.
    """
    assert isinstance(sublayer, core.Layer)
    # A later registration under the same name replaces the earlier one.
    self._sub_layers.update({name: sublayer})
    return sublayer
def add_parameter(self, name, parameter):
    """Adds a Parameter instance.

    Added parameter can be accessed like self.name.

    Args:
        name: name of this parameter.
        parameter: an instance of Parameter.
    Returns:
        the parameter passed in.
    """
    assert isinstance(parameter, framework.Parameter)
    # A later registration under the same name replaces the earlier one.
    self._parameters[name] = parameter
    return parameter
def __getattr__(self, name):
    """Resolves attributes kept in the parameter and sub-layer registries.

    Python calls this hook only after normal attribute lookup has failed,
    so it only has to cover names stored by `__setattr__`,
    `add_parameter` or `add_sublayer`.

    Args:
        name: the attribute name being looked up.

    Returns:
        The Parameter or sub-layer registered under `name`; parameters
        take precedence over sub-layers.

    Raises:
        AttributeError: if `name` is in neither registry. Falling through
            and returning None would make `hasattr` always true and hide
            misspelled attribute names.
    """
    # Go through __dict__ rather than self._parameters: if __init__ has not
    # run yet, the plain attribute access would re-enter this hook forever.
    params = self.__dict__.get('_parameters')
    if params is not None and name in params:
        return params[name]
    layers = self.__dict__.get('_sub_layers')
    if layers is not None and name in layers:
        return layers[name]
    raise AttributeError("'%s' object has no attribute '%s'" %
                         (type(self).__name__, name))
def __setattr__(self, name, value):
    """Routes Parameters and sub-layers into their registries.

    A `framework.Parameter` value is stored in `_parameters`, a
    `core.Layer` value in `_sub_layers`; any other value is set as a
    normal attribute.

    Raises:
        ValueError: if a Parameter or Layer is assigned before the
            matching registry exists on the instance.
    """
    if isinstance(value, framework.Parameter):
        registry_key = '_parameters'
    elif isinstance(value, core.Layer):
        registry_key = '_sub_layers'
    else:
        object.__setattr__(self, name, value)
        return

    # The registries are created in __init__; without them the value has
    # nowhere to go.
    registry = self.__dict__.get(registry_key, None)
    if registry is None:
        raise ValueError(
            "super(YourLayer, self).__init__() should be called first")
    registry[name] = value
def __delattr__(self, name):
    """Deletes a parameter, a sub-layer, or a plain attribute.

    The parameter registry is checked first, then the sub-layer registry;
    names found in neither are handed to `object.__delattr__`.
    """
    if name in self._parameters:
        del self._parameters[name]
        return
    if name in self._sub_layers:
        del self._sub_layers[name]
        return
    object.__delattr__(self, name)
class PyLayer(core.PyLayer): class PyLayer(core.PyLayer):
"""Layers composed of user-defined python codes.""" """Layers composed of user-defined python codes."""
......
...@@ -225,9 +225,6 @@ class FC(layers.Layer): ...@@ -225,9 +225,6 @@ class FC(layers.Layer):
act=act, act=act,
name=name) name=name)
def parameters(self):
return [self._w, self._b]
def _build_once(self, input): def _build_once(self, input):
input_shape = input.shape input_shape = input.shape
param_shape = [ param_shape = [
...@@ -478,9 +475,6 @@ class Embedding(layers.Layer): ...@@ -478,9 +475,6 @@ class Embedding(layers.Layer):
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False)
def parameters(self):
return [self._w]
def forward(self, input): def forward(self, input):
out = self._helper.create_variable_for_type_inference(self._dtype) out = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op( self._helper.append_op(
......
...@@ -3236,7 +3236,7 @@ def group_norm(input, ...@@ -3236,7 +3236,7 @@ def group_norm(input,
# create output # create output
mean_out = helper.create_variable(dtype=dtype, stop_gradient=True) mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
variance_out = helper.create_variable(dtype=dtype, stop_gradient=True) variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
group_norm_out = helper.create_variable(dtype) group_norm_out = helper.create_variable(dtype=dtype)
helper.append_op( helper.append_op(
type="group_norm", type="group_norm",
...@@ -5936,13 +5936,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): ...@@ -5936,13 +5936,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
than :attr:`shape`. than :attr:`shape`.
act (str): The non-linear activation to be applied to the reshaped tensor act (str): The non-linear activation to be applied to the reshaped tensor
variable. variable.
inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple inplace(bool): If ``inplace`` is `True`, the input and output of ``layers.reshape``
operators. If this flag is set :attr:`True`, reuse input are the same variable, otherwise, the input and output of
:attr:`x` to reshape, which will change the shape of ``layers.reshape`` are different variables. Note that if :attr:`x`
tensor variable :attr:`x` and might cause errors when is more than one layer's input, ``inplace`` must be :attr:`False`.
:attr:`x` is used in multiple operators. If :attr:`False`,
preserve the shape :attr:`x` and create a new output tensor
variable whose data is copied from input x but reshaped.
name (str): The name of this layer. It is optional. name (str): The name of this layer. It is optional.
Returns: Returns:
...@@ -8335,6 +8332,46 @@ def stack(x, axis=0): ...@@ -8335,6 +8332,46 @@ def stack(x, axis=0):
If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`. If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`.
If :code:`axis` is None, it would be replaced with 0. If :code:`axis` is None, it would be replaced with 0.
For Example:
.. code-block:: text
Case 1:
Input:
x[0].data = [ [1.0 , 2.0 ] ]
x[0].dims = [1, 2]
x[1].data = [ [3.0 , 4.0 ] ]
x[1].dims = [1, 2]
x[2].data = [ [5.0 , 6.0 ] ]
x[2].dims = [1, 2]
Attrs:
axis = 0
Output:
Out.data =[ [ [1.0, 2.0] ],
[ [3.0, 4.0] ],
[ [5.0, 6.0] ] ]
Out.dims = [3, 1, 2]
Case 2:
Input:
x[0].data = [ [1.0 , 2.0 ] ]
x[0].dims = [1, 2]
x[1].data = [ [3.0 , 4.0 ] ]
x[1].dims = [1, 2]
x[2].data = [ [5.0 , 6.0 ] ]
x[2].dims = [1, 2]
Attrs:
axis = 1 or axis = -2
Output:
Out.data =[ [ [1.0, 2.0],
[3.0, 4.0],
[5.0, 6.0] ] ]
Out.dims = [1, 3, 2]
Args: Args:
x (Variable|list(Variable)|tuple(Variable)): Input variables. x (Variable|list(Variable)|tuple(Variable)): Input variables.
axis (int|None): The axis along which all inputs are stacked. axis (int|None): The axis along which all inputs are stacked.
......
...@@ -567,7 +567,7 @@ def ones(shape, dtype, force_cpu=False): ...@@ -567,7 +567,7 @@ def ones(shape, dtype, force_cpu=False):
It also sets *stop_gradient* to True. It also sets *stop_gradient* to True.
Args: Args:
shape(tuple|list|None): Shape of output tensor shape(tuple|list): Shape of output tensor
dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
Returns: Returns:
...@@ -578,6 +578,10 @@ def ones(shape, dtype, force_cpu=False): ...@@ -578,6 +578,10 @@ def ones(shape, dtype, force_cpu=False):
data = fluid.layers.ones(shape=[1], dtype='int64') data = fluid.layers.ones(shape=[1], dtype='int64')
""" """
assert isinstance(shape, list) or isinstance(
shape, tuple), "The shape's type should be list or tuple."
assert reduce(lambda x, y: x * y,
shape) > 0, "The shape is invalid: %s." % (str(shape))
return fill_constant(value=1.0, **locals()) return fill_constant(value=1.0, **locals())
......
...@@ -148,6 +148,8 @@ class ParallelExecutor(object): ...@@ -148,6 +148,8 @@ class ParallelExecutor(object):
else framework.default_main_program() else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
if build_strategy.memory_optimize is None:
build_strategy.memory_optimize = False if main._is_mem_optimized else True
if build_strategy.enable_inplace is None: if build_strategy.enable_inplace is None:
build_strategy.enable_inplace = False if main._is_mem_optimized else True build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope() scope = scope if scope is not None else executor.global_scope()
......
...@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) ...@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
list(REMOVE_ITEM TEST_OPS test_imperative_optimizer) list(REMOVE_ITEM TEST_OPS test_imperative_optimizer)
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
foreach(TEST_OP ${TEST_OPS}) foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP}) py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP) endforeach(TEST_OP)
...@@ -107,6 +108,9 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE ...@@ -107,6 +108,9 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
if(NOT WIN32)
py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
endif()
if(NOT APPLE) if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
......
...@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
build_strategy.memory_optimize = use_ir_memory_optimize build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
# python memory optimization is conflict with inplace pass. # python memory optimization is conflict with inplace pass.
# Use ir graph memory optimization after inplace pass is the correct way. # Use ir graph memory optimization after inplace pass is the correct way.
build_strategy.enable_inplace = False if memory_opt else enable_inplace build_strategy.enable_inplace = False if memory_opt else enable_inplace
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
class L1(fluid.imperative.Layer):
    """Leaf layer owning two 2x2 float32 weights initialised to 0.1."""

    def __init__(self):
        super(L1, self).__init__()
        constant_attr = fluid.ParamAttr(
            initializer=fluid.initializer.Constant(value=0.1))
        self._helper = LayerHelper('MyLayer', param_attr=constant_attr)

        def make_weight():
            # Both weights share the same attr, shape and dtype.
            return self._helper.create_parameter(
                attr=self._helper.param_attr,
                shape=[2, 2],
                dtype='float32',
                is_bias=False)

        self.w1 = make_weight()
        self.w2 = make_weight()

    def forward(self):
        """Returns the sum of the two weights."""
        total = self.w1 + self.w2
        return total
class L2(fluid.imperative.Layer):
    """Layer composed of two L1 sublayers."""

    def __init__(self):
        super(L2, self).__init__()
        self.layer1 = L1()
        self.layer2 = L1()

    def forward(self):
        """Returns the sum of both sublayers' outputs."""
        first_out = self.layer1()
        second_out = self.layer2()
        return first_out + second_out
class L3(fluid.imperative.Layer):
    """Layer composed of two L2 sublayers."""

    def __init__(self):
        super(L3, self).__init__()
        self.layer1 = L2()
        self.layer2 = L2()

    def forward(self):
        """Returns the sum of both sublayers' outputs."""
        first_out = self.layer1()
        second_out = self.layer2()
        return first_out + second_out
class TestBaseLayer(unittest.TestCase):
    """Checks parameter names and forward results of nested imperative layers."""

    def test_one_level(self):
        with fluid.imperative.guard():
            layer = L1()
            result = layer()
            self.assertEqual(layer.w1.name, "MyLayer_0.w_0")
            self.assertEqual(layer.w2.name, "MyLayer_0.w_1")
            # Two weights of 0.1 each are summed.
            actual = result._numpy()
            expected = 0.2 * np.ones([2, 2])
            self.assertTrue(np.allclose(actual, expected))

    def test_three_level(self):
        with fluid.imperative.guard():
            layer = L3()
            result = layer()
            # L3 -> 2 x L2 -> 4 x L1 -> 8 weights of 0.1 each, all summed.
            actual = result._numpy()
            expected = 0.8 * np.ones([2, 2])
            self.assertTrue(np.allclose(actual, expected))
if __name__ == '__main__':
unittest.main()
...@@ -22,6 +22,9 @@ import six ...@@ -22,6 +22,9 @@ import six
import unittest import unittest
import numpy as np import numpy as np
import gc
gc.set_debug(gc.DEBUG_COLLECTABLE)
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -99,6 +102,12 @@ class TranspilerTest(unittest.TestCase): ...@@ -99,6 +102,12 @@ class TranspilerTest(unittest.TestCase):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
self.transpiler_test_impl() self.transpiler_test_impl()
# NOTE: run gc.collect to eliminate pybind side objects to
# prevent random double-deallocate when inherited in python.
del self.transpiler
del main
del startup
gc.collect()
class TestBasicModel(TranspilerTest): class TestBasicModel(TranspilerTest):
...@@ -797,6 +806,7 @@ class TestNCCL2Transpile(TranspilerTest): ...@@ -797,6 +806,7 @@ class TestNCCL2Transpile(TranspilerTest):
print([op.type for op in startup.global_block().ops]) print([op.type for op in startup.global_block().ops])
self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id") self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id")
self.assertIsNotNone(startup.global_block().vars.get("NCCLID")) self.assertIsNotNone(startup.global_block().vars.get("NCCLID"))
gc.collect()
else: else:
pass pass
......
...@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
regularization=fluid.regularizer.L2Decay(1e-6)) regularization=fluid.regularizer.L2Decay(1e-6))
return optimizer return optimizer
# NOTE(dzh):
# need to make it compatible with elewise fuse act
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
model, model,
feed_dict={"image": img, feed_dict={"image": img,
...@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda, use_cuda=use_cuda,
fuse_elewise_add_act_ops=False, fuse_elewise_add_act_ops=False,
memory_opt=False, memory_opt=False,
use_ir_memory_optimize=False,
optimizer=_optimizer) optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
model, model,
...@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda, use_cuda=use_cuda,
fuse_elewise_add_act_ops=True, fuse_elewise_add_act_ops=True,
memory_opt=False, memory_opt=False,
use_ir_memory_optimize=False,
optimizer=_optimizer) optimizer=_optimizer)
for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
......
...@@ -333,6 +333,18 @@ class TestImperative(unittest.TestCase): ...@@ -333,6 +333,18 @@ class TestImperative(unittest.TestCase):
self.assertTrue(np.allclose(dy_out, static_out)) self.assertTrue(np.allclose(dy_out, static_out))
self.assertTrue(np.allclose(dy_grad, static_grad)) self.assertTrue(np.allclose(dy_grad, static_grad))
params = mlp.parameters(True)
self.assertEqual("FC_0.w_0", params[0].name)
self.assertEqual("FC_0.b_0", params[1].name)
self.assertEqual("FC_1.w_0", params[2].name)
self.assertEqual("FC_1.b_0", params[3].name)
self.assertEqual(len(params), 4)
sublayers = mlp.sublayers(True)
self.assertEqual(mlp._fc1, sublayers[0])
self.assertEqual(mlp._fc2, sublayers[1])
self.assertEqual(len(sublayers), 2)
def test_rnn(self): def test_rnn(self):
np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0],
[10.0, 11.0, 12.0]]) [10.0, 11.0, 12.0]])
......
...@@ -33,9 +33,6 @@ class Discriminator(fluid.imperative.Layer): ...@@ -33,9 +33,6 @@ class Discriminator(fluid.imperative.Layer):
self._fc1 = FC(size=32, act='elu', name="d_fc1") self._fc1 = FC(size=32, act='elu', name="d_fc1")
self._fc2 = FC(size=1, name="d_fc2") self._fc2 = FC(size=1, name="d_fc2")
def parameters(self):
return self._fc1.parameters() + self._fc2.parameters()
def forward(self, inputs): def forward(self, inputs):
x = self._fc1(inputs) x = self._fc1(inputs)
return self._fc2(x) return self._fc2(x)
...@@ -48,10 +45,6 @@ class Generator(fluid.imperative.Layer): ...@@ -48,10 +45,6 @@ class Generator(fluid.imperative.Layer):
self._fc2 = FC(size=64, act='elu', name="g_fc2") self._fc2 = FC(size=64, act='elu', name="g_fc2")
self._fc3 = FC(size=1, name="g_fc3") self._fc3 = FC(size=1, name="g_fc3")
def parameters(self):
return self._fc1.parameters() + self._fc2.parameters(
) + self._fc3.parameters()
def forward(self, inputs): def forward(self, inputs):
x = self._fc1(inputs) x = self._fc1(inputs)
x = self._fc2(x) x = self._fc2(x)
......
...@@ -75,16 +75,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer): ...@@ -75,16 +75,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
self.hidden_array.append(pre_hidden) self.hidden_array.append(pre_hidden)
self.cell_array.append(pre_cell) self.cell_array.append(pre_cell)
def parameters(self):
parameters = list()
for param in self.weight_1_arr:
parameters.append(param)
for param in self.weight_2_arr:
parameters.append(param)
for bias in self.bias_arr:
parameters.append(bias)
return parameters
def forward(self, input_embedding, init_hidden=None, init_cell=None): def forward(self, input_embedding, init_hidden=None, init_cell=None):
res = [] res = []
for index in range(self._num_steps): for index in range(self._num_steps):
...@@ -177,12 +167,6 @@ class PtbModel(fluid.imperative.Layer): ...@@ -177,12 +167,6 @@ class PtbModel(fluid.imperative.Layer):
def _build_once(self, input, label, init_hidden, init_cell): def _build_once(self, input, label, init_hidden, init_cell):
pass pass
def parameters(self):
parameters = self.simple_lstm_rnn.parameters() + [
self.softmax_weight, self.softmax_bias
] + self.embedding.parameters()
return parameters
def forward(self, input, label, init_hidden, init_cell): def forward(self, input, label, init_hidden, init_cell):
init_h = fluid.layers.reshape( init_h = fluid.layers.reshape(
......
...@@ -21,7 +21,6 @@ import paddle ...@@ -21,7 +21,6 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid.imperative.base import to_variable from paddle.fluid.imperative.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
...@@ -173,11 +172,13 @@ class ResNet(fluid.imperative.Layer): ...@@ -173,11 +172,13 @@ class ResNet(fluid.imperative.Layer):
for block in range(len(depth)): for block in range(len(depth)):
shortcut = False shortcut = False
for i in range(depth[block]): for i in range(depth[block]):
bottleneck_block = BottleneckBlock( bottleneck_block = self.add_sublayer(
num_channels=num_channels, 'bb_%d_%d' % (block, i),
num_filters=num_filters[block], BottleneckBlock(
stride=2 if i == 0 and block != 0 else 1, num_channels=num_channels,
shortcut=shortcut) num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut))
num_channels = bottleneck_block._num_channels_out num_channels = bottleneck_block._num_channels_out
self.bottleneck_block_list.append(bottleneck_block) self.bottleneck_block_list.append(bottleneck_block)
shortcut = True shortcut = True
...@@ -223,8 +224,7 @@ class TestImperativeResnet(unittest.TestCase): ...@@ -223,8 +224,7 @@ class TestImperativeResnet(unittest.TestCase):
batch_size=batch_size) batch_size=batch_size)
dy_param_init_value = {} dy_param_init_value = {}
for param in fluid.default_main_program().global_block( for param in resnet.parameters():
).all_parameters():
dy_param_init_value[param.name] = param._numpy() dy_param_init_value[param.name] = param._numpy()
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
...@@ -247,16 +247,14 @@ class TestImperativeResnet(unittest.TestCase): ...@@ -247,16 +247,14 @@ class TestImperativeResnet(unittest.TestCase):
dy_out = avg_loss._numpy() dy_out = avg_loss._numpy()
if batch_id == 0: if batch_id == 0:
for param in fluid.default_main_program().global_block( for param in resnet.parameters():
).all_parameters():
if param.name not in dy_param_init_value: if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param._numpy() dy_param_init_value[param.name] = param._numpy()
avg_loss._backward() avg_loss._backward()
dy_grad_value = {} dy_grad_value = {}
for param in fluid.default_main_program().global_block( for param in resnet.parameters():
).all_parameters():
if not param.stop_gradient: if not param.stop_gradient:
np_array = np.array(param._ivar._grad_ivar().value() np_array = np.array(param._ivar._grad_ivar().value()
.get_tensor()) .get_tensor())
...@@ -267,8 +265,7 @@ class TestImperativeResnet(unittest.TestCase): ...@@ -267,8 +265,7 @@ class TestImperativeResnet(unittest.TestCase):
resnet.clear_gradients() resnet.clear_gradients()
dy_param_value = {} dy_param_value = {}
for param in fluid.default_main_program().global_block( for param in resnet.parameters():
).all_parameters():
dy_param_value[param.name] = param._numpy() dy_param_value[param.name] = param._numpy()
with new_program_scope(): with new_program_scope():
...@@ -349,6 +346,7 @@ class TestImperativeResnet(unittest.TestCase): ...@@ -349,6 +346,7 @@ class TestImperativeResnet(unittest.TestCase):
self.assertTrue(np.allclose(static_out, dy_out)) self.assertTrue(np.allclose(static_out, dy_out))
self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
for key, value in six.iteritems(static_param_init_value): for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key])) self.assertTrue(np.allclose(value, dy_param_init_value[key]))
self.assertTrue(np.isfinite(value.all())) self.assertTrue(np.isfinite(value.all()))
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
os.environ[
'RECORDIO_FILENAME'] = '/tmp/ir_memory_optimize_transformer.wmt16.recordio'
from test_parallel_executor_transformer import TestTransformer
from test_parallel_executor_transformer import transformer
# NOTE(dzhwinter): test collisions between different strategies.
# The eager delete tensor strategy is enabled by default.
class TestTransformerWithIR(TestTransformer):
    """Runs the transformer convergence check with each memory-optimize
    strategy in turn."""

    def test_main(self):
        # Nothing is exercised on builds without CUDA.
        if not core.is_compiled_with_cuda():
            return

        # (memory_opt, use_ir_memory_optimize) pairs: the python transpiler
        # is checked first, then IR memory optimize.
        strategy_flags = ((True, False), (False, True))
        for python_mem_opt, ir_mem_opt in strategy_flags:
            self.check_network_convergence(
                transformer,
                use_cuda=True,
                memory_opt=python_mem_opt,
                use_ir_memory_optimize=ir_mem_opt)
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册