Commit b90c08c3 authored by: lujun

merge conflict, test=release/1.4

@@ -201,7 +201,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
   ENDIF()

-  SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+  SET(PROTOBUF_REPO "https://github.com/protocolbuffers/protobuf.git")
   SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")

   ExternalProject_Add(
...
@@ -13,44 +13,160 @@
 // limitations under the License.

 #include <algorithm>
-#include <memory>
+#include <map>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>

-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/op_graph_view.h"
-#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
 namespace details {

-VarHandle* GetValidInput(const OpHandleBase* a) {
-  for (auto p : a->Inputs()) {
-    VarHandle* b = dynamic_cast<VarHandle*>(p);
-    if (b) {
-      return b;
-    }
-  }
-  return nullptr;
-}
-
-void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
-  auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+class AllReduceDepsPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles =
+        GetSortedAllReduceOps(*graph);
+
+    for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) {
+      auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      all_reduce_op_handles[i - 1]->AddOutput(dep_var);
+      all_reduce_op_handles[i]->AddInput(dep_var);
+    }
+
+    if (VLOG_IS_ON(10)) {
+      DebugString(*graph, all_reduce_op_handles);
+    }
+  }
+
+  std::vector<AllReduceOpHandle*> GetSortedAllReduceOps(
+      const ir::Graph& graph) const {
+    std::vector<AllReduceOpHandle*> all_reduce_op_handles;
+    std::unordered_map<OpHandleBase*, size_t> pending_ops;
+    std::unordered_set<OpHandleBase*> ready_ops;
+    std::unordered_set<OpHandleBase*> next_ready_ops;
+
+    auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph);
+    size_t num_of_ops = op_handles.size();
+    for (OpHandleBase* op : op_handles) {
+      size_t not_ready_vars = op->NotReadyInputSize();
+      if (not_ready_vars) {
+        pending_ops.insert({op, not_ready_vars});
+      } else {
+        ready_ops.insert(op);
+      }
+    }
+
+    GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+
+    size_t has_run_ops = ready_ops.size();
+    while (has_run_ops != num_of_ops) {
+      for (auto* op : ready_ops) {
+        for (auto& ready_var : op->Outputs()) {
+          for (auto* pend_op : ready_var->PendingOps()) {
+            auto& deps = --pending_ops[pend_op];
+            if (deps == 0) {
+              next_ready_ops.insert(pend_op);
+            }
+          }
+        }
+      }
+
+      PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle.");
+      ready_ops.clear();
+      std::swap(ready_ops, next_ready_ops);
+      GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
+      has_run_ops += ready_ops.size();
+    }
+    return all_reduce_op_handles;
+  }
+
+  void GetSortedAllReduceOps(
+      const std::unordered_set<OpHandleBase*>& ready_ops,
+      std::vector<AllReduceOpHandle*>* all_reduce_op_handles) const {
+    std::vector<AllReduceOpHandle*> current_all_reduce_op_handles;
+    for (auto& op_handle : ready_ops) {
+      auto all_reduce_op_handle = dynamic_cast<AllReduceOpHandle*>(op_handle);
+      if (all_reduce_op_handle) {
+        current_all_reduce_op_handles.emplace_back(all_reduce_op_handle);
+      }
+    }
+
+    // NOTE(zcd): For distributed training, it is important to keep the order of
+    // allReduce on each node consistent. Otherwise, hang may occur.
+    // Sort the current_all_reduce_op_handles according to the name of input.
+    sort(current_all_reduce_op_handles.begin(),
+         current_all_reduce_op_handles.end(),
+         [](const AllReduceOpHandle* left,
+            const AllReduceOpHandle* right) -> bool {
+           auto left_in_vars = DynamicCast<VarHandle>(left->Inputs());
+           auto right_in_vars = DynamicCast<VarHandle>(right->Inputs());
+           PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
+           PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
+           return left_in_vars[0]->Name() > right_in_vars[0]->Name();
+         });
+
+    all_reduce_op_handles->insert(all_reduce_op_handles->end(),
+                                  current_all_reduce_op_handles.begin(),
+                                  current_all_reduce_op_handles.end());
+  }
+
+  void DebugString(
+      const ir::Graph& graph,
+      const std::vector<AllReduceOpHandle*>& all_reduce_op_handles) const {
     // get vars order
+    std::map<int, std::vector<std::string>> vars =
+        GetSoredGradientsFromStaleProgram(graph);
+    std::stringstream out;
+    size_t grads_of_stale_program = 0;
+    out << "Get Order From kStaleProgramOpDescs: ";
+    for (auto& var : vars) {
+      out << "Order " << var.first << " [";
+      for (auto& var_name : var.second) {
+        out << var_name << ", ";
+        ++grads_of_stale_program;
+      }
+      out << "], ";
+    }
+    VLOG(10) << out.str();
+
+    std::stringstream out2;
+    out2 << "Get Order From Topological order: ";
+    for (auto& op : all_reduce_op_handles) {
+      bool find_valid_input = false;
+      for (auto& in_var : op->Inputs()) {
+        if (dynamic_cast<VarHandle*>(in_var)) {
+          out2 << in_var->Name() << ", ";
+          find_valid_input = true;
+          break;
+        }
+      }
+      PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input.");
+    }
+    VLOG(10) << out2.str();
+    if (grads_of_stale_program != all_reduce_op_handles.size()) {
+      VLOG(10)
+          << "The gradients number of stale program and graph is not equal.";
+    }
+  }
+
+  std::map<int, std::vector<std::string>> GetSoredGradientsFromStaleProgram(
+      const ir::Graph& graph) const {
+    std::map<int, std::vector<std::string>> vars;
+    auto ops = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
     int order = 0;
-  std::unordered_map<std::string, int> vars;
-  // TODO(gongwb): use graph topology sort to find the order of operators.
-  // Note that must assert topology sort is stable
-  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
     for (auto* op_desc : ops) {
       try {
         bool is_bk_op =
@@ -62,76 +178,21 @@ void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
         auto backward_vars =
             boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
                 OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
-      auto outputs = op_desc->Outputs();
-      for (auto& o_it : outputs) {
-        for (auto& v : o_it.second) {  // values
-          vars[v] = order;
-          VLOG(10) << "in all_reduce_deps_pass:" << v;
-        }
-      }
-      order++;
-    } catch (boost::bad_get e) {
-    }
-  }
+        if (backward_vars.empty()) continue;
+
+        PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+        for (size_t i = 1; i < backward_vars.size(); i += 2) {
+          vars[order].emplace_back(backward_vars[i]);
+          VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
+                  << ", " << backward_vars[i];
+        }
+        order++;
+      } catch (boost::bad_get e) {
+      }
+    }
+    return vars;
+  }
+};
-
-  std::vector<OpHandleBase*> dist_ops;
-  // get allreduce ops.
-  for (auto& op : graph_ops) {
-    // FIXME(gongwb):add broad cast.
-    if (op->Name() == "all_reduce" || op->Name() == "reduce") {
-      dist_ops.push_back(op);
-    }
-  }
-
-  VLOG(10) << "dist_ops size:" << dist_ops.size()
-           << ", outputs size:" << vars.size() << ", ops size:" << ops.size();
-
-  std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
-                                                  OpHandleBase* op2) {
-    VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
-    VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
-    PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
-                   op1->DebugString(), op2->DebugString());
-    auto l_it = vars.find(i0->name());
-    auto r_it = vars.find(i1->name());
-    PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
-                   "can't find var's name %s and %s in opdesc", i0->name(),
-                   i1->name());
-    if (l_it->second < r_it->second) return true;
-    if (l_it->second == r_it->second) {
-      return i0->name() < i1->name();
-    }
-    return false;
-  });
-
-  // add dependency.
-  auto& sorted_ops = dist_ops;
-  for (size_t i = 1; i < sorted_ops.size(); ++i) {
-    auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-    auto* pre_op = sorted_ops[i - 1];
-    auto* op = sorted_ops[i];
-    pre_op->AddOutput(dep_var);
-    op->AddInput(dep_var);
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-    VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
-             << " and " << op;
-    VLOG(10) << "pre_op:" << pre_op->DebugString()
-             << ", op:" << op->DebugString();
-  }
-}

 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
...
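Note on the hunk above: the heart of the rewritten pass is the loop in ApplyImpl that wires consecutive all-reduce handles together through dummy control-dependency variables, so that they always run in the same name-sorted order on every node. A minimal standalone sketch of that chaining idea, using hypothetical stand-in types rather than Paddle's real OpHandleBase/DummyVarHandle classes:

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

struct DepVar {};  // stand-in for DummyVarHandle: carries no data, only ordering

struct Op {        // stand-in for an op handle with input/output dependency slots
  std::string name;
  std::vector<DepVar*> inputs;
  std::vector<DepVar*> outputs;
};

// Chain ops[0] -> ops[1] -> ... by giving each op a dummy output that the next
// op consumes as an input, which forces the chain to execute sequentially.
void ChainSequentially(const std::vector<Op*>& sorted_ops,
                       std::vector<std::unique_ptr<DepVar>>* owned_deps) {
  for (size_t i = 1; i < sorted_ops.size(); ++i) {
    owned_deps->push_back(std::make_unique<DepVar>());
    DepVar* dep = owned_deps->back().get();
    sorted_ops[i - 1]->outputs.push_back(dep);
    sorted_ops[i]->inputs.push_back(dep);
  }
}

Chaining the sorted handles this way only adds ordering edges; it does not change what each all-reduce computes.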
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
// TODO(gongwb): overlap allreduce with backward computation.
class AllReduceDepsPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
@@ -28,7 +28,7 @@
 // asynchronous nccl allreduce or synchronous issue:
 // https://github.com/PaddlePaddle/Paddle/issues/15049
 DEFINE_bool(
-    sync_nccl_allreduce, false,
+    sync_nccl_allreduce, true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
    "after allreduce, this mode can get better performance in some scenarios.");
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
+  // TODO(gongwb) :polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
         paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
...
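A brief, hedged aside on the flag change above: sync_nccl_allreduce is a plain gflags boolean, so, assuming the usual gflags conventions used elsewhere in this codebase, other translation units consult it after a DECLARE_bool, and the synchronize call it guards is the one described in the flag's help string. The helper name below is illustrative only, not part of the diff:

#include <cuda_runtime.h>
#include "gflags/gflags.h"

DECLARE_bool(sync_nccl_allreduce);  // defined by the DEFINE_bool shown above

// Illustrative helper: block until the asynchronous NCCL all-reduce issued on
// `stream` has finished, but only when the flag asks for it.
void MaybeSyncAfterAllReduce(cudaStream_t stream) {
  if (FLAGS_sync_nccl_allreduce) {
    cudaStreamSynchronize(stream);
  }
}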
@@ -163,15 +163,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
           "graph_printer", new details::GraphvizSSAGraphPrinter);
     }

-    // Verify that the graph is correct for multi-device executor.
-    AppendPass("multi_devices_check_pass");
-
-    if (VLOG_IS_ON(2)) {
-      AppendPass("all_reduce_deps_pass");
-    }
-
-    if (SeqOnlyAllReduceOps(strategy_)) {
-      VLOG(10) << "Add all_reduce_deps_pass";
+    // experimental shows that the program will be faster if append
+    // all_reduce_deps_pass here.
+    if (!strategy_.enable_parallel_graph_ &&
+        (SeqOnlyAllReduceOps(strategy_) ||
+         strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) {
       AppendPass("all_reduce_deps_pass");
     }
@@ -179,6 +175,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       VLOG(10) << "Add modify_op_lock_and_record_event_pass";
       AppendPass("modify_op_lock_and_record_event_pass");
     }
+
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
   }

   // Convert graph to run on multi-devices.
...
@@ -56,6 +56,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   fetches.resize(fetch_tensors.size());
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
   std::vector<FetchOpHandle *> fetch_ops;
+  std::vector<OpHandleBase *> ready_fetch_ops;
   for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
@@ -70,8 +71,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
     auto &var_name = fetch_tensors[i];
     auto fetched_var_it = fetched_vars.find(var_name);
     PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
+                   "is not set to ParallelExecutor)",
+                   var_name);
     auto &vars = fetched_var_it->second;
@@ -88,7 +90,11 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
       op->AddInput(var);
     }
-    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
+    int dep = static_cast<int>(op->NotReadyInputSize());
+    (*op_deps)[op] = dep;
+    if (dep == 0) {
+      ready_fetch_ops.emplace_back(op);
+    }
   }

   size_t num_complete = 0;
@@ -97,7 +103,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   for (auto op : bootstrap_ops_) {
     RunOpAsync(op_deps.get(), op, complete_q);
   }
+  for (auto op : ready_fetch_ops) {
+    RunOpAsync(op_deps.get(), op, complete_q);
+  }
   while (num_complete != op_deps->size()) {
     size_t num_comp = complete_q->Pop();
     if (num_comp == -1UL) {
...
@@ -13,9 +13,9 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
@@ -44,6 +44,7 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
 }

 void FetchOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
   WaitInputVarGenerated(platform::CPUPlace());

   tensors_.resize(inputs_.size());
@@ -62,7 +63,8 @@ void FetchOpHandle::RunImpl() {
     auto &t = var->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-      TensorCopySync(t, cpu, &tensors_[i]);
+      TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]);
+      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
...
@@ -305,6 +305,12 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
     VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;

+    if (var_nodes_[in_var_name].back() != in_node) {
+      VLOG(4) << "SKIP since " << in_var_name
+              << " is also used as output by other ops";
+      continue;
+    }
+
     bool can_replace = true;
     if (in_var_name == out_var_name) {
       can_replace = false;
@@ -527,6 +533,9 @@ void GraphView::Build(ir::Graph* g) {
   };
   for (auto& node : g->Nodes()) {
     if (!node->IsOp()) continue;
+    // avoid optimize the variable used in sub-blocks
+    if (OpHasSubBlock(node->Op())) update_skip_set(node);
+
     if (node->Name() == "send") update_skip_set(node);
     if (node->Name() == "recv") update_skip_set(node);
     if (node->Name() == "prefetch") update_skip_set(node);
...
@@ -131,16 +131,7 @@ size_t NodeSize(const VarDesc& node) {
   return type_size * std::abs(size);
 }

-size_t NodeSize(ir::Node* n) {
-  VarDesc* desc = nullptr;
-  // some op do not have block pointer
-  if (n->inputs[0]->Op() != nullptr) {
-    desc = FindVarDescInBlock(n);
-  } else {
-    desc = n->Var();
-  }
-  return NodeSize(*desc);
-}
+size_t NodeSize(ir::Node* n) { return NodeSize(*(n->Var())); }

 std::string DebugStringImpl(VarDesc* var) {
   std::stringstream ss;
@@ -163,24 +154,22 @@ std::string DebugStringImpl(VarDesc* var) {
 }

 std::string DebugString(ir::Node* var) {
-  return DebugStringImpl(FindVarDescInBlock(var));
+  return DebugStringImpl(GetVarDesc(var));
 }

 // NOTE(dzh): based ir node, if a large node has been reused
 // by a small size node, then next time it appear in pool, it will
 // have the small size. Find the original node shap from blockdesc.
-VarDesc* FindVarDescInBlock(ir::Node* n) {
+VarDesc* GetVarDesc(ir::Node* n) {
   PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
-  BlockDesc* block = n->inputs[0]->Op()->Block();
-  PADDLE_ENFORCE(block->HasVar(n->Name()),
-                 string::Sprintf("Block do not has var %s", n->Name()));
-  return block->FindVar(n->Name());
+  return n->Var();
 }

 struct NodeComparator {
   bool operator()(ir::Node* lhs, ir::Node* rhs) const {
-    auto* lhs_desc = FindVarDescInBlock(lhs);
-    auto* rhs_desc = FindVarDescInBlock(rhs);
+    if (lhs->Var()->GetType() != rhs->Var()->GetType()) return false;
+    auto* lhs_desc = GetVarDesc(lhs);
+    auto* rhs_desc = GetVarDesc(rhs);
     // match data type
     if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
       return false;
@@ -204,7 +193,7 @@ void OrderedSet::Insert(ir::Node* var) {
     return;
   }

-  auto* var_desc = FindVarDescInBlock(var);
+  auto* var_desc = var->Var();
   auto var_shape = var_desc->GetShape();
   int batch_size = static_cast<int>(var_shape[0]);
@@ -212,7 +201,7 @@ void OrderedSet::Insert(ir::Node* var) {
   Iter it = nodes_.begin();
   while (it != nodes_.end()) {
     auto& prev = it->front();
-    auto* cache_desc = FindVarDescInBlock(prev);
+    auto* cache_desc = GetVarDesc(prev);
     int cache_batch_size = cache_desc->GetShape()[0];
     if ((cache_batch_size == -1 && batch_size == -1) ||
         (cache_batch_size != -1 && batch_size != -1)) {
@@ -336,10 +325,16 @@ int MinChunkSize() {
 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
   // only these types holds bulk of gpu memory
-  if (!(type == proto::VarType::LOD_TENSOR ||
-        type == proto::VarType::LOD_TENSOR_ARRAY)) {
-    return false;
-  }
+  // FIXME(liuwei1031) did not find good ways to test SELECTED_ROWS and
+  // LOD_TENSOR_ARRAY re-use logic,
+  // disable them in version 1.4
+  // if (!(type == proto::VarType::LOD_TENSOR ||
+  //       type == proto::VarType::SELECTED_ROWS ||
+  //       type == proto::VarType::LOD_TENSOR_ARRAY)) {
+  //   return false;
+  // }
+  if (type != proto::VarType::LOD_TENSOR) return false;
   // persistable variable is parameter
   if (node.Persistable()) {
     return false;
...
@@ -20,6 +20,7 @@
 #include <map>
 #include <set>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
@@ -140,11 +141,7 @@ size_t NodeSize(const VarDesc&);

 std::string DebugString(ir::Node* var);

-// NOTE(dzhwinter)
-// after node reuse, the replaced node shape is
-// different with its VarDesc. So need to find the
-// correct VarDesc in Block.
-VarDesc* FindVarDescInBlock(ir::Node* n);
+VarDesc* GetVarDesc(ir::Node* n);

 static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
   return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
...
@@ -68,7 +68,7 @@ void OpHandleBase::Run(bool use_cuda) {
       if (out_var_handle) {
         PADDLE_ENFORCE(
             platform::is_same_place(place, out_var_handle->place()),
-            "The place of input(%s) is not consistent with the "
+            "The place of output(%s) is not consistent with the "
            "place of current op(%s).",
            out_var_handle->Name(), Name());
        out_var_handle->SetGenerateEvent(events_.at(dev_id));
...
@@ -68,7 +68,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     }
     set.clear();
   };
-  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };

   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
@@ -102,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          run_all_op(op);
+          ready_ops.insert(op);
         }
       }
     }
...
@@ -84,7 +84,8 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const {
   // 1. record op nodes of different roles
   for (auto node : nodes) {
-    if (node->IsVar()) continue;
+    if (!node->IsOp()) continue;
+    PADDLE_ENFORCE(node->Op(), "must find opdesc");
     int op_role = boost::get<int>(node->Op()->GetAttr(
         framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
     if ((op_role == static_cast<int>(framework::OpRole::kForward)) ||
...
@@ -19,17 +19,14 @@ limitations under the License. */
 #include <tuple>
 #include <utility>
 #include <vector>

-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"

 #ifdef WITH_GPERFTOOLS
...
@@ -122,14 +122,14 @@ class Autograd {
       std::map<std::string, std::vector<VarBase*>> input_grads =
           ready_op->ApplyGrad();

-      for (auto it : input_grads) {
-        const std::vector<VarBase*>& ingrads = it.second;
+      for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
+        const std::vector<VarBase*>& ingrads = it->second;
         for (size_t i = 0; i < ingrads.size(); ++i) {
           if (!ingrads[i]) continue;
-          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
+          if (ready_op->input_vars_[it->first][i]->IsStopGradient()) {
             continue;
           }
-          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
+          OpBase* pre_op = ready_op->pre_ops_[it->first][i];
           if (!pre_op) continue;
           dep_counts[pre_op] -= 1;
...
-cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
-elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
+cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry)
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
...
@@ -34,6 +34,7 @@ ActivationOpConverter::ActivationOpConverter(const std::string &op_type)
 }

 void ActivationOpConverter::operator()(const framework::proto::OpDesc &op,
+                                       const framework::BlockDesc &block_desc,
                                        const framework::Scope &scope,
                                        bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -27,6 +27,7 @@ class ActivationOpConverter : public AnakinOpConverter {
   explicit ActivationOpConverter(const std::string &op_type);

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ActivationOpConverter() {}
...
@@ -29,6 +29,7 @@ namespace inference {
 namespace anakin {

 void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class BatchNormOpConverter : public AnakinOpConverter {
   BatchNormOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~BatchNormOpConverter() {}
...
@@ -29,6 +29,7 @@ namespace inference {
 namespace anakin {

 void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class ConcatOpConverter : public AnakinOpConverter {
   ConcatOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ConcatOpConverter() {}
...
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {

 void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class Conv2dOpConverter : public AnakinOpConverter {
   Conv2dOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Conv2dOpConverter() {}
...
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {

 void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::BlockDesc &block_desc,
                                          const framework::Scope &scope,
                                          bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class Conv2dFusionOpConverter : public AnakinOpConverter {
   Conv2dFusionOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Conv2dFusionOpConverter() {}
...
@@ -27,9 +27,9 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
-                                            const framework::Scope& scope,
-                                            bool test_mode) {
+void DensityPriorBoxOpConverter::operator()(
+    const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc,
+    const framework::Scope& scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("Input").front();
   auto image_name = op_desc.Input("Image").front();
...
@@ -27,6 +27,7 @@ class DensityPriorBoxOpConverter : public AnakinOpConverter {
   DensityPriorBoxOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DensityPriorBoxOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::BlockDesc &block_desc,
                                          const framework::Scope &scope,
                                          bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -27,6 +27,7 @@ class DetectionOutOpConverter : public AnakinOpConverter {
   DetectionOutOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DetectionOutOpConverter() {}
...
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {

 void DropoutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class DropoutOpConverter : public AnakinOpConverter {
   DropoutOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~DropoutOpConverter() {}
...
@@ -30,9 +30,9 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
-                                           const framework::Scope &scope,
-                                           bool test_mode) {
+void ElementwiseAddOpConverter::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
@@ -50,9 +50,9 @@ void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op,
   engine_->AddOpAttr<PTuple<float>>(op_name, "coeff", coeff);
 }

-void ElementwiseMulOpConverter::operator()(const framework::proto::OpDesc &op,
-                                           const framework::Scope &scope,
-                                           bool test_mode) {
+void ElementwiseMulOpConverter::operator()(
+    const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc,
+    const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
   PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
...
@@ -25,6 +25,7 @@ class ElementwiseAddOpConverter : public AnakinOpConverter {
   ElementwiseAddOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ElementwiseAddOpConverter() {}
@@ -37,6 +38,7 @@ class ElementwiseMulOpConverter : public AnakinOpConverter {
   ElementwiseMulOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ElementwiseMulOpConverter() {}
...
@@ -27,6 +27,7 @@ namespace inference {
 namespace anakin {

 void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class FcBaseOpConverter : public AnakinOpConverter {
   FcBaseOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~FcBaseOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class FlattenOpConverter : public AnakinOpConverter {
   FlattenOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~FlattenOpConverter() {}
...
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {

 void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class Im2SequenceConverter : public AnakinOpConverter {
   Im2SequenceConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Im2SequenceConverter() {}
...
@@ -40,8 +40,10 @@ class AnakinOpConverter {
   AnakinOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope, bool test_mode) {}
   void ConvertOp(const framework::proto::OpDesc &op,
+                 const framework::BlockDesc &block_desc,
                  const std::unordered_set<std::string> &parameters,
                  const framework::Scope &scope, AnakinNvEngine *engine,
                  bool test_mode = false) {
@@ -58,16 +60,17 @@ class AnakinOpConverter {
     }
     PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type);
     it->SetEngine(engine);
-    (*it)(op, scope, test_mode);
+    (*it)(op, block_desc, scope, test_mode);
   }

-  void ConvertBlock(const framework::proto::BlockDesc &block,
+  void ConvertBlock(framework::BlockDesc *block_desc,
                     const std::unordered_set<std::string> &parameters,
                     const framework::Scope &scope, AnakinNvEngine *engine) {
     std::unique_lock<std::mutex> lock(mutex_);
-    for (auto i = 0; i < block.ops_size(); i++) {
-      auto &op = block.ops(i);
-      ConvertOp(op, parameters, scope, engine);
+    framework::proto::BlockDesc *block = block_desc->Proto();
+    for (auto i = 0; i < block->ops_size(); i++) {
+      auto &op = block->ops(i);
+      ConvertOp(op, *block_desc, parameters, scope, engine);
     }
   }
@@ -77,9 +80,7 @@ class AnakinOpConverter {
                           const std::vector<std::string> &inputs,
                           const std::unordered_set<std::string> &parameters,
                           const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
-    framework::proto::BlockDesc *block_proto = block_desc->Proto();
-    ConvertBlock(*block_proto, parameters, *scope, engine);
+    ConvertBlock(block_desc, parameters, *scope, engine);
     engine->Freeze();
     // if the max_batch size
     int max_batch_size = engine->GetMaxBatchSize();
...
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {

 void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
+                                   const framework::BlockDesc &block_desc,
                                    const framework::Scope &scope,
                                    bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class Pool2dOpConverter : public AnakinOpConverter {
   Pool2dOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~Pool2dOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void ReluOpConverter::operator()(const framework::proto::OpDesc &op,
+                                 const framework::BlockDesc &block_desc,
                                  const framework::Scope &scope,
                                  bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -27,6 +27,7 @@ class ReluOpConverter : public AnakinOpConverter {
   ReluOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ReluOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class ReshapeOpConverter : public AnakinOpConverter {
   ReshapeOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ReshapeOpConverter() {}
...
@@ -26,6 +26,7 @@ namespace inference {
 namespace anakin {

 void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::BlockDesc &block_desc,
                                   const framework::Scope &scope,
                                   bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -27,6 +27,7 @@ class ScaleOpConverter : public AnakinOpConverter {
   ScaleOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~ScaleOpConverter() {}
...
@@ -24,6 +24,7 @@ namespace inference {
 namespace anakin {

 void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
+                                    const framework::BlockDesc &block_desc,
                                     const framework::Scope &scope,
                                     bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
@@ -32,8 +33,16 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto input = op_desc.Input("X").front();
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  auto input_var_desc = block_desc.FindVar(input);
+  PADDLE_ENFORCE(input_var_desc,
+                 "Cant find %s variable When runing Anakin Softmax converter.",
+                 input);
+  auto input_shape_in_fluid = input_var_desc->GetShape();
+  size_t input_dims = input_shape_in_fluid.size();
+
   engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", 2);
+  engine_->AddOpAttr(op_name, "axis", static_cast<int>(input_dims - 1));
 }

 }  // namespace anakin
...
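The softmax change above replaces the hard-coded Anakin attribute axis = 2 with the last dimension of the input shape recorded in the BlockDesc. A small worked example with a hypothetical 3-D input shape (not taken from the diff):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Suppose the BlockDesc records the softmax input as [batch, seq_len, classes].
  std::vector<int64_t> input_shape_in_fluid = {8, 32, 1000};
  size_t input_dims = input_shape_in_fluid.size();  // 3
  int axis = static_cast<int>(input_dims - 1);      // softmax over the last dim
  assert(axis == 2);
  return 0;
}

For a rank-2 input such as [batch, classes] the computed axis is 1, whereas the old code would still have emitted 2.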
@@ -25,6 +25,7 @@ class SoftMaxOpConverter : public AnakinOpConverter {
   SoftMaxOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SoftMaxOpConverter() {}
...
@@ -30,6 +30,7 @@ namespace inference {
 namespace anakin {

 void SplitOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::BlockDesc &block_desc,
                                   const framework::Scope &scope,
                                   bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class SplitOpConverter : public AnakinOpConverter {
   SplitOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SplitOpConverter() {}
...
@@ -31,6 +31,7 @@ namespace inference {
 namespace anakin {

 void SumOpConverter::operator()(const framework::proto::OpDesc &op,
+                                const framework::BlockDesc &block_desc,
                                 const framework::Scope &scope, bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2);
...
@@ -25,6 +25,7 @@ class SumOpConverter : public AnakinOpConverter {
   SumOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~SumOpConverter() {}
...
@@ -28,6 +28,7 @@ namespace inference {
 namespace anakin {

 void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
+                                      const framework::BlockDesc &block_desc,
                                       const framework::Scope &scope,
                                       bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
...
@@ -25,6 +25,7 @@ class TransposeOpConverter : public AnakinOpConverter {
   TransposeOpConverter() = default;

   virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::BlockDesc &block_desc,
                           const framework::Scope &scope,
                           bool test_mode) override;
   virtual ~TransposeOpConverter() {}
...
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
...@@ -112,6 +113,17 @@ class AnakinConvertValidation { ...@@ -112,6 +113,17 @@ class AnakinConvertValidation {
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec)); x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place_, ctx); RandomizeTensor(x_tensor, place_, ctx);
std::vector<int64_t> dim_vec_int64;
for (auto& ele : dim_vec) {
dim_vec_int64.push_back(static_cast<int64_t>(ele));
}
// Add var_desc to block_desc
auto* block_desc = program_desc_.MutableBlock(framework::kRootBlockIndex);
auto* var_desc = block_desc->Var(name);
var_desc->SetShape(dim_vec_int64);
} }
void SetOp(const framework::proto::OpDesc& desc) { void SetOp(const framework::proto::OpDesc& desc) {
...@@ -119,8 +131,10 @@ class AnakinConvertValidation { ...@@ -119,8 +131,10 @@ class AnakinConvertValidation {
op_desc_.reset(new framework::OpDesc(desc, nullptr)); op_desc_.reset(new framework::OpDesc(desc, nullptr));
// should init anakin engine here. // should init anakin engine here.
auto& block_desc = program_desc_.Block(framework::kRootBlockIndex);
Singleton<AnakinOpConverter>::Global().ConvertOp( Singleton<AnakinOpConverter>::Global().ConvertOp(
desc, parameters_, *scope_, engine_.get(), true /*test_mode*/); desc, block_desc, parameters_, *scope_, engine_.get(),
true /*test_mode*/);
engine_->Freeze(); engine_->Freeze();
std::map<std::string, std::vector<int>> temp_max_input_shape; std::map<std::string, std::vector<int>> temp_max_input_shape;
...@@ -194,6 +208,7 @@ class AnakinConvertValidation { ...@@ -194,6 +208,7 @@ class AnakinConvertValidation {
cudaStream_t stream_; cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_; std::unique_ptr<framework::OperatorBase> op_;
std::unique_ptr<framework::OpDesc> op_desc_; std::unique_ptr<framework::OpDesc> op_desc_;
framework::ProgramDesc program_desc_;
const std::unordered_set<std::string>& parameters_; const std::unordered_set<std::string>& parameters_;
framework::Scope* scope_; framework::Scope* scope_;
platform::CUDAPlace place_; platform::CUDAPlace place_;
......
...@@ -91,7 +91,6 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute( ...@@ -91,7 +91,6 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
" or equal to the real input shape, Please set the max " " or equal to the real input shape, Please set the max "
"input shape using EnableAnakinEngine"); "input shape using EnableAnakinEngine");
anakin_input->reshape(fluid_input_shape); anakin_input->reshape(fluid_input_shape);
::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0, ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
fluid_input_shape); fluid_input_shape);
anakin_input->copy_from(tmp_anakin_tensor); anakin_input->copy_from(tmp_anakin_tensor);
......
...@@ -168,6 +168,7 @@ struct Argument { ...@@ -168,6 +168,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape, DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
anakin_max_shape_t); anakin_max_shape_t);
DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int); DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
// Memory optimized related. // Memory optimized related.
......
...@@ -151,13 +151,20 @@ void AnakinSubgraphPass::CreateAnakinOp( ...@@ -151,13 +151,20 @@ void AnakinSubgraphPass::CreateAnakinOp(
op_desc->SetType("anakin_engine"); op_desc->SetType("anakin_engine");
std::unordered_map<std::string, std::string> output_name_map; std::unordered_map<std::string, std::string> output_name_map;
std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
graph_var_map[node->Name()] = node;
}
}
auto &subgraph_nodes = *Agent(node).subgraph(); auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate // The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph. // variables and the output variables of the subgraph.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map, &output_names_with_id, &output_names, &output_name_map,
false); graph_var_map, false);
// When anakin engine runs at the end of the operation, // When anakin engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor // output_mapping help us copy the data from the renamed ITensor
...@@ -168,13 +175,6 @@ void AnakinSubgraphPass::CreateAnakinOp( ...@@ -168,13 +175,6 @@ void AnakinSubgraphPass::CreateAnakinOp(
output_mapping.push_back(output_name_map[name]); output_mapping.push_back(output_name_map[name]);
} }
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
*vars->Add() = *node->Var()->Proto();
}
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc"); "the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty()); PADDLE_ENFORCE(!output_mapping.empty());
......
...@@ -60,6 +60,7 @@ void RenameAndGetOutputs( ...@@ -60,6 +60,7 @@ void RenameAndGetOutputs(
std::set<std::string> *output_names_with_id, std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names, std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map, std::unordered_map<std::string, std::string> *output_name_map,
const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
bool is_trt) { bool is_trt) {
//// In the normal case, the paddle-trt exists bug when runing the googlenet. //// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the // When there are more than two convolutions of 1 * 1 with the same input, the
...@@ -69,6 +70,15 @@ void RenameAndGetOutputs( ...@@ -69,6 +70,15 @@ void RenameAndGetOutputs(
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/> std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map; same_hierarchy_conv2d_num_map;
auto add_block_var = [&](const std::string &graph_arg,
const std::string &block_arg) {
auto arg_var_node = graph_var_map.find(graph_arg);
PADDLE_ENFORCE(arg_var_node != graph_var_map.end());
auto *var_t = block_desc->Var(block_arg);
var_t->SetShape(arg_var_node->second->Var()->GetShape());
var_t->SetDataType(arg_var_node->second->Var()->GetDataType());
};
for (size_t index = 0; index < block_desc->OpSize(); ++index) { for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr); framework::OpDesc op_desc(*op, nullptr);
...@@ -87,13 +97,20 @@ void RenameAndGetOutputs( ...@@ -87,13 +97,20 @@ void RenameAndGetOutputs(
auto *in_var = op->mutable_inputs(i); auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names; std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k); const std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id = const std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]); arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) { if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value); replaced_names.push_back(arg_value);
if (graph_var_map.count(arg_value)) {
add_block_var(arg_value, arg_value);
}
} else { } else {
replaced_names.push_back(arg_value_with_id); replaced_names.push_back(arg_value_with_id);
if (graph_var_map.count(arg_value)) {
add_block_var(arg_value, arg_value_with_id);
}
} }
} }
in_var->clear_arguments(); in_var->clear_arguments();
...@@ -105,7 +122,6 @@ void RenameAndGetOutputs( ...@@ -105,7 +122,6 @@ void RenameAndGetOutputs(
for (auto out_var : correspond_node->outputs) { for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id(); var2id[out_var->Name()] = out_var->id();
} }
if (op_desc.Type() == "conv2d" && is_trt) { if (op_desc.Type() == "conv2d" && is_trt) {
auto input_var_name = op_desc.Input("Input").front(); auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front(); auto filter_var_name = op_desc.Input("Filter").front();
...@@ -125,15 +141,18 @@ void RenameAndGetOutputs( ...@@ -125,15 +141,18 @@ void RenameAndGetOutputs(
same_hierarchy_conv2d_num_map[input_var_name] += 1; same_hierarchy_conv2d_num_map[input_var_name] += 1;
} }
} }
// rename for the output variables of op inside subgraph // rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) { for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names; std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) { for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k); const std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id = const std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]); arg_value + std::to_string(var2id[arg_value]);
if (graph_var_map.count(arg_value)) {
add_block_var(arg_value, arg_value_with_id);
}
if (output_names_with_id->count(arg_value_with_id)) { if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id; (*output_name_map)[arg_value] = arg_value_with_id;
} }
......
...@@ -42,6 +42,7 @@ void RenameAndGetOutputs( ...@@ -42,6 +42,7 @@ void RenameAndGetOutputs(
std::set<std::string> *output_names_with_id, std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names, std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map, std::unordered_map<std::string, std::string> *output_name_map,
const std::unordered_map<std::string, framework::ir::Node *> &graph_var_map,
bool is_trt = true); bool is_trt = true);
} // namespace analysis } // namespace analysis
......
...@@ -142,6 +142,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -142,6 +142,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
} }
std::unordered_map<std::string, std::string> output_name_map; std::unordered_map<std::string, std::string> output_name_map;
std::unordered_map<std::string, framework::ir::Node *> graph_var_map;
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
graph_var_map[node->Name()] = node;
}
}
auto &subgraph_nodes = *Agent(node).subgraph(); auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate // The following procedure is used to rename all the intermediate
...@@ -157,7 +164,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -157,7 +164,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// So we have to rename the variable in the subgraph to make sure // So we have to rename the variable in the subgraph to make sure
// it is either an OP's input or an OP's output. // it is either an OP's input or an OP's output.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map); &output_names_with_id, &output_names, &output_name_map,
graph_var_map);
// When tensorrt engine runs at the end of the operation, // When tensorrt engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor // output_mapping help us copy the data from the renamed ITensor
...@@ -168,14 +176,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -168,14 +176,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
output_mapping.push_back(output_name_map[name]); output_mapping.push_back(output_name_map[name]);
} }
PADDLE_ENFORCE(!output_mapping.empty()); PADDLE_ENFORCE(!output_mapping.empty());
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
if (node->IsVar() && node->Var()) {
*vars->Add() = *node->Var()->Proto();
}
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc"); "the block has no var-desc");
...@@ -213,7 +213,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -213,7 +213,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key); SetAttr(op_desc->Proto(), "engine_key", engine_key);
std::string trt_engine_serialized_data = ""; std::string trt_engine_serialized_data = "";
SetAttr(op_desc->Proto(), "engine_serialized_data", SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data); trt_engine_serialized_data);
......
...@@ -115,6 +115,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -115,6 +115,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_anakin_); CP_MEMBER(use_anakin_);
CP_MEMBER(anakin_max_batchsize_); CP_MEMBER(anakin_max_batchsize_);
CP_MEMBER(anakin_max_input_shape_); CP_MEMBER(anakin_max_input_shape_);
CP_MEMBER(anakin_min_subgraph_size_);
// Ir related. // Ir related.
CP_MEMBER(enable_ir_optim_); CP_MEMBER(enable_ir_optim_);
...@@ -315,6 +316,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -315,6 +316,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << specify_input_name_; ss << specify_input_name_;
ss << cpu_math_library_num_threads_; ss << cpu_math_library_num_threads_;
ss << use_anakin_; ss << use_anakin_;
ss << anakin_min_subgraph_size_;
return ss.str(); return ss.str();
} }
...@@ -386,10 +388,11 @@ void AnalysisConfig::SwitchIrDebug(int x) { ...@@ -386,10 +388,11 @@ void AnalysisConfig::SwitchIrDebug(int x) {
Update(); Update();
} }
void AnalysisConfig::EnableAnakinEngine( void AnalysisConfig::EnableAnakinEngine(
int max_batch_size, int max_batch_size, std::map<std::string, std::vector<int>> max_input_shape,
std::map<std::string, std::vector<int>> max_input_shape) { int min_subgraph_size) {
anakin_max_batchsize_ = max_batch_size; anakin_max_batchsize_ = max_batch_size;
anakin_max_input_shape_ = max_input_shape; anakin_max_input_shape_ = max_input_shape;
anakin_min_subgraph_size_ = min_subgraph_size;
use_anakin_ = true; use_anakin_ = true;
Update(); Update();
} }
......
...@@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() { ...@@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() {
if (config_.use_gpu() && config_.anakin_engine_enabled()) { if (config_.use_gpu() && config_.anakin_engine_enabled()) {
argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_); argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_); argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_);
LOG(INFO) << "Anakin subgraph engine is enabled"; LOG(INFO) << "Anakin subgraph engine is enabled";
} }
......
...@@ -151,7 +151,8 @@ struct AnalysisConfig { ...@@ -151,7 +151,8 @@ struct AnalysisConfig {
*/ */
void EnableAnakinEngine( void EnableAnakinEngine(
int max_batch_size = 1, int max_batch_size = 1,
std::map<std::string, std::vector<int>> max_input_shape = {}); std::map<std::string, std::vector<int>> max_input_shape = {},
int min_subgraph_size = 6);
/** A boolean state indicating whether the Anakin sub-graph engine is used. /** A boolean state indicating whether the Anakin sub-graph engine is used.
*/ */
...@@ -288,6 +289,7 @@ struct AnalysisConfig { ...@@ -288,6 +289,7 @@ struct AnalysisConfig {
bool use_anakin_{false}; bool use_anakin_{false};
int anakin_max_batchsize_; int anakin_max_batchsize_;
int anakin_min_subgraph_size_{6};
std::map<std::string, std::vector<int>> anakin_max_input_shape_; std::map<std::string, std::vector<int>> anakin_max_input_shape_;
std::map<std::string, std::string> engine_opt_info_; std::map<std::string, std::string> engine_opt_info_;
......
...@@ -87,6 +87,7 @@ const std::vector<std::string> kAnakinSubgraphPasses({ ...@@ -87,6 +87,7 @@ const std::vector<std::string> kAnakinSubgraphPasses({
GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"runtime_context_cache_pass", //
// "identity_scale_op_clean_pass", // // "identity_scale_op_clean_pass", //
"conv_affine_channel_fuse_pass", // "conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", //
...@@ -96,7 +97,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { ...@@ -96,7 +97,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", //
#endif // #endif //
"transpose_flatten_concat_fuse_pass", "transpose_flatten_concat_fuse_pass",
}); });
...@@ -117,6 +117,10 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -117,6 +117,10 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// not be damaged by smaller ones. // not be damaged by smaller ones.
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
// TODO(luotao): runtime_context_cache_pass should be located in the
// front, see https://github.com/PaddlePaddle/Paddle/issues/16609,
// will enhance this pass later.
"runtime_context_cache_pass", //
"attention_lstm_fuse_pass", // "attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", // "seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", //
...@@ -132,8 +136,6 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -132,8 +136,6 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", // "is_test_pass", //
"identity_scale_op_clean_pass", //
"runtime_context_cache_pass", //
}); });
use_gpu_ = false; use_gpu_ = false;
} }
......
...@@ -23,18 +23,11 @@ namespace analysis { ...@@ -23,18 +23,11 @@ namespace analysis {
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model); cfg->SetModel(FLAGS_infer_model);
cfg->SetProgFile("__model__");
cfg->DisableGpu(); cfg->DisableGpu();
cfg->SwitchIrOptim(); cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames(false); cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN(); cfg->EnableMKLDNN();
cfg->pass_builder()->SetPasses(
{"infer_clean_graph_pass", "mkldnn_placement_pass",
"depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
"conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
"conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
"fc_fuse_pass", "is_test_pass"});
} }
template <typename T> template <typename T>
...@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData( ...@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
std::to_string(num_images) + " is bigger than all test data size."); std::to_string(num_images) + " is bigger than all test data size.");
PaddleTensor images; PaddleTensor images;
images.name = "input"; images.name = "image";
images.shape = {num_images, 3, 224, 224}; images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32; images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224); images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels; PaddleTensor labels;
labels.name = "labels"; labels.name = "label";
labels.shape = {num_images, 1}; labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64; labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images); labels.data.Resize(sizeof(int64_t) * num_images);
...@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224; images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
TensorReader<float> image_reader(file, images_offset_in_file, TensorReader<float> image_reader(file, images_offset_in_file,
image_batch_shape, "input"); image_batch_shape, "image");
TensorReader<int64_t> label_reader(file, labels_offset_in_file, TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label"); label_batch_shape, "label");
......
# INT8 MKL-DNN quantization
This document describes how to use the Paddle inference engine to convert an FP32 model to an INT8 model, taking ResNet-50 and MobileNet-V1 as examples. We provide instructions for enabling INT8 MKL-DNN quantization in Paddle inference and report the resulting accuracy and performance of ResNet-50 and MobileNet-V1.
## 0. Install PaddlePaddle
Follow the PaddlePaddle [installation instructions](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following CMake arguments.
```bash
cmake .. -DWITH_TESTING=ON -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_SWIG_PY=OFF -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
```
Note: MKL-DNN and MKL are required.
## 1. Enable INT8 MKL-DNN quantization
For reference, please examine the code of the unit test in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
* ### Create Analysis config
INT8 quantization is one of the optimizations available in the analysis config. More information about the analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease).
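For orientation, a minimal sketch of such a config follows. It mirrors the `SetConfig` helper in the unit test; `FLAGS_infer_model` and `FLAGS_paddle_num_threads` are the test's gflags and stand in for your own model path and thread count.
```cpp
// Minimal CPU analysis config for INT8 MKL-DNN quantization (sketch only,
// adapted from the SetConfig helper in the unit test).
AnalysisConfig cfg;
cfg.SetModel(FLAGS_infer_model);    // directory containing the inference model
cfg.DisableGpu();                   // INT8 MKL-DNN quantization runs on CPU
cfg.SwitchIrOptim();                // enable IR optimization passes
cfg.SwitchSpecifyInputNames();
cfg.SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg.EnableMKLDNN();                 // MKL-DNN kernels are required for INT8
```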
* ### Create quantize config by analysis config
We enable the MKL-DNN quantization procedure by calling the appropriate method on the analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies, etc.) can be set through the quantizer config held by the analysis config. It is also necessary to specify a pre-processed warmup dataset and the desired batch size.
```cpp
// Enable MKL-DNN quantization
cfg.EnableMkldnnQuantizer();
// Use the analysis config to access the MKL-DNN quantizer config
cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
```
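The warmup data passed to `SetWarmupData` is a `std::shared_ptr<std::vector<PaddleTensor>>`; in the unit test it is produced by the `GetWarmupData` helper from the pre-processed validation binary. Purely as a hand-written sketch (assuming the model's inputs are named `image` and `label`, and that `PaddleBuf::data()` exposes the buffer to be filled):
```cpp
// Hypothetical warmup-data assembly; the unit test uses GetWarmupData() on the
// pre-processed ImageNet binary instead of filling tensors by hand.
int num_images = 100;  // warmup batch size

PaddleTensor images;
images.name = "image";
images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
// ... copy pre-processed pixel values into images.data.data() ...

PaddleTensor labels;
labels.name = "label";
labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images);
// ... copy ground-truth labels into labels.data.data() ...

auto warmup_data = std::make_shared<std::vector<PaddleTensor>>();
warmup_data->push_back(images);
warmup_data->push_back(labels);
cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(num_images);
```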
## 2. Accuracy and Performance benchmark
We provide accuracy and performance results measured on Intel(R) Xeon(R) Gold 6271 on a single core.
>**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
| Model | Dataset | FP32 Accuracy | INT8 Accuracy | Accuracy Diff |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 76.63% | 76.48% | 0.15% |
| MobileNet-V1 | Full ImageNet Val | 70.78% | 70.36% | 0.42% |
>**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
| Model | Dataset | FP32 Throughput | INT8 Throughput | Ratio(INT8/FP32) |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| ResNet-50 | Full ImageNet Val | 13.17 images/s | 49.84 images/s | 3.78 |
| MobileNet-V1 | Full ImageNet Val | 75.49 images/s | 232.38 images/s | 3.07 |
Notes:
* Measurement of accuracy requires a model which accepts two inputs: data and labels.
* Different sampling of the batch data may cause slight differences in INT8 top accuracy.
* C-API performance is better than Python API performance because of the Python overhead; this is especially noticeable for small, computationally light models.
## 3. Commands to reproduce the above accuracy and performance benchmark
* #### Full dataset (Single core)
* ##### Download full ImageNet Validation Dataset
```bash
cd /PATH/TO/PADDLE/build
python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
```
The converted data binary file is saved by default in ~/.cache/paddle/dataset/int8/download/int8_full_val.bin.
* ##### ResNet-50 full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_resnet50 --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
* ##### MobileNet-V1 full dataset benchmark
```bash
./paddle/fluid/inference/tests/api/test_analyzer_int8_mobilenet --infer_model=third_party/inference_demo/int8v2/mobilenet/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
```
...@@ -316,7 +316,8 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -316,7 +316,8 @@ void PredictionRun(PaddlePredictor *predictor,
int num_threads, int tid) { int num_threads, int tid) {
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
int iterations = inputs.size(); // process the whole dataset ... int iterations = inputs.size(); // process the whole dataset ...
if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size()) if (FLAGS_iterations > 0 &&
FLAGS_iterations < static_cast<int64_t>(inputs.size()))
iterations = iterations =
FLAGS_iterations; // ... unless the number of iterations is set FLAGS_iterations; // ... unless the number of iterations is set
outputs->resize(iterations); outputs->resize(iterations);
...@@ -329,14 +330,14 @@ void PredictionRun(PaddlePredictor *predictor, ...@@ -329,14 +330,14 @@ void PredictionRun(PaddlePredictor *predictor,
#endif #endif
if (!FLAGS_zero_copy) { if (!FLAGS_zero_copy) {
run_timer.tic(); run_timer.tic();
for (size_t i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
for (int j = 0; j < num_times; j++) { for (int j = 0; j < num_times; j++) {
predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size); predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
} }
} }
elapsed_time = run_timer.toc(); elapsed_time = run_timer.toc();
} else { } else {
for (size_t i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
run_timer.tic(); run_timer.tic();
for (int j = 0; j < num_times; j++) { for (int j = 0; j < num_times; j++) {
...@@ -366,9 +367,8 @@ void TestOneThreadPrediction( ...@@ -366,9 +367,8 @@ void TestOneThreadPrediction(
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) { std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
auto predictor = CreateTestPredictor(config, use_analysis); auto predictor = CreateTestPredictor(config, use_analysis);
PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
0); PredictionRun(predictor.get(), inputs, outputs, 1, 0);
PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
} }
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
......
...@@ -120,40 +120,8 @@ class AnakinEngineOp : public framework::OperatorBase { ...@@ -120,40 +120,8 @@ class AnakinEngineOp : public framework::OperatorBase {
inference::Singleton<inference::anakin::AnakinEngineManager>::Global() inference::Singleton<inference::anakin::AnakinEngineManager>::Global()
.Get(engine_key_); .Get(engine_key_);
} }
return anakin_engine_; return anakin_engine_;
} }
void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
AnakinNvEngineT *engine) const {
LOG(INFO) << "Prepare Anakin engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(Attr<std::string>("subgraph"));
std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping");
inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
.ConvertBlock(block_desc, param_names_, scope, engine);
engine->Freeze();
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
auto t_shape = framework::vectorize2int(t.dims());
// all input shape should be 4 dims
if (t_shape.size() == 2) {
t_shape.push_back(1);
t_shape.push_back(1);
}
engine->SetInputShape(x, t_shape);
}
engine->Optimize();
engine->InitGraph();
}
}; };
} // namespace operators } // namespace operators
......
...@@ -24,19 +24,22 @@ class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> { ...@@ -24,19 +24,22 @@ class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto rampup_begin_step = context.Attr<float>("rampup_begin_step"); auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
if (static_cast<int>(rampup_begin_step) >= 0) { if (static_cast<int>(rampup_begin_step) < 0) {
auto current_step_tensor = return;
context.Input<framework::Tensor>("current_step"); }
auto current_step_tensor = context.Input<framework::Tensor>("current_step");
auto* current_step = current_step_tensor->data<T>(); auto* current_step = current_step_tensor->data<T>();
if (static_cast<int>(*current_step) < VLOG(10) << "current_step:" << *current_step
static_cast<int>(rampup_begin_step)) { << ", rampup_begin_step:" << rampup_begin_step;
if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
VLOG(10) << "current_step:" << *current_step VLOG(10) << "current_step:" << *current_step
<< " < rampup_begin_step:" << rampup_begin_step << " < rampup_begin_step:" << rampup_begin_step
<< " so does't use dgc_clip_by_norm"; << " so does't use dgc_clip_by_norm";
return; return;
} }
}
return ClipByNormKernel<DeviceContext, T>::Compute(context); return ClipByNormKernel<DeviceContext, T>::Compute(context);
}; };
......
...@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) { ...@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) {
TEST(JITKernel_pool, more) { TEST(JITKernel_pool, more) {
const auto& kers = jit::KernelPool::Instance().AllKernels(); const auto& kers = jit::KernelPool::Instance().AllKernels();
#if defined(__APPLE__) || defined(__OSX__) size_t target_num = 8;
EXPECT_EQ(kers.size(), 10UL);
#else #ifdef __AVX__
#ifdef PADDLE_WITH_MKLML target_num += 2;
EXPECT_EQ(kers.size(), 22UL);
#else
EXPECT_EQ(kers.size(), 8UL);
#endif #endif
#ifdef PADDLE_WITH_MKLML
target_num += 12;
#endif #endif
EXPECT_EQ(kers.size(), target_num);
} }
TEST(JITKernel_pool, refer) { TEST(JITKernel_pool, refer) {
......
...@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel { ...@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = framework::OpKernelType( framework::OpKernelType kt = framework::OpKernelType(
framework::proto::VarType::FP32, platform::CPUPlace()); framework::proto::VarType::FP32, ctx.GetPlace());
return kt; return kt;
} }
}; };
......
...@@ -75,6 +75,7 @@ std::vector<std::string> NgraphEngine::feed_vars = {}; ...@@ -75,6 +75,7 @@ std::vector<std::string> NgraphEngine::feed_vars = {};
std::vector<std::string> NgraphEngine::fetch_vars = {}; std::vector<std::string> NgraphEngine::fetch_vars = {};
framework::Variable* NgraphEngine::pre_var_ptr = nullptr; framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr; const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
bool NgraphEngine::is_training = false;
std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {}; std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
std::unordered_map<std::string, std::unordered_map<std::string,
...@@ -93,11 +94,13 @@ static std::vector<std::vector<int>> NgraphOpIntervals( ...@@ -93,11 +94,13 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
int size = ops->size(); int size = ops->size();
int left = 0; int left = 0;
while (left < size && ops->at(left)->Type() != framework::kFeedOpType && while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
ops->at(left)->Type() != "read" &&
ops->at(left)->Type() != framework::kFetchOpType) { ops->at(left)->Type() != framework::kFetchOpType) {
++left; ++left;
} }
while (left < size && ops->at(left)->Type() == framework::kFeedOpType) { while (left < size && (ops->at(left)->Type() == framework::kFeedOpType ||
ops->at(left)->Type() == "read")) {
for (auto& var_name_item : ops->at(left)->Outputs()) { for (auto& var_name_item : ops->at(left)->Outputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
NgraphEngine::feed_vars.emplace_back(var_name); NgraphEngine::feed_vars.emplace_back(var_name);
...@@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) { ...@@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
for (auto op_desc : ops_desc) { for (auto op_desc : ops_desc) {
if (op_desc->Type().find("_grad") != std::string::npos) { if (op_desc->Type().find("_grad") != std::string::npos) {
is_training = true;
this->is_test_ = false; this->is_test_ = false;
break; break;
} }
...@@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope, ...@@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope,
} }
bool is_persistable = bool is_persistable =
(p_persistables->find(vi) != p_persistables->end()) ? true : false; (p_persistables->find(vi) != p_persistables->end()) ? true : false;
if (is_test && is_persistable) { if (!is_training && is_test && is_persistable) {
ti->set_stale(false); ti->set_stale(false);
} }
(*p_t_in).emplace_back(ti); (*p_t_in).emplace_back(ti);
......
...@@ -57,6 +57,7 @@ class NgraphEngine { ...@@ -57,6 +57,7 @@ class NgraphEngine {
void Run(const framework::Scope& scope, const platform::Place& place) const; void Run(const framework::Scope& scope, const platform::Place& place) const;
static bool is_training;
static const framework::BlockDesc* p_bdesc; static const framework::BlockDesc* p_bdesc;
static std::vector<std::string> feed_vars, fetch_vars; static std::vector<std::string> feed_vars, fetch_vars;
......
...@@ -91,6 +91,10 @@ class LayerObjectHelper(LayerHelperBase): ...@@ -91,6 +91,10 @@ class LayerObjectHelper(LayerHelperBase):
Returns input, param_attr Returns input, param_attr
""" """
param_attr_in = ParamAttr._to_attr(param_attr_in)
if isinstance(param_attr_in, bool):
raise ValueError('Param_attr should not be False in {}'.format(
self.name))
inputs = inputs_in if (inputs_in is not None) else [] inputs = inputs_in if (inputs_in is not None) else []
inputs = self._multiple_input(inputs) inputs = self._multiple_input(inputs)
param_attrs = self._multiple_param_attr(len(inputs), param_attr_in) param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
......
...@@ -511,44 +511,67 @@ class FC(layers.Layer): ...@@ -511,44 +511,67 @@ class FC(layers.Layer):
self._param_attr = param_attr self._param_attr = param_attr
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._act = act self._act = act
self.__w = list()
@property
def _w(self, i=0):
return self.__w[i]
@_w.setter
def _w(self, value, i=0):
assert isinstance(value, Parameter)
self.__w[i] = value
def _build_once(self, input): def _build_once(self, input):
input_shape = input.shape i = 0
for inp, param in self._helper.iter_inputs_and_params(input,
self._param_attr):
input_shape = inp.shape
param_shape = [ param_shape = [
reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
1)
] + [self._size] ] + [self._size]
self._w = self.create_parameter( self.__w.append(
attr=self._param_attr, self.add_parameter(
'_w%d' % i,
self.create_parameter(
attr=param,
shape=param_shape, shape=param_shape,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False)))
i += 1
if self._bias_attr:
size = list([self._size]) size = list([self._size])
self._b = self.create_parameter( self._b = self.create_parameter(
attr=self._bias_attr, attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
shape=size,
dtype=self._dtype,
is_bias=True)
else:
self._b = None
def forward(self, input): def forward(self, input):
mul_results = list()
i = 0
for inp, param in self._helper.iter_inputs_and_params(input,
self._param_attr):
tmp = self._helper.create_variable_for_type_inference(self._dtype) tmp = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op( self._helper.append_op(
type="mul", type="mul",
inputs={"X": input, inputs={"X": inp,
"Y": self._w}, "Y": self.__w[i]},
outputs={"Out": tmp}, outputs={"Out": tmp},
attrs={ attrs={
"x_num_col_dims": self._num_flatten_dims, "x_num_col_dims": self._num_flatten_dims,
"y_num_col_dims": 1 "y_num_col_dims": 1
}) })
i += 1
mul_results.append(tmp)
pre_bias = self._helper.create_variable_for_type_inference(self._dtype) if len(mul_results) == 1:
pre_bias = mul_results[0]
else:
pre_bias = self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op( self._helper.append_op(
type="sum", type="sum",
inputs={"X": [tmp]}, inputs={"X": mul_results},
outputs={"Out": pre_bias}, outputs={"Out": pre_bias},
attrs={"use_mkldnn": False}) attrs={"use_mkldnn": False})
......
...@@ -493,7 +493,8 @@ class Variable(object): ...@@ -493,7 +493,8 @@ class Variable(object):
self._ivar._run_backward() self._ivar._run_backward()
def _gradient(self): def _gradient(self):
return np.array(self._ivar._grad_value()) new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
return np.array(new_ivar.value().get_tensor())
def _clear_gradient(self): def _clear_gradient(self):
self._ivar._clear_gradient() self._ivar._clear_gradient()
......
...@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): ...@@ -752,7 +752,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
force_cpu=True) force_cpu=True)
for param_var, grad_var in param_and_grads: for param_var, grad_var in param_and_grads:
var_numel = reduce(lambda x, y: x * y, param_var.shape) var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
if var_numel < 16384 or \ if var_numel < 16384 or \
param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \ grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
...@@ -832,7 +832,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): ...@@ -832,7 +832,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
type=x.type, name=name, dtype=x.dtype, persistable=False) type=x.type, name=name, dtype=x.dtype, persistable=False)
helper.append_op( helper.append_op(
type="clip_by_norm", type="dgc_clip_by_norm",
inputs={"X": x, inputs={"X": x,
"current_step": self._global_step_var}, "current_step": self._global_step_var},
attrs={ attrs={
...@@ -845,7 +845,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): ...@@ -845,7 +845,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
def _append_clip_norm(self, grad_var, clip_norm): def _append_clip_norm(self, grad_var, clip_norm):
with grad_var.block.program._backward_role_guard(): with grad_var.block.program._backward_role_guard():
return self._clip_by_norm( return self._clip_by_norm(
x=grad_var, max_norm=clip_norm, name=grad_var.name + "@DGC") x=grad_var, max_norm=clip_norm, name=grad_var.name)
def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var, def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
encoded_var): encoded_var):
......
...@@ -104,10 +104,11 @@ class ParallelExecutor(object): ...@@ -104,10 +104,11 @@ class ParallelExecutor(object):
self._scope = scope if scope is not None else executor.global_scope() self._scope = scope if scope is not None else executor.global_scope()
if main_program is not None and main_program._enable_dgc: if main_program is not None and main_program._enable_dgc:
assert num_trainers > 1, "dgc is not useful for single trainer training."
assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
assert num_trainers * len( assert num_trainers * len(
self._places) > 1, "dgc is not useful for single card training" self._places) > 1, "dgc is not useful for single card training."
assert use_cuda assert use_cuda, "dgc only used when cuda is used."
main_program = main_program if main_program is not None \ main_program = main_program if main_program is not None \
else framework.default_main_program() else framework.default_main_program()
...@@ -123,6 +124,11 @@ class ParallelExecutor(object): ...@@ -123,6 +124,11 @@ class ParallelExecutor(object):
exec_strategy=exec_strategy, exec_strategy=exec_strategy,
share_vars_from=share_vars_from._compiled_program share_vars_from=share_vars_from._compiled_program
if share_vars_from else None) if share_vars_from else None)
# FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
if main_program._enable_dgc:
self._compiled_program._build_strategy.is_distribution = True
self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
self._exe = executor.Executor(self._place) self._exe = executor.Executor(self._place)
self._compiled_program._compile(place=self._place, scope=self._scope) self._compiled_program._compile(place=self._place, scope=self._scope)
......
...@@ -139,8 +139,7 @@ class TestDistRunnerBase(object): ...@@ -139,8 +139,7 @@ class TestDistRunnerBase(object):
pass_builder = None pass_builder = None
if args.batch_merge_repeat > 1: if args.batch_merge_repeat > 1:
pass_builder = build_stra._finalize_strategy_and_create_passes() pass_builder = build_stra._finalize_strategy_and_create_passes()
mypass = pass_builder.insert_pass( mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass")
mypass.set("num_repeats", args.batch_merge_repeat) mypass.set("num_repeats", args.batch_merge_repeat)
if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer": if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
...@@ -200,8 +201,6 @@ class PtbModel(fluid.dygraph.Layer): ...@@ -200,8 +201,6 @@ class PtbModel(fluid.dygraph.Layer):
rnn_out, shape=[-1, self.num_steps, self.hidden_size]) rnn_out, shape=[-1, self.num_steps, self.hidden_size])
projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = fluid.layers.elementwise_add(projection, self.softmax_bias) projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
projection = fluid.layers.reshape(
projection, shape=[-1, self.vocab_size])
projection = fluid.layers.reshape( projection = fluid.layers.reshape(
projection, shape=[-1, self.vocab_size]) projection, shape=[-1, self.vocab_size])
loss = fluid.layers.softmax_with_cross_entropy( loss = fluid.layers.softmax_with_cross_entropy(
...@@ -223,6 +222,7 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -223,6 +222,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
num_steps = 3 num_steps = 3
init_scale = 0.1 init_scale = 0.1
batch_size = 4 batch_size = 4
batch_num = 200
with fluid.dygraph.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
...@@ -242,7 +242,6 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -242,7 +242,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
dy_loss = None dy_loss = None
last_hidden = None last_hidden = None
last_cell = None last_cell = None
batch_num = 200
for i in range(batch_num): for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64') x_data = np.arange(12).reshape(4, 3).astype('int64')
...@@ -280,9 +279,11 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -280,9 +279,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
num_steps=num_steps, num_steps=num_steps,
init_scale=init_scale) init_scale=init_scale)
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
sgd = SGDOptimizer(learning_rate=1e-3) sgd = SGDOptimizer(learning_rate=1e-3)
x = fluid.layers.data(name="x", shape=[-1, 3, 1], dtype='int64') x = fluid.layers.data(
name="x", shape=[-1, num_steps, 1], dtype='int64')
y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32') y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
init_hidden = fluid.layers.data( init_hidden = fluid.layers.data(
name="init_hidden", shape=[1], dtype='float32') name="init_hidden", shape=[1], dtype='float32')
...@@ -332,7 +333,6 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -332,7 +333,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
for k in range(3, len(out)): for k in range(3, len(out)):
static_param_updated[static_param_name_list[k - static_param_updated[static_param_name_list[k -
3]] = out[k] 3]] = out[k]
self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy())) self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy())) self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
self.assertTrue( self.assertTrue(
...@@ -340,13 +340,11 @@ class TestDygraphPtbRnn(unittest.TestCase): ...@@ -340,13 +340,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
for key, value in six.iteritems(static_param_init): for key, value in six.iteritems(static_param_init):
# print("static_init name: {}, value {}".format(key, value)) # print("static_init name: {}, value {}".format(key, value))
# print("dy_init name: {}, value {}".format(key, dy_param_init[key])) # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))
self.assertTrue(np.allclose(value, dy_param_init[key], atol=1e-5)) self.assertTrue(np.allclose(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated): for key, value in six.iteritems(static_param_updated):
# print("static name: {}, value {}".format(key, value)) # print("static name: {}, value {}".format(key, value))
# print("dy name: {}, value {}".format(key, dy_param_updated[key])) # print("dy name: {}, value {}".format(key, dy_param_updated[key]))
self.assertTrue( self.assertTrue(np.allclose(value, dy_param_updated[key]))
np.allclose(
value, dy_param_updated[key], atol=1e-5))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -302,8 +302,7 @@ use_py_reader = False ...@@ -302,8 +302,7 @@ use_py_reader = False
# if we run sync mode # if we run sync mode
sync = False sync = False
# how many batches we use batch_num = 5
batch_num = 2
np.random.seed = 1 np.random.seed = 1
src_word_np = np.random.randint( src_word_np = np.random.randint(
...@@ -335,24 +334,6 @@ lbl_word_np = np.random.randint( ...@@ -335,24 +334,6 @@ lbl_word_np = np.random.randint(
dtype='int64') dtype='int64')
lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32') lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
# np.random.seed = 1
# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# src_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
# trg_pos_np = np.random.randint(
# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
# seq_len, seq_len).astype('float32')
#
# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
#
pos_inp1 = position_encoding_init(ModelHyperParams.max_length, pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model) ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length, pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
...@@ -739,7 +720,7 @@ class DecoderSubLayer(Layer): ...@@ -739,7 +720,7 @@ class DecoderSubLayer(Layer):
enc_attn_output_pp = self._multihead_attention_layer2( enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias) pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2( enc_attn_output = self._post_process_layer2(
slf_attn_output, enc_attn_output_pp, self._postprocess_cmd, slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprcess_dropout) self._prepostprcess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output, pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd, self._preprocess_cmd,
...@@ -1076,20 +1057,17 @@ class TestDygraphTransformer(unittest.TestCase): ...@@ -1076,20 +1057,17 @@ class TestDygraphTransformer(unittest.TestCase):
4]] = out[k] 4]] = out[k]
self.assertTrue( self.assertTrue(
np.allclose(static_avg_cost_value, dy_avg_cost._numpy())) np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
self.assertTrue( self.assertTrue(
np.allclose(static_sum_cost_value, dy_sum_cost._numpy())) np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
self.assertTrue( self.assertTrue(
np.allclose( np.array_equal(static_predict_value, dy_predict._numpy()))
static_predict_value, dy_predict._numpy(), atol=1e-5))
self.assertTrue( self.assertTrue(
np.allclose(static_token_num_value, dy_token_num._numpy())) np.array_equal(static_token_num_value, dy_token_num._numpy()))
for key, value in six.iteritems(static_param_init): for key, value in six.iteritems(static_param_init):
self.assertTrue(np.allclose(value, dy_param_init[key])) self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated): for key, value in six.iteritems(static_param_updated):
self.assertTrue( self.assertTrue(np.array_equal(value, dy_param_updated[key]))
np.allclose(
value, dy_param_updated[key], atol=1e-4))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -38,7 +38,15 @@ def Lenet(data, class_dim): ...@@ -38,7 +38,15 @@ def Lenet(data, class_dim):
class TestFetchAndFeed(unittest.TestCase): class TestFetchAndFeed(unittest.TestCase):
def parallel_exe(self, use_cuda, run_parallel_exe, seed=1): @classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
def parallel_exe(self,
use_cuda,
run_parallel_exe,
use_experimental_executor=False,
seed=1):
main_program = fluid.Program() main_program = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
startup.random_seed = seed startup.random_seed = seed
...@@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase): ...@@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase):
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = False build_strategy.enable_inplace = False
build_strategy.memory_optimize = False build_strategy.memory_optimize = False
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = use_experimental_executor
train_cp = compiler.CompiledProgram(main_program).with_data_parallel( train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy) loss_name=loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
run_parallel_exe(train_cp, exe, use_cuda, data, label, loss) run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
...@@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase): ...@@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase):
if batch_id == 2: if batch_id == 2:
break break
def test_fetch(self): def test_fetch_with_threaded_executor(self):
os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.parallel_exe( self.parallel_exe(
use_cuda=True, use_cuda=True,
...@@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase): ...@@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase):
self.parallel_exe( self.parallel_exe(
use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch) use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch)
def test_fetch_with_fast_threaded_executor(self):
if core.is_compiled_with_cuda():
self.parallel_exe(
use_cuda=True,
run_parallel_exe=self.run_parallel_exe_with_fetch,
use_experimental_executor=True)
self.parallel_exe(
use_cuda=False,
run_parallel_exe=self.run_parallel_exe_with_fetch,
use_experimental_executor=True)
def test_feed(self): def test_feed(self):
os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
self.parallel_exe( self.parallel_exe(
use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed) use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed)
......