提交 f4bcee1d 编写于 作者: L luotao1

Merge branch 'develop' into anakin_test

...@@ -175,6 +175,7 @@ include(external/any) # download libn::any ...@@ -175,6 +175,7 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
include(external/cares) include(external/cares)
include(external/cub)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
if(WITH_GRPC) if(WITH_GRPC)
......
if(NOT WITH_GPU)
return()
endif()
include(ExternalProject)
set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
include_directories(${CUB_INCLUDE_DIR})
ExternalProject_Add(
extern_cub
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
GIT_TAG "v1.8.0"
PREFIX ${CUB_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(cub STATIC ${dummyfile})
else()
add_library(cub INTERFACE)
endif()
add_dependencies(cub extern_cub)
LIST(APPEND externl_project_dependencies cub)
# Operator fusion
Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. An obvious benefit is to avoid the overhead of saving the intermediate result back into global memory.
There are generally two ways to fuse operators, fusing directly connected operators and fusing non directly connected operators. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of fusing operator is according to some rules to combine multiple operations into one, for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to manually specify these rules.
## Challenge
The challenge of fusing operators is:
- how to make the rules.
- how to implement these rules efficiently.
### How to make the rules?
The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analysis the operators of the DL model, we found there are two group of operators can be fused explicitly, one is the simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`, and the other is the operators that have the same function, for example, a serials of `SGD` or `Momentum`. They usually appear in the model in a large number. So we should think about how to fuse them separately first.
### How to implement these rules efficiently?
#### How to fuse the adjacent operations efficiently?
Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient, and the cons are that it is not easy to expand, and it can only be used to express some simple operations. So taking into account our current needs, the template function is more appropriate.
#### How to fuse the operators that have the same function efficiently?
We take SGD operator as an example, the training model may have hundreds of parameters and correspondingly have the same number of SGD operators. The expression(`w = w - lr*w_g`) of those operators is the same, so during of training, the executor will execute this expression hundreds time in CPU or other specialized accelerators. If we can fuse them and make the address of all `w` and all `w_g` continuous respectively, we only need execute one time. For some accelerators, the time of launching kernel is not neglected, so the time of hundreds of times of launching and executing kernel may be larger than launching and executing only once. There usually are many operators that similar to `SGD` in the DL model, such as `AllReduce` and `FC`.
...@@ -336,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non ...@@ -336,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non
paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
......
...@@ -275,7 +275,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -275,7 +275,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
if (strategy_.gradient_scale_ != if (strategy_.gradient_scale_ !=
BuildStrategy::GradientScaleStrategy::kCustomized) { BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle? // TODO(paddle-dev): Why is there no input for this op_handle?
CreateScaleLossGradOp(&result); auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
CreateScaleLossGradOp(&result, loss_grad_name);
} }
// This assumes the backward generating code will ensure IsScaleLossOp // This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss. // is true only for the op that scale the final scalar loss.
...@@ -535,7 +536,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, ...@@ -535,7 +536,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
return got == sharded_var_device.end() ? -1 : got->second; return got == sharded_var_device.end() ? -1 : got->second;
} }
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name) const {
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle // Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -558,10 +560,10 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { ...@@ -558,10 +560,10 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
// loss->pending_ops_.emplace_back(op_handle); // loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss); // op_handle->inputs_.emplace_back(loss);
CreateOpOutput(result, op_handle, CreateOpOutput(
result->CreateEmptyNode(GradVarName(loss_var_name_), result, op_handle,
ir::Node::Type::kVariable), result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
places_[i], i); places_[i], i);
} }
} }
......
...@@ -75,7 +75,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -75,7 +75,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
void CreateComputationalOps(ir::Graph *result, ir::Node *node, void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const; size_t num_places) const;
void CreateScaleLossGradOp(ir::Graph *result) const; void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const; int dst_dev_id) const;
void CreateComputationalOp(ir::Graph *result, ir::Node *node, void CreateComputationalOp(ir::Graph *result, ir::Node *node,
......
...@@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
for (auto& op : ctx->ops_) { for (auto& op : ctx->ops_) {
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
// NOTE! Please do not delete this line, it's usefull because the debug
// string before and after op.run are different, after run the output
// will have right shape which is usefull for debug.
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: " VLOG(2) << "Memory used after operator " + op->Type() + " running: "
......
...@@ -127,7 +127,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -127,7 +127,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
} }
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(10) << "- " << DebugStringEx(&scope); VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place); PADDLE_THROW("Cannot run operator on place %s", place);
...@@ -139,7 +139,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -139,7 +139,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place)); platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place); RunImpl(scope, place);
VLOG(10) << "+ " << DebugStringEx(&scope); VLOG(3) << place << " " << DebugStringEx(&scope);
} }
bool OperatorBase::HasInputs(const std::string& name) const { bool OperatorBase::HasInputs(const std::string& name) const {
...@@ -778,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -778,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const { const ExecutionContext& ctx) const {
auto& scope = ctx.scope(); auto& scope = ctx.scope();
int data_type = -1; int data_type = -1;
std::string last_input_name;
for (auto& input : this->inputs_) { for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) { for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name); auto* var = scope.FindVar(ipt_name);
...@@ -794,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -794,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
int tmp = static_cast<int>(ToDataType(t->type())); int tmp = static_cast<int>(ToDataType(t->type()));
PADDLE_ENFORCE( PADDLE_ENFORCE(
tmp == data_type || data_type == -1, tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %d != %d", Type(), "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
data_type, tmp); Type(), last_input_name, data_type, ipt_name, tmp);
data_type = tmp; data_type = tmp;
last_input_name = ipt_name;
} }
} }
} }
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
namespace paddle { namespace paddle {
DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
"Enable subgraph to TensorRT engine for acceleration"); "Enable subgraph to TensorRT engine for acceleration");
DEFINE_string(inference_analysis_graphviz_log_root, "./", DEFINE_string(inference_analysis_graphviz_log_root, "./",
...@@ -42,10 +42,19 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -42,10 +42,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
// TODO(Superjomn) set the key with pass reprs. // TODO(Superjomn) set the key with pass reprs.
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [](const Node* node) { auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu"});
if (!node->IsFunction()) return false; if (!node->IsFunction()) return false;
return static_cast<const Function*>(node)->func_type() == "mul";
const auto* func = static_cast<const Function*>(node);
if (teller_set.count(func->func_type()))
return true;
else {
return false;
}
}; };
AddPass("tensorrt-subgraph-marker", AddPass("tensorrt-subgraph-marker",
new TensorRTSubgraphNodeMarkPass(trt_teller)); new TensorRTSubgraphNodeMarkPass(trt_teller));
AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
......
...@@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT ...@@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::vector<Node *>(outputs.begin(), outputs.end())); std::vector<Node *>(outputs.begin(), outputs.end()));
} }
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
std::vector<Node *> op_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.type() == Node::Type::kValue || node.deleted()) {
continue;
}
op_nodes.push_back(&node);
}
size_t op_num = op_nodes.size();
for (size_t i = 0; i < op_num; i++) {
if (op_nodes[i]->type() == Node::Type::kFunction) continue;
std::unordered_set<std::string> follow_up_input_names;
for (size_t j = i + 1; j < op_num; j++) {
for (auto *in : op_nodes[j]->inlinks) {
follow_up_input_names.insert(in->name());
}
}
std::vector<Node *> filtered_subgraph_outlinks;
for (auto *out : op_nodes[i]->outlinks) {
if (follow_up_input_names.count(out->name())) {
filtered_subgraph_outlinks.push_back(out);
}
}
PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
op_nodes[i]->outlinks = filtered_subgraph_outlinks;
}
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -178,6 +178,7 @@ struct GraphTraits<DataFlowGraph> { ...@@ -178,6 +178,7 @@ struct GraphTraits<DataFlowGraph> {
std::pair<std::vector<Node *>, std::vector<Node *>> std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph); // NOLINT ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph); // NOLINT
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size"); DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
namespace analysis { namespace analysis {
...@@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { ...@@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
bool DataFlowGraphToFluidPass::Finalize() { return true; } bool DataFlowGraphToFluidPass::Finalize() { return true; }
void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
FilterRedundantOutputOfSubGraph(graph);
LOG(INFO) << "graph.inputs " << graph->inputs.size(); LOG(INFO) << "graph.inputs " << graph->inputs.size();
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) { for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.deleted()) continue; if (node.deleted()) continue;
...@@ -87,34 +88,113 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { ...@@ -87,34 +88,113 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
} }
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
const framework::proto::BlockDesc &block) { framework::proto::BlockDesc *block) {
static int counter{0}; static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock()); PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc; framework::OpDesc desc;
auto *func = static_cast<FunctionBlock *>(node); auto *func = static_cast<FunctionBlock *>(node);
// collect inputs // collect inputs
std::vector<std::string> io; std::unordered_set<std::string> input_names;
for (auto *x : func->inlinks) { for (auto *x : func->inlinks) {
io.push_back(x->name()); input_names.insert(x->name());
} }
desc.SetInput("Xs", io); desc.SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
// collect outputs std::unordered_set<std::string> output_names;
io.clear();
for (auto *x : func->outlinks) { for (auto *x : func->outlinks) {
io.push_back(x->name()); output_names.insert(x->name());
} }
desc.SetOutput("Ys", io);
std::vector<std::string> output_temp(output_names.begin(),
output_names.end());
desc.SetOutput("Ys", output_temp);
desc.SetType("tensorrt_engine"); desc.SetType("tensorrt_engine");
PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc"); std::unordered_map<std::string, std::string> output_name_map;
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
// Why we do this?
// During the transition from fluid OP to tensorrt OP, we map
// the input and output Tensor(fluid data structure) of fluid OP
// to the correspondin ITensor (trt data structure) through the
// Tensor name. When we set up ITensor for an variable, we must
// ensure that it has not been set before.
// If there is variable in the fluid graph, which is not only the
// input of a OP, but also the output of a Op, there will be problems.
// So we have to rename the variable in the subgraph to make sure
// it is either an OP's input or an OP's output.
auto subgraph_nodes = func->subgraph;
for (int index = 0; index < block->ops_size(); index++) {
framework::proto::OpDesc *op = block->mutable_ops(index);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
std::unordered_map<std::string, size_t> var2id;
for (auto *in_var : correspond_node->inlinks) {
var2id[in_var->name()] = in_var->id();
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) {
std::string arg_value = in_var->arguments(k);
if (input_names.count(arg_value)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value +
std::to_string(var2id[arg_value]));
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outlinks) {
var2id[out_var->name()] = out_var->id();
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
if (output_names.count(arg_value)) {
output_name_map[arg_value] =
arg_value + std::to_string(var2id[arg_value]);
}
replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
// When tensorrt engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor
// to Tensor.
std::vector<std::string> output_mapping;
for (auto name : output_names) {
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
// Set attrs // Set attrs
SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
node->SetPbMsg(desc.Proto()->SerializeAsString()); node->SetPbMsg(desc.Proto()->SerializeAsString());
} }
...@@ -146,15 +226,17 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { ...@@ -146,15 +226,17 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
LOG(INFO) << "transformed variable size: " LOG(INFO) << "transformed variable size: "
<< block_desc.Proto()->vars().size(); << block_desc.Proto()->vars().size();
// copy ops. // copy ops.
for (auto *node : block_node->subgraph) { for (auto *node : block_node->subgraph) {
auto *op = block_desc.AppendOp(); auto *op = block_desc.AppendOp();
PADDLE_ENFORCE(!node->pb_msg().empty()); PADDLE_ENFORCE(!node->pb_msg().empty());
op->Proto()->ParseFromString(node->pb_msg()); op->Proto()->ParseFromString(node->pb_msg());
} }
*block_desc.Proto()->mutable_vars() = *block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars(); argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops(); auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
......
...@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { ...@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
for (size_t i = 0; i < graph->nodes.size(); i++) { for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i); const Node &node = graph->nodes.Get(i);
if (!config_.display_deleted_node && node.deleted()) continue; if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) { for (auto &out : node.outlinks) {
if (!config_.display_deleted_node && in->deleted()) continue; if (!config_.display_deleted_node && out->deleted()) continue;
dot.AddEdge(in->repr(), node.repr(), {}); dot.AddEdge(node.repr(), out->repr(), {});
} }
} }
return dot.Build(); return dot.Build();
......
...@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { ...@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() { std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
std::vector<Node *> marked_nodes; std::vector<Node *> marked_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) { for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
if (node.attr(kMarkerAttrName).Bool()) { if (node.attr(kMarkerAttrName).Bool()) {
marked_nodes.push_back(&node); marked_nodes.push_back(&node);
} }
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle { namespace paddle {
...@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) ...@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
if (!other.memory_owned_) {
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
} else {
Resize(other.length());
memcpy(data_, other.data(), other.length());
length_ = other.length();
memory_owned_ = true;
}
return *this;
}
PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
// only the buffer with external memory can be copied // only the buffer with external memory can be copied
assert(!other.memory_owned_);
data_ = other.data_; data_ = other.data_;
length_ = other.length_; length_ = other.length_;
memory_owned_ = other.memory_owned_; memory_owned_ = other.memory_owned_;
other.data_ = nullptr;
other.length_ = 0;
other.memory_owned_ = false;
return *this; return *this;
} }
void PaddleBuf::Resize(size_t length) { void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed. // Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return; if (length_ == length) return;
assert(memory_owned_); if (memory_owned_) {
Free(); Free();
}
data_ = new char[length]; data_ = new char[length];
length_ = length; length_ = length;
memory_owned_ = true; memory_owned_ = true;
...@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) { ...@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() { void PaddleBuf::Free() {
if (memory_owned_ && data_) { if (memory_owned_ && data_) {
assert(length_ > 0); assert(length_ > 0);
delete static_cast<char*>(data_); delete[] static_cast<char*>(data_);
data_ = nullptr; data_ = nullptr;
length_ = 0; length_ = 0;
} }
......
...@@ -40,11 +40,12 @@ class PaddleBuf { ...@@ -40,11 +40,12 @@ class PaddleBuf {
// Copy only available when memory is managed externally. // Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&); explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&); PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
// Do not own the memory. // Do not own the memory.
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Own memory. // Own memory.
explicit PaddleBuf(size_t length) PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Resize to `length` bytes.
void Resize(size_t length); void Resize(size_t length);
......
# Add TRT tests # Add TRT tests
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
activation_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry) DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......
...@@ -55,7 +55,6 @@ class OpConverter { ...@@ -55,7 +55,6 @@ class OpConverter {
it = Registry<OpConverter>::Lookup("fc"); it = Registry<OpConverter>::Lookup("fc");
} }
} }
if (op_desc.Type().find("elementwise") != std::string::npos) { if (op_desc.Type().find("elementwise") != std::string::npos) {
static std::unordered_set<std::string> add_tensor_op_set{ static std::unordered_set<std::string> add_tensor_op_set{
"add", "mul", "sub", "div", "max", "min", "pow"}; "add", "mul", "sub", "div", "max", "min", "pow"};
...@@ -72,6 +71,8 @@ class OpConverter { ...@@ -72,6 +71,8 @@ class OpConverter {
"Unsupported elementwise type" + op_type); "Unsupported elementwise type" + op_type);
it = it =
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight"); Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
} else { } else {
PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type); "Unsupported elementwise type" + op_type);
......
...@@ -534,8 +534,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -534,8 +534,8 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
const framework::Tensor& dout, int axis, const framework::Tensor& dout, int axis,
framework::Tensor* dx, framework::Tensor* dy, framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) { DX_OP dx_op, DY_OP dy_op) {
const framework::DDim x_dim = x.dims(); const framework::DDim& x_dim = x.dims();
const framework::DDim y_dim = y.dims(); const framework::DDim& y_dim = y.dims();
if (x.dims() == y.dims()) { if (x.dims() == y.dims()) {
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>( ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
...@@ -558,19 +558,19 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx, ...@@ -558,19 +558,19 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
framework::Tensor* dx, framework::Tensor* dy, framework::Tensor* dx, framework::Tensor* dy,
DX_OP dx_op, DY_OP dy_op) { DX_OP dx_op, DY_OP dy_op) {
if (dy == nullptr) { if (dy == nullptr) {
const framework::DDim dx_dims = dout.dims(); const framework::DDim& dx_dims = dout.dims();
auto dy_dims = dx_dims; auto dy_dims = dx_dims;
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>( ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else { } else {
if (dout.dims() == dy->dims()) { if (dout.dims() == dy->dims()) {
const framework::DDim dx_dims = dout.dims(); const framework::DDim& dx_dims = dout.dims();
const framework::DDim dy_dims = dy->dims(); const framework::DDim& dy_dims = dy->dims();
ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>( ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} else { // Y is a scalar } else { // Y is a scalar
auto dx_dims = dout.dims(); auto dx_dims = dout.dims();
const framework::DDim dy_dims = dy->dims(); const framework::DDim& dy_dims = dy->dims();
ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>( ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
namespace paddle {
namespace operators {
class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("X"),
"Input(X) of FusedElemwiseActivationOp op should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("Y"),
"Input(Y) of FusedElemwiseActivationOp op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FusedElemwiseActivationOp op should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
"Rank of first input must >= rank of second input.");
ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
ctx.Input<framework::Tensor>("Y")->type(),
"The element's type of input should be the same.");
auto input_data_type =
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type());
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(vector<Tensor>)");
AddInput("Y", "(vector<Tensor>)");
AddOutput("Out", "vector<Tensor>");
AddAttr<int>("axis",
"axis is used by elementwise_op, the default value is -1.")
.SetDefault(-1);
AddAttr<float>("scale",
"scale is used by scale_op, the default value is 0.0.")
.SetDefault(0.0);
AddAttr<bool>("recomputation",
"Whether to recompute the Out."
"fused_elemwise_activation_grad has two methods to get the "
"dx and dy, one "
"is to use the 'Out', and the other is not to use it. "
"The former method will save the time of recomputing the "
"'Out', but it must occupy the memory to store the 'out'. "
"While, the later method can avoid occupying the memory, "
"but it must recompute the 'Out'. The default value is true.")
.SetDefault(true);
AddAttr<std::vector<std::string>>("functor_list",
"The functors that should be fused.")
.AddCustomChecker([&](const std::vector<std::string> &functor_list) {
PADDLE_ENFORCE(ValidCheck(functor_list));
});
AddComment(R"DOC(
FusedElemwiseActivation Operator.
At present, FusedElemwiseActivation only supports Two kinds of compound
operators (elementwise_op and activation_op):
Z = Binary(X, Unary(Y))
Z = Unary(Binary(X, Y))
The attributions of activation_op can be get from fused_elemwise_activation_op's
attributions. functor_list records the functors to be fused, for example
"scale,elementwise_add".
)DOC");
}
private:
bool ValidCheck(const std::vector<std::string> &functors) {
std::unordered_set<std::string> unary_fun = {"scale", "relu"};
std::unordered_set<std::string> binary_fun = {"elementwise_add"};
std::string unary_fun_str;
if (binary_fun.count(functors[0])) {
unary_fun_str = functors[1];
} else if (binary_fun.count(functors[1])) {
unary_fun_str = functors[0];
} else {
PADDLE_THROW("%s and %s are not included in fused_list.", functors[0],
functors[1]);
}
PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1,
"%s is not included in fused_list.", unary_fun_str);
return true;
}
};
class FusedElemwiseActivationGradMaker
: public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op_desc_ptr = new framework::OpDesc();
op_desc_ptr->SetType(this->ForwardOpType() + "_grad");
for (auto &input_param : this->InputNames()) {
op_desc_ptr->SetInput(input_param, this->Input(input_param));
op_desc_ptr->SetOutput(framework::GradVarName(input_param),
this->InputGrad(input_param, true));
}
for (auto &output_param : this->OutputNames()) {
op_desc_ptr->SetInput(output_param, this->Output(output_param));
op_desc_ptr->SetInput(framework::GradVarName(output_param),
this->OutputGrad(output_param));
}
op_desc_ptr->SetAttrMap(this->Attrs());
std::vector<std::string> functor_names =
boost::get<std::vector<std::string>>(
op_desc_ptr->GetAttr("functor_list"));
functor_names[0] += "_grad";
functor_names[1] += "_grad";
op_desc_ptr->SetAttr("functor_list", functor_names);
return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
}
};
class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input.");
auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
}
if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto input_data_type_index = ctx.Input<framework::Tensor>("X")->type();
PADDLE_ENFORCE_EQ(input_data_type_index,
ctx.Input<framework::Tensor>("Y")->type(),
"The element's type of input should be the same.");
PADDLE_ENFORCE_EQ(
input_data_type_index,
ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
"The element's type of input should be the same.");
auto input_data_type = framework::ToDataType(input_data_type_index);
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp,
ops::FusedElemwiseActivationMaker,
ops::FusedElemwiseActivationGradMaker);
REGISTER_OPERATOR(fused_elemwise_activation_grad,
ops::FusedElemwiseActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
fused_elemwise_activation,
ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP_CPU_KERNEL(
fused_elemwise_activation_grad,
ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
float>,
ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fused_elemwise_activation,
ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
double>);
REGISTER_OP_CUDA_KERNEL(
fused_elemwise_activation_grad,
ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/math/functors.h"
namespace math = paddle::operators::math;
namespace paddle {
namespace operators {
// CompoundFunctors
// For example: Z = Binary(X, Unary(Y))
template <typename T, typename BinaryFun, typename UnaryFun>
struct BinaryCompoundFunctor {
BinaryCompoundFunctor(const BinaryFun &binary_fun, const UnaryFun &unary_fun)
: binary_fun_(binary_fun), unary_fun_(unary_fun) {}
inline HOSTDEVICE T operator()(T x, T y) {
return binary_fun_(x, unary_fun_(y));
}
private:
BinaryFun binary_fun_;
UnaryFun unary_fun_;
};
// For example: Z = Unary(Binary(X, Y))
template <typename T, typename UnaryFun, typename BinaryFun>
struct UnaryCompoundFunctor {
UnaryCompoundFunctor(const UnaryFun &unary_fun, const BinaryFun &binary_fun)
: unary_fun_(unary_fun), binary_fun_(binary_fun) {}
inline HOSTDEVICE T operator()(T x, T y) {
return unary_fun_(binary_fun_(x, y));
}
private:
UnaryFun unary_fun_;
BinaryFun binary_fun_;
};
// FIXME(zcd): DBinaryFun and DUnaryFun have to method to get
// the dx, one is to use the 'out', and the other is not to use it.
// the former method will save the time of recomputing the
// 'out', but it must occupy the memory to store the 'out'.
// While the later method can avoid occupying this memory,
// but it must recompute the 'out'.
template <typename T, typename DBinaryFun, typename UnaryFun,
bool Recomputation = true>
struct BinaryCompoundGradDxFunctor {
BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
const UnaryFun &unary_fun)
: d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
if (Recomputation) {
return dout * d_binary_fun_(x, unary_fun_(y));
} else {
return dout * d_binary_fun_(x, unary_fun_(y), out);
}
}
private:
DBinaryFun d_binary_fun_;
UnaryFun unary_fun_;
};
template <typename T, typename DBinaryFun, typename UnaryFun,
typename DUnaryFun, bool Recomputation = true>
struct BinaryCompoundGradDyFunctor {
BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
const UnaryFun &unary_fun,
const DUnaryFun &d_unary_fun)
: d_binary_fun_(d_binary_fun),
unary_fun_(unary_fun),
d_unary_fun_(d_unary_fun) {}
inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
if (Recomputation) {
return dout * d_binary_fun_(unary_fun_(y), x) * d_unary_fun_(y);
} else {
return dout * d_binary_fun_(unary_fun_(y), x, out) * d_unary_fun_(y);
}
}
private:
DBinaryFun d_binary_fun_;
UnaryFun unary_fun_;
DUnaryFun d_unary_fun_;
};
template <typename T, typename DUnaryFun, typename BinaryFun,
typename DBinaryFun, bool Recomputation = true>
struct UnaryCompoundGradDxFunctor {
UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
const BinaryFun &binary_fun,
const DBinaryFun &d_binary_fun)
: d_unary_fun_(d_unary_fun),
binary_fun_(binary_fun),
d_binary_fun_(d_binary_fun) {}
inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
T base;
if (Recomputation) {
base = dout * d_unary_fun_(binary_fun_(x, y));
} else {
base = dout * d_unary_fun_(binary_fun_(x, y), out);
}
return base * d_binary_fun_(x, y);
}
private:
DUnaryFun d_unary_fun_;
BinaryFun binary_fun_;
DBinaryFun d_binary_fun_;
};
template <typename T, typename DUnaryFun, typename BinaryFun,
typename DBinaryFun, bool Recomputation = true>
struct UnaryCompoundGradDyFunctor {
UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
const BinaryFun &binary_fun,
const DBinaryFun &d_binary_fun)
: d_unary_fun_(d_unary_fun),
binary_fun_(binary_fun),
d_binary_fun_(d_binary_fun) {}
inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
T base;
if (Recomputation) {
base = dout * d_unary_fun_(binary_fun_(x, y));
} else {
base = dout * d_unary_fun_(binary_fun_(x, y), out);
}
return base * d_binary_fun_(y, x);
}
private:
DUnaryFun d_unary_fun_;
BinaryFun binary_fun_;
DBinaryFun d_binary_fun_;
};
template <typename DeviceContext, typename T, typename BinaryFunctor,
typename UnaryFunctor>
static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx,
const BinaryFunctor &binary_functor,
const UnaryFunctor &unary_functor,
const framework::Tensor *in_x,
const framework::Tensor *in_y,
framework::Tensor *output) {
int axis = ctx.Attr<int>("axis");
using BinaryCompoundFunctor =
BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor>;
ElementwiseComputeEx<BinaryCompoundFunctor, DeviceContext, T>(
ctx, in_x, in_y, axis,
BinaryCompoundFunctor(binary_functor, unary_functor), output);
}
template <typename DeviceContext, typename T, typename UnaryFunctor,
typename BinaryFunctor>
static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx,
const UnaryFunctor &unary_functor,
const BinaryFunctor &binary_functor,
const framework::Tensor *in_x,
const framework::Tensor *in_y,
framework::Tensor *output) {
int axis = ctx.Attr<int>("axis");
using UnaryCompoundFunctor =
UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor>;
ElementwiseComputeEx<UnaryCompoundFunctor, DeviceContext, T>(
ctx, in_x, in_y, axis,
UnaryCompoundFunctor(unary_functor, binary_functor), output);
}
template <typename DeviceContext, typename T, typename BinaryGradFunctor,
typename UnaryFunctor, typename UnaryGradFunctor,
bool Recomputation = true>
static void RunBinaryCompoundGradFunctors(
const framework::ExecutionContext &ctx,
const BinaryGradFunctor &binary_grad_functor,
const UnaryFunctor &unary_functor,
const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x,
const framework::Tensor *in_y, const framework::Tensor *in_out,
const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
framework::Tensor *y_grad) {
int axis = ctx.Attr<int>("axis");
using BinaryCompoundDxFunctor =
BinaryCompoundGradDxFunctor<T, BinaryGradFunctor, UnaryFunctor,
Recomputation>;
using BinaryCompoundDyFunctor =
BinaryCompoundGradDyFunctor<T, BinaryGradFunctor, UnaryFunctor,
UnaryGradFunctor, Recomputation>;
ElemwiseGradCompute<DeviceContext, T, BinaryCompoundDxFunctor,
BinaryCompoundDyFunctor>(
ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
unary_grad_functor));
}
template <typename DeviceContext, typename T, typename UnaryGradFunctor,
typename BinaryFunctor, typename BinaryGradFunctor,
bool Recomputation = true>
static void RunUnaryCompoundGradFunctors(
const framework::ExecutionContext &ctx,
const UnaryGradFunctor &unary_grad_functor,
const BinaryFunctor &binary_functor,
const BinaryGradFunctor &binary_grad_functor, const framework::Tensor *in_x,
const framework::Tensor *in_y, const framework::Tensor *in_out,
const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
framework::Tensor *y_grad) {
int axis = ctx.Attr<int>("axis");
using UnaryCompoundDxFunctor =
UnaryCompoundGradDxFunctor<T, UnaryGradFunctor, BinaryFunctor,
BinaryGradFunctor, Recomputation>;
using UnaryCompoundDyFunctor =
UnaryCompoundGradDyFunctor<T, UnaryGradFunctor, BinaryFunctor,
BinaryGradFunctor, Recomputation>;
ElemwiseGradCompute<DeviceContext, T, UnaryCompoundDxFunctor,
UnaryCompoundDyFunctor>(
ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
binary_grad_functor),
UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
binary_grad_functor));
}
template <typename DeviceContext, typename T>
static void RunFunctors(const framework::ExecutionContext &ctx,
const framework::Tensor *in_x,
const framework::Tensor *in_y,
framework::Tensor *output) {
auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
auto funcs_str = functors[0] + "," + functors[1];
// TODO(zcd): The following code can be refined.
if (funcs_str == "elementwise_add,scale") {
// Z = Binary(X, Unary(Y))
T scale = static_cast<T>(ctx.Attr<float>("scale"));
RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
math::ScaleFunctor<T>>(
ctx, math::AddFunctor<T>(), math::ScaleFunctor<T>(scale), in_x, in_y,
output);
} else if (funcs_str == "scale,elementwise_add") {
// Z = Unary(Binary(X, Y))
T scale = static_cast<T>(ctx.Attr<float>("scale"));
RunUnaryCompoundFunctors<DeviceContext, T, math::ScaleFunctor<T>,
math::AddFunctor<T>>(
ctx, math::ScaleFunctor<T>(scale), math::AddFunctor<T>(), in_x, in_y,
output);
} else if (funcs_str == "elementwise_add,relu") {
RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
math::ReluFunctor<T>>(
ctx, math::AddFunctor<T>(), math::ReluFunctor<T>(), in_x, in_y, output);
} else if (funcs_str == "relu,elementwise_add") {
RunUnaryCompoundFunctors<DeviceContext, T, math::ReluFunctor<T>,
math::AddFunctor<T>>(
ctx, math::ReluFunctor<T>(), math::AddFunctor<T>(), in_x, in_y, output);
} else {
PADDLE_THROW("%s has not been implemented.", funcs_str);
}
}
template <typename DeviceContext, typename T>
static void RunGradFunctors(const framework::ExecutionContext &ctx,
const framework::Tensor *in_x,
const framework::Tensor *in_y,
const framework::Tensor *in_out,
const framework::Tensor *in_out_grad,
framework::Tensor *x_grad,
framework::Tensor *y_grad) {
auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
auto funcs_str = functors[0] + "," + functors[1];
bool recomputation = ctx.Attr<bool>("recomputation");
// TODO(zcd): The following code can be refined. for example, use registion
if (funcs_str == "elementwise_add_grad,scale_grad") {
// The backward of Z = Binary(X, Unary(Y))
T scale = static_cast<T>(ctx.Attr<float>("scale"));
if (recomputation) {
RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
math::ScaleFunctor<T>,
math::ScaleGradFunctor<T>, true>(
ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
x_grad, y_grad);
} else {
RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
math::ScaleFunctor<T>,
math::ScaleGradFunctor<T>, false>(
ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
x_grad, y_grad);
}
} else if (funcs_str == "scale_grad,elementwise_add_grad") {
// The backward of Z = Unary(Binary(X, Y))
T scale = static_cast<T>(ctx.Attr<float>("scale"));
if (recomputation) {
RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
math::AddFunctor<T>, math::AddGradFunctor<T>,
true>(ctx, math::ScaleGradFunctor<T>(scale),
math::AddFunctor<T>(),
math::AddGradFunctor<T>(), in_x, in_y,
in_out, in_out_grad, x_grad, y_grad);
} else {
RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
math::AddFunctor<T>, math::AddGradFunctor<T>,
false>(ctx, math::ScaleGradFunctor<T>(scale),
math::AddFunctor<T>(),
math::AddGradFunctor<T>(), in_x, in_y,
in_out, in_out_grad, x_grad, y_grad);
}
} else if (funcs_str == "elementwise_add_grad,relu_grad") {
if (recomputation) {
RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
math::ReluFunctor<T>,
math::ReluGradFunctor<T>, true>(
ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
y_grad);
} else {
RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
math::ReluFunctor<T>,
math::ReluGradFunctor<T>, false>(
ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
y_grad);
}
} else if (funcs_str == "relu_grad,elementwise_add_grad") {
if (recomputation) {
RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
math::AddFunctor<T>, math::AddGradFunctor<T>,
true>(ctx, math::ReluGradFunctor<T>(),
math::AddFunctor<T>(),
math::AddGradFunctor<T>(), in_x, in_y,
in_out, in_out_grad, x_grad, y_grad);
} else {
RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
math::AddFunctor<T>, math::AddGradFunctor<T>,
false>(ctx, math::ReluGradFunctor<T>(),
math::AddFunctor<T>(),
math::AddGradFunctor<T>(), in_x, in_y,
in_out, in_out_grad, x_grad, y_grad);
}
} else {
PADDLE_THROW("%s has not been implemented.", funcs_str);
}
}
template <typename DeviceContext, typename T>
class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
"Cannot get input tensor %s, variable name = %s",
"X", ctx.op().Input("X"));
auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
"Cannot get input tensor %s, variable name = %s",
"Y", ctx.op().Input("Y"));
auto &output = detail::Ref(ctx.Output<framework::Tensor>("Out"),
"Cannot get input tensor %s, variable name = %s",
"Out", ctx.op().Output("Out"));
RunFunctors<DeviceContext, T>(ctx, &in_x, &in_y, &output);
}
};
template <typename DeviceContext, typename T>
class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
"Cannot get input tensor %s, variable name = %s",
"X", ctx.op().Input("X"));
auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
"Cannot get input tensor %s, variable name = %s",
"Y", ctx.op().Input("Y"));
auto &in_out = detail::Ref(ctx.Input<framework::Tensor>("Out"),
"Cannot get input tensor %s, variable name = %s",
"Out", ctx.op().Input("Out"));
auto &in_out_grad =
detail::Ref(ctx.Input<framework::Tensor>(framework::GradVarName("Out")),
"Cannot get input tensor %s, variable name = %s",
framework::GradVarName("Out"),
ctx.op().Input(framework::GradVarName("Out")));
framework::Tensor *x_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("X"));
framework::Tensor *y_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
RunGradFunctors<DeviceContext, T>(ctx, &in_x, &in_y, &in_out, &in_out_grad,
x_grad, y_grad);
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace operators {
namespace math {
// AddFunctor
template <typename T>
struct AddFunctor {
// out = x + y;
inline HOSTDEVICE T operator()(T x, T y) { return x + y; }
};
template <typename T>
struct AddGradFunctor {
inline HOSTDEVICE T operator()(T x, T y) { return 1; }
inline HOSTDEVICE T operator()(T x, T y, T out) const { return 1; }
};
template <typename T>
struct ScaleFunctor {
explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
private:
T coeff_;
};
template <typename T>
struct ScaleGradFunctor {
explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}
inline HOSTDEVICE T operator()(T x) { return coeff_; }
inline HOSTDEVICE T operator()(T x, T out) { return coeff_; }
private:
T coeff_;
};
template <typename T>
struct ReluFunctor {
inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
};
template <typename T>
struct ReluGradFunctor {
inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; }
inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; }
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -163,12 +163,11 @@ class ParallelDoOp : public framework::OperatorBase { ...@@ -163,12 +163,11 @@ class ParallelDoOp : public framework::OperatorBase {
auto &place = places[place_idx]; auto &place = places[place_idx];
auto *cur_scope = sub_scopes[place_idx]; auto *cur_scope = sub_scopes[place_idx];
workers.emplace_back( workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Async([program, cur_scope, place, block, place_idx] { framework::Executor executor(place);
framework::Executor executor(place); executor.Run(*program, cur_scope, block->ID(),
executor.Run(*program, cur_scope, block->ID(), false /*create_local_scope*/);
false /*create_local_scope*/); }));
}));
} }
for (auto &worker : workers) { for (auto &worker : workers) {
worker.wait(); worker.wait();
...@@ -239,12 +238,11 @@ class ParallelDoGradOp : public framework::OperatorBase { ...@@ -239,12 +238,11 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto *cur_scope = sub_scopes[i]; auto *cur_scope = sub_scopes[i];
// execute // execute
workers.emplace_back( workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Async([program, cur_scope, place, block, i] { framework::Executor executor(place);
framework::Executor executor(place); executor.Run(*program, cur_scope, block->ID(),
executor.Run(*program, cur_scope, block->ID(), false /*create_local_scope*/);
false /*create_local_scope*/); }));
}));
} }
for (auto &worker : workers) { for (auto &worker : workers) {
worker.wait(); worker.wait();
......
...@@ -55,18 +55,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) { ...@@ -55,18 +55,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
"TensorRT' tensor input requires at least 2 dimensions"); "TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL, PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions"); "TensorRT' tensor input requires at most 4 dimensions");
PADDLE_ENFORCE_EQ(shape.size(), 4UL);
switch (shape.size()) { return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
case 2:
return nvinfer1::Dims2(1, shape[1]);
case 3:
return nvinfer1::Dims3(1, shape[1], shape[2]);
case 4:
return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]);
default:
return nvinfer1::Dims();
}
return nvinfer1::Dims();
} }
} // namespace } // namespace
...@@ -86,6 +76,9 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare( ...@@ -86,6 +76,9 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
parameters.insert(param); parameters.insert(param);
} }
std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping");
// TODO(Superjomn) replace this with a different stream // TODO(Superjomn) replace this with a different stream
auto *engine = Singleton<TRT_EngineManager>::Global().Create( auto *engine = Singleton<TRT_EngineManager>::Global().Create(
max_batch, max_workspace, nullptr /*engine hold its own stream*/, max_batch, max_workspace, nullptr /*engine hold its own stream*/,
...@@ -97,6 +90,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare( ...@@ -97,6 +90,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
// Add inputs // Add inputs
VLOG(4) << "declare inputs"; VLOG(4) << "declare inputs";
for (auto &input : context.Inputs("Xs")) { for (auto &input : context.Inputs("Xs")) {
if (parameters.count(input)) continue;
VLOG(4) << "declare input " << input; VLOG(4) << "declare input " << input;
auto *var = block.FindVar(input); auto *var = block.FindVar(input);
// TensorRT engine need to create parameters. The parameter's description // TensorRT engine need to create parameters. The parameter's description
...@@ -122,7 +116,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare( ...@@ -122,7 +116,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
block_desc, parameters, context.scope(), engine); block_desc, parameters, context.scope(), engine);
// Add outputs // Add outputs
for (auto &output : context.Outputs("Ys")) { for (auto &output : output_maps) {
engine->DeclareOutput(output); engine->DeclareOutput(output);
} }
......
...@@ -66,8 +66,17 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -66,8 +66,17 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
context.Attr<int>("max_batch")); context.Attr<int>("max_batch"));
std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping");
auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters;
for (const auto& param : params) {
parameters.insert(param);
}
// Convert input tensor from fluid to engine. // Convert input tensor from fluid to engine.
for (const auto& x : context.Inputs("Xs")) { for (const auto& x : context.Inputs("Xs")) {
if (parameters.count(x)) continue;
// convert input and copy to TRT engine's buffer // convert input and copy to TRT engine's buffer
auto& t = inference::analysis::GetFromScope<framework::LoDTensor>( auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
context.scope(), x); context.scope(), x);
...@@ -82,10 +91,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -82,10 +91,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Execute the engine. // Execute the engine.
PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
engine->Execute(FLAGS_tensorrt_engine_batch_size); engine->Execute(FLAGS_tensorrt_engine_batch_size);
// Convert output tensor from engine to fluid // Convert output tensor from engine to fluid
int output_index = 0;
for (const auto& y : context.Outputs("Ys")) { for (const auto& y : context.Outputs("Ys")) {
// convert output and copy to fluid. // convert output and copy to fluid.
nvinfer1::ITensor* trt_t = engine->GetITensor(y); nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
auto dims = trt_t->getDimensions(); auto dims = trt_t->getDimensions();
// Use the output ITensor's dims to reshape the Fluid Tensor. // Use the output ITensor's dims to reshape the Fluid Tensor.
std::vector<int> ddim(dims.d, dims.d + dims.nbDims); std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
...@@ -102,7 +113,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -102,7 +113,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// TODO(Superjomn) change this float to dtype size. // TODO(Superjomn) change this float to dtype size.
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
FLAGS_tensorrt_engine_batch_size; FLAGS_tensorrt_engine_batch_size;
engine->GetOutputInCPU(y, engine->GetOutputInCPU(output_maps[output_index],
fluid_t->mutable_data<float>(platform::CPUPlace()), fluid_t->mutable_data<float>(platform::CPUPlace()),
size * sizeof(float)); size * sizeof(float));
//} else { //} else {
...@@ -110,6 +121,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -110,6 +121,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// y, fluid_t->mutable_data<float>(platform::CUDAPlace()), // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
// size * sizeof(float)); // size * sizeof(float));
//} //}
output_index += 1;
} }
cudaStreamSynchronize(*engine->stream()); cudaStreamSynchronize(*engine->stream());
......
...@@ -103,6 +103,9 @@ TEST(TensorRTEngineOp, manual) { ...@@ -103,6 +103,9 @@ TEST(TensorRTEngineOp, manual) {
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters", SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
std::vector<std::string>({})); std::vector<std::string>({}));
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
"output_name_mapping",
std::vector<std::string>({"z0"}));
LOG(INFO) << "create engine op"; LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
...@@ -196,6 +199,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -196,6 +199,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
std::vector<std::string>({"y0", "y1", "y2", "y3"})); std::vector<std::string>({"y0", "y1", "y2", "y3"}));
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
"output_name_mapping",
std::vector<std::string>({"z3"}));
auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
// Execute them. // Execute them.
......
...@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert'] ...@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc' DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
UNK_IDX = 0 UNK_IDX = 0
......
...@@ -40,7 +40,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' ...@@ -40,7 +40,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
'wmt_shrinked_data/wmt14.tgz') 'wmt_shrinked_data/wmt14.tgz')
MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
# BLEU of this trained model is 26.92 # BLEU of this trained model is 26.92
URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
START = "<s>" START = "<s>"
......
...@@ -14,5 +14,7 @@ ...@@ -14,5 +14,7 @@
import decoder import decoder
from decoder import * from decoder import *
import memory_usage_calc
from memory_usage_calc import *
__all__ = decoder.__all__ __all__ = decoder.__all__ + memory_usage_calc.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module privides a memory usage calculate function for user.
The purpose of this API is to allow users to estimate memory usage of
a program under a special batch size, then user can set appropriate
batch size to fully utilize a GPU.
This API is still under active development and may change drastically.
"""
from .. import core
from ..framework import Program, Variable
__all__ = ['memory_usage']
dtype_to_size = {
core.VarDesc.VarType.FP16: 2,
core.VarDesc.VarType.FP32: 4,
core.VarDesc.VarType.FP64: 8,
core.VarDesc.VarType.INT16: 2,
core.VarDesc.VarType.INT32: 4,
core.VarDesc.VarType.INT64: 8,
core.VarDesc.VarType.BOOL: 1,
core.VarDesc.VarType.UINT8: 1,
}
DEBUG = False
def memory_usage(program, batch_size):
"""
Get the estimate memory usage of program with input batch size.
Args:
program(Program): The current Program.
batch_size(int): The current input data batch_size.
Returns:
min_total_memory(float): the estimate memory usage lower bound.
max_total_memory(float): the estimate memory usage upper bound.
unit_str(string): the unit of estimate usage result.
Examples:
>>> import paddle.fluid as fluid
>>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
fluid.default_main_program(), batch_size=10)
>>> print "memory usage is about %.3f - %.3f %s" % \
(lower_usage, upper_usage, unit)
"""
# Parameters check
if not isinstance(program, Program):
raise TypeError(
"Calculating Memory Usage requires Program as its Parameter."
"But you passed in %s" % (type(prgram)))
if batch_size <= 0:
raise ValueError("The batch size need to be positive.")
# Get the var_name list of first block and calculate
total_memory = 0.0
for var in program.global_block().vars.itervalues():
data_count = 1
for x in var.shape:
if x == -1:
data_count *= batch_size
else:
data_count *= x
var_memory = data_count * dtype_to_size[var.dtype]
if DEBUG:
print "%s memory usage: %d" % (var.name, var_memory)
total_memory += var_memory
if DEBUG:
print "total memory usage: %.2f" % (total_memory)
# Convert appropriate unit
unit_str = "B"
if total_memory > 1024:
total_memory /= 1024
unit_str = "KB"
if total_memory > 1024:
total_memory /= 1024
unit_str = "MB"
# Append extra memory consumption (5% - 10%)
min_total_memory = total_memory * 1.05
max_total_memory = total_memory * 1.1
return min_total_memory, max_total_memory, unit_str
...@@ -313,9 +313,9 @@ class TestAbs(OpTest): ...@@ -313,9 +313,9 @@ class TestAbs(OpTest):
self.init_dtype() self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype) x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
# Because we set delta = 0.005 in caculating numeric gradient, # Because we set delta = 0.005 in calculating numeric gradient,
# if x is too small, such as 0.002, x_neg will be -0.003 # if x is too small, such as 0.002, x_neg will be -0.003
# x_pos will be 0.007, so the numeric gradient is unaccurate. # x_pos will be 0.007, so the numeric gradient is inaccurate.
# we should avoid this # we should avoid this
x[np.abs(x) < 0.005] = 0.02 x[np.abs(x) < 0.005] = 0.02
out = np.abs(x) out = np.abs(x)
......
...@@ -51,17 +51,17 @@ class TranspilerTest(unittest.TestCase): ...@@ -51,17 +51,17 @@ class TranspilerTest(unittest.TestCase):
self.origin_prog = main.clone() self.origin_prog = main.clone()
return main return main
def get_trainer(self, config=None): def get_trainer(self, config=None, sync_mode=True):
t = self._transpiler_instance(config) t = self._transpiler_instance(config, sync_mode)
return t.get_trainer_program() return t.get_trainer_program()
def get_pserver(self, ep, config=None): def get_pserver(self, ep, config=None, sync_mode=True):
t = self._transpiler_instance(config) t = self._transpiler_instance(config, sync_mode)
pserver = t.get_pserver_program(ep) pserver = t.get_pserver_program(ep)
startup = t.get_startup_program(ep, pserver) startup = t.get_startup_program(ep, pserver)
return pserver, startup return pserver, startup
def _transpiler_instance(self, config=None): def _transpiler_instance(self, config=None, sync_mode=True):
if not self.transpiler: if not self.transpiler:
main = self.get_main_program() main = self.get_main_program()
self.transpiler = fluid.DistributeTranspiler(config=config) self.transpiler = fluid.DistributeTranspiler(config=config)
...@@ -69,7 +69,8 @@ class TranspilerTest(unittest.TestCase): ...@@ -69,7 +69,8 @@ class TranspilerTest(unittest.TestCase):
self.trainer_id, self.trainer_id,
program=main, program=main,
pservers=self.pserver_eps, pservers=self.pserver_eps,
trainers=self.trainers) trainers=self.trainers,
sync_mode=sync_mode)
return self.transpiler return self.transpiler
...@@ -464,5 +465,76 @@ class TestDistLookupTable(TestDistLookupTableBase): ...@@ -464,5 +465,76 @@ class TestDistLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestAsyncLocalLookupTable(TestDistLookupTableBase):
def net_conf(self):
self.network_with_table(is_sparse=True, is_distributed=False)
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
self.assertEqual(len(pserver1.blocks), 3)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["adam", "scale", "scale"])
# 2 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["adam", "scale", "scale"])
trainer = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
'fill_constant', 'mean_grad', 'cross_entropy_grad',
'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv',
'recv', 'recv', 'concat'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
class TestAsyncDistLookupTable(TestDistLookupTableBase):
def net_conf(self):
self.network_with_table(is_sparse=True, is_distributed=True)
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
self.assertEqual(len(pserver1.blocks), 6)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["adam", "scale", "scale"])
# 2 optimize for table sgd
self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"])
# 3 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["lookup_sparse_table"])
# 4 prefetch -> lookup_sparse_table for data1
self.assertEqual([op.type for op in pserver1.blocks[4].ops],
["lookup_sparse_table"])
# 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1)
ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_ids', 'send', 'recv', 'recv'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import contextlib
import unittest
def train_simulator(test_batch_size=10):
if test_batch_size <= 0:
raise ValueError("batch_size should be a positive integeral value, "
"but got batch_size={}".format(test_batch_size))
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
# Calculate memory usage in current network config
lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
fluid.default_main_program(), batch_size=test_batch_size)
print("memory usage is about %.3f - %.3f %s" %
(lower_usage, upper_usage, unit))
class TestMemoryUsage(unittest.TestCase):
def test_with_unit_B(self):
with self.program_scope_guard():
train_simulator()
def test_with_unit_KB(self):
with self.program_scope_guard():
train_simulator(test_batch_size=1000)
def test_with_unit_MB(self):
with self.program_scope_guard():
train_simulator(test_batch_size=100000)
@contextlib.contextmanager
def program_scope_guard(self):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
yield
if __name__ == '__main__':
unittest.main()
...@@ -293,14 +293,15 @@ class DistributeTranspiler(object): ...@@ -293,14 +293,15 @@ class DistributeTranspiler(object):
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
}) })
program.global_block().append_op( if self.sync_mode:
type="fetch_barrier", program.global_block().append_op(
inputs={}, type="fetch_barrier",
outputs={}, inputs={},
attrs={ outputs={},
"endpoints": pserver_endpoints, attrs={
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE "endpoints": pserver_endpoints,
}) RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
for varname, splited_var in self.param_var_mapping.iteritems(): for varname, splited_var in self.param_var_mapping.iteritems():
if len(splited_var) <= 1: if len(splited_var) <= 1:
......
...@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert'] ...@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc' DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
UNK_IDX = 0 UNK_IDX = 0
......
...@@ -41,7 +41,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' ...@@ -41,7 +41,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
'wmt_shrinked_data/wmt14.tgz') 'wmt_shrinked_data/wmt14.tgz')
MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
# BLEU of this trained model is 26.92 # BLEU of this trained model is 26.92
URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
START = "<s>" START = "<s>"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册