提交 ea8a375f 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update_py_reader

#!/bin/bash
set -e
function train() {
......
#!/bin/bash
set -e
function clock_to_seconds() {
......
#!/bin/bash
set -e
function train() {
......
#!/bin/bash
set -e
function clock_to_seconds() {
......
#!/bin/bash
set -e
function train() {
......
#!/bin/bash
set -e
function test() {
......
#!/bin/bash
set -e
function test() {
......
#!/bin/bash
set -e
function test() {
......
#!/bin/bash
set -e
function test() {
......
......@@ -22,8 +22,6 @@
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
"Enable subgraph to TensorRT engine for acceleration");
......@@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
DEFINE_string(inference_analysis_graphviz_log_root, "./",
"Graphviz debuger for data flow graphs.");
namespace inference {
namespace analysis {
class DfgPassManagerImpl final : public DfgPassManager {
public:
DfgPassManagerImpl() {
......
......@@ -45,14 +45,15 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/pass_manager.h"
namespace paddle {
namespace inference {
namespace analysis {
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
DECLARE_string(inference_analysis_graphviz_log_root);
namespace inference {
namespace analysis {
class Analyzer : public OrderedRegistry<PassManager> {
public:
// Register all the pass-managers.
......
......@@ -13,13 +13,21 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST_F(DFG_Tester, main) {
TEST_F(DFG_Tester, analysis_without_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
Analyzer analyser;
analyser.Run(&argument);
}
TEST_F(DFG_Tester, analysis_with_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
Analyzer analyser;
analyser.Run(&argument);
}
......
......@@ -222,10 +222,19 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
return stack_.top();
}
inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
return node.inlinks.size() == n;
}
GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
// CHECK all the inputs' in-degree is 0
for (auto *node : source) {
PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
......@@ -233,6 +242,11 @@ GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
if (p->deleted()) {
visited.insert(p);
to_visit.erase(p);
continue;
}
inlink_visited.clear();
std::copy_if(p->inlinks.begin(), p->inlinks.end(),
......@@ -292,6 +306,37 @@ Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
return sorted_[cursor_];
}
std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs;
// Input a Value, check whether its inlink is in the subgraph.
auto inlink_in_subgraph = [&](Node *n) {
for (auto *in : n->inlinks) {
if (nodes.count(in)) return true;
}
return false;
};
for (auto &node : graph) {
for (auto *in : node->inlinks) {
// The Value that is written by nodes inside a sub-graph shouldn't be the
// input of the sub-graph.
if (!nodes.count(in) && in->type() == Node::Type::kValue &&
!inlink_in_subgraph(in)) {
inputs.insert(in);
}
}
for (auto *out : node->outlinks) {
if (!nodes.count(out) && out->type() == Node::Type::kValue) {
outputs.insert(out);
}
}
}
return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
std::vector<Node *>(outputs.begin(), outputs.end()));
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -133,7 +133,7 @@ struct GraphTraits<DataFlowGraph> {
private:
std::vector<Node *> sorted_;
int cursor_{0};
size_t cursor_{0};
};
explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
......@@ -173,36 +173,8 @@ struct GraphTraits<DataFlowGraph> {
// Extract the inputs and outputs of a graph. The inputs and outputs of a
// sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph.
static std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs;
// Input a Value, check whether its inlink is in the subgraph.
auto inlink_in_subgraph = [&](Node *n) {
for (auto *in : n->inlinks) {
if (nodes.count(in)) return true;
}
return false;
};
for (auto &node : graph) {
for (auto *in : node->inlinks) {
// The Value that is written by nodes inside a sub-graph shouldn't be the
// input of the sub-graph.
if (!nodes.count(in) && in->type() == Node::Type::kValue &&
!inlink_in_subgraph(in)) {
inputs.insert(in);
}
}
for (auto *out : node->outlinks) {
if (!nodes.count(out) && out->type() == Node::Type::kValue) {
outputs.insert(out);
}
}
}
return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
std::vector<Node *>(outputs.begin(), outputs.end()));
}
std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
} // namespace analysis
} // namespace inference
......
......@@ -22,14 +22,18 @@
namespace paddle {
namespace inference {
DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
namespace analysis {
using framework::proto::ProgramDesc;
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes);
const std::vector<std::unique_ptr<Node>> &nodes);
bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
PADDLE_ENFORCE(!argument->transformed_program_desc);
......@@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
bool DataFlowGraphToFluidPass::Finalize() { return true; }
void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
auto traits = GraphTraits<DataFlowGraph>(graph);
for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
if (it->deleted()) continue;
void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
LOG(INFO) << "graph.inputs " << graph->inputs.size();
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.deleted()) continue;
switch (it->type()) {
switch (node.type()) {
case Node::Type::kFunction: {
LOG(INFO) << "add function " << it->repr();
AddFluidOp(&(*it));
LOG(INFO) << "add function " << node.repr();
AddFluidOp(&node);
} break;
case Node::Type::kFunctionBlock: {
LOG(INFO) << "add engine op " << it->repr() << " , "
<< static_cast<FunctionBlock*>(&(*it))->subgraph.size();
AddEngineOp(&(*it));
LOG(INFO) << "add engine op " << node.repr() << " , "
<< static_cast<FunctionBlock *>(&node)->subgraph.size();
AddEngineOp(&node);
} break;
default:
continue;
}
}
PADDLE_ENFORCE(argument_->transformed_program_desc.get());
}
void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
// currently only the main block is analyzed.
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops();
*op = *ori_op; // copy the attributes, by default, these will not be changed
// by analysis phrase.
// by analysis phrase.
// The inputs and outputs of the existing ops are not changed by tensorrt
// subgraph pass.
// NOTE It might be changed by other passes in the long run.
}
void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
const framework::proto::BlockDesc& block) {
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
const framework::proto::BlockDesc &block) {
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
auto* func = static_cast<FunctionBlock*>(node);
auto *func = static_cast<FunctionBlock *>(node);
// collect inputs
std::vector<std::string> io;
for (auto* x : func->inlinks) {
for (auto *x : func->inlinks) {
io.push_back(x->name());
}
desc.SetInput("Xs", io);
// collect outputs
io.clear();
for (auto* x : func->outlinks) {
for (auto *x : func->outlinks) {
io.push_back(x->name());
}
desc.SetOutput("Ys", io);
desc.SetType("tensorrt_engine");
PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
// Set attrs
SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
SetAttr(desc.Proto(), "engine_unique_key",
"trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "max_workspace",
1024); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
node->SetPbMsg(desc.Proto()->SerializeAsString());
}
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes) {
const std::vector<std::unique_ptr<Node>> &nodes) {
std::vector<std::string> parameters;
for (const auto& node : nodes) {
for (const auto &node : nodes) {
if (!node->IsValue()) continue;
PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
framework::proto::VarDesc var;
......@@ -128,21 +133,30 @@ std::vector<std::string> ExtractParameters(
return parameters;
}
void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
// TODO(Superjomn) Here need to expose some arguments for default setting.
PADDLE_ENFORCE(node->IsFunctionBlock());
auto* block_node = static_cast<FunctionBlock*>(node);
auto *block_node = static_cast<FunctionBlock *>(node);
framework::proto::BlockDesc proto;
framework::BlockDesc block_desc(nullptr, &proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
LOG(INFO) << "origin variable size: "
<< argument_->origin_program_desc->blocks(0).vars().size();
LOG(INFO) << "transformed variable size: "
<< block_desc.Proto()->vars().size();
// copy ops.
for (auto* node : block_node->subgraph) {
auto* op = block_desc.AppendOp();
for (auto *node : block_node->subgraph) {
auto *op = block_desc.AppendOp();
PADDLE_ENFORCE(!node->pb_msg().empty());
op->Proto()->ParseFromString(node->pb_msg());
}
*block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
op->ParseFromString(node->pb_msg());
}
......@@ -151,7 +165,7 @@ namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
explicit DFG_DebuggerPass(const Config& config)
explicit DFG_DebuggerPass(const Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
......@@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
};
} // namespace
Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger"));
......
......@@ -26,6 +26,10 @@
namespace paddle {
namespace inference {
DECLARE_int32(tensorrt_max_batchsize);
DECLARE_int32(tensorrt_workspace_size);
namespace analysis {
class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
public:
......
......@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
no++;
}
// DFG is sensitive to ProgramDesc, be careful to change the existing models.
ASSERT_EQ(no, 82);
ASSERT_EQ(no, 83);
}
} // namespace analysis
......
......@@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
PADDLE_ENFORCE(argument);
if (!argument->main_dfg) {
LOG(INFO) << "Init DFG";
argument->main_dfg.reset(new DataFlowGraph);
}
desc_ = argument->origin_program_desc.get();
......@@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
v->SetPbMsg(var.SerializeAsString());
var2id[var.name()] = v->id();
}
for (int i = 0; i < main_block.ops_size(); i++) {
const auto &op = main_block.ops(i);
auto *o = graph->nodes.Create(Node::Type::kFunction);
......@@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
o->SetPbMsg(op.SerializeAsString());
// set inputs and outputs
// TODO(Superjomn) make sure the InputNames is the real variable name.
std::unordered_set<Node *> inlinks;
for (int j = 0; j < op.inputs_size(); j++) {
auto &in_var = op.inputs(j);
for (int k = 0; k < in_var.arguments_size(); k++) {
auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
in->outlinks.push_back(o);
o->inlinks.push_back(in);
inlinks.insert(in);
}
}
for (int j = 0; j < op.outputs_size(); j++) {
auto &out_var = op.outputs(j);
for (int k = 0; k < out_var.arguments_size(); k++) {
auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
if (inlinks.count(out)) {
// Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
auto *out_alias = graph->nodes.Create(Node::Type::kValue);
out_alias->SetName(out->name());
out_alias->SetPbDesc(out->pb_desc());
out_alias->SetPbMsg(out->pb_msg());
var2id[out_alias->name()] = out_alias->id(); // update a -> a0
LOG(INFO) << "loop found in graph, create SSA alias node ["
<< out_alias->repr() << "] for [" << out->repr() << "]";
out = out_alias;
}
out->inlinks.push_back(o);
o->outlinks.push_back(out);
}
......
......@@ -24,12 +24,12 @@ namespace analysis {
TEST_F(DFG_Tester, Init) {
FluidToDataFlowGraphPass pass;
pass.Initialize(&argument);
DataFlowGraph graph;
pass.Run(&graph);
pass.Run(argument.main_dfg.get());
// Analysis is sensitive to ProgramDesc, careful to change the original model.
ASSERT_EQ(graph.nodes.size(), 37UL);
ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
pass.Finalize();
LOG(INFO) << '\n' << graph.DotString();
ASSERT_FALSE(argument.main_dfg->DotString().empty());
EXPECT_FALSE(argument.main_dfg->inputs.empty());
}
} // namespace analysis
......
......@@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_)();
VLOG(4) << "debug info "
<< graph->HumanReadableInfo(false /*show_values*/,
true /*show_functions*/);
}
} // namespace analysis
......
......@@ -82,7 +82,7 @@ inference_api_test(test_api_impl
if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
endif()
......
......@@ -39,7 +39,7 @@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
bool PaddleInferenceAnakinPredictor::Run(
const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
std::vector<PaddleTensor> *output_data, int batch_size) {
for (const auto &input : inputs) {
if (input.dtype != PaddleDType::FLOAT32) {
LOG(ERROR) << "Only support float type inputs. " << input.name
......
......@@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
// NOTE Unlike the native engine, the buffers of anakin engine's output_data
// should be allocated first.
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) override;
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override;
std::unique_ptr<PaddlePredictor> Clone() override;
......
......@@ -108,7 +108,8 @@ NativePaddlePredictor::~NativePaddlePredictor() {
}
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
std::vector<PaddleTensor> *output_data,
int batch_size) {
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
......
......@@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor {
bool Init(std::shared_ptr<framework::Scope> parent_scope);
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override;
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
std::unique_ptr<PaddlePredictor> Clone() override;
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace paddle {
......@@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
return false;
}
// Analyze inference_program
Argument argument;
argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
CHECK(argument.transformed_program_desc);
VLOG(5) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(5) << "to prepare executor";
*inference_program_->Proto() = *argument.transformed_program_desc;
OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
......@@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
return true;
}
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override {
PADDLE_ENFORCE_GT(batch_size, 0,
"TensorRT engine needs the argument batch_size set");
FLAGS_tensorrt_engine_batch_size = batch_size;
return NativePaddlePredictor::Run(inputs, output_data, batch_size);
}
void OptimizeInferenceProgram() {
// Analyze inference_program
Argument argument;
argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
CHECK(argument.transformed_program_desc);
VLOG(5) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(5) << "to prepare executor";
inference_program_.reset(
new framework::ProgramDesc(*argument.transformed_program_desc));
}
private:
TensorRTConfig config_;
};
......
......@@ -98,7 +98,8 @@ class PaddlePredictor {
// responsible for the output tensor's buffer, either allocated or passed from
// outside.
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0;
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
// Clone a predictor that share the model weights, the Cloned predictor should
// be thread-safe.
......
......@@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor {
LOG(INFO) << "I get other_config " << config.other_config;
}
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override {
std::vector<PaddleTensor> *output_data,
int batch_size = 0) override {
LOG(INFO) << "Run";
return false;
}
......
......@@ -15,50 +15,79 @@
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) {
void CompareTensorRTWithFluid(bool enable_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
//# 1. Create PaddlePredictor with a config.
TensorRTConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
auto predictor =
NativeConfig config0;
config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
config0.use_gpu = true;
config0.fraction_of_gpu_memory = 0.3;
config0.device = 0;
TensorRTConfig config1;
config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3;
config1.device = 0;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<TensorRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config);
PaddleEngineKind::kAutoMixedTensorRT>(config1);
for (int batch_id = 0; batch_id < 3; batch_id++) {
for (int batch_id = 0; batch_id < 1; batch_id++) {
//# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4};
std::vector<int64_t> data(20);
for (int i = 0; i < 20; i++) data[i] = i;
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
PaddleTensor tensor{
.name = "",
.shape = std::vector<int>({10, 1}),
.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
.dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor);
//# 3. Run
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
std::vector<PaddleTensor> outputs0;
std::vector<PaddleTensor> outputs1;
CHECK(predictor0->Run(slots, &outputs0));
CHECK(predictor1->Run(slots, &outputs1, 10));
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
ASSERT_EQ(outputs0.size(), 1UL);
ASSERT_EQ(outputs1.size(), 1UL);
const size_t num_elements = outputs0.front().data.length() / sizeof(float);
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
EXPECT_EQ(num_elements, num_elements1);
auto *data0 = static_cast<float *>(outputs0.front().data.data());
auto *data1 = static_cast<float *>(outputs1.front().data.data());
ASSERT_GT(num_elements, 0UL);
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
EXPECT_NEAR(data0[i], data1[i], 1e-3);
}
}
}
TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
CompareTensorRTWithFluid(false);
}
TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
CompareTensorRTWithFluid(true);
}
} // namespace paddle
......@@ -93,6 +93,10 @@ class OpConverter {
framework::Scope* scope_{nullptr};
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \
struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
trt_##op_type__##_converter() { \
......@@ -111,7 +115,3 @@ class OpConverter {
extern int TouchConverterRegister_##op_type__(); \
static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
TouchConverterRegister_##op_type__();
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -26,18 +26,20 @@ namespace paddle {
namespace inference {
namespace tensorrt {
void TensorRTEngine::Build(const DescType& paddle_model) {
void TensorRTEngine::Build(const DescType &paddle_model) {
PADDLE_ENFORCE(false, "not implemented");
}
void TensorRTEngine::Execute(int batch_size) {
std::vector<void*> buffers;
for (auto& buf : buffers_) {
batch_size_ = batch_size;
std::vector<void *> buffers;
for (auto &buf : buffers_) {
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
PADDLE_ENFORCE_GT(buf.max_size, 0);
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buffers.push_back(buf.buffer);
}
PADDLE_ENFORCE_NOT_NULL(stream_);
infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
cudaStreamSynchronize(*stream_);
}
......@@ -45,7 +47,7 @@ void TensorRTEngine::Execute(int batch_size) {
TensorRTEngine::~TensorRTEngine() {
cudaStreamSynchronize(*stream_);
// clean buffer
for (auto& buf : buffers_) {
for (auto &buf : buffers_) {
if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
buf.buffer = nullptr;
......@@ -70,32 +72,37 @@ void TensorRTEngine::FreezeNetwork() {
// allocate GPU buffers.
buffers_.resize(buffer_sizes_.size());
for (auto& item : buffer_sizes_) {
for (auto &item : buffer_sizes_) {
// The output buffers are not set in the network building phrase, need to
// infer from the TesorRT network.
if (item.second == 0) {
auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
auto dims = infer_engine_->getBindingDimensions(slot_offset);
item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims);
PADDLE_ENFORCE_GT(item.second, 0);
}
auto& buf = buffer(item.first);
auto &buf = buffer(item.first);
buf.max_size = item.second * max_batch_;
CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
<< buf.buffer;
buf.size = buf.max_size = item.second;
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G
// buf.size will changed in the runtime.
buf.size = 0;
buf.device = DeviceType::GPU;
}
}
nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
nvinfer1::DataType dtype,
const nvinfer1::Dims& dims) {
const nvinfer1::Dims &dims) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
name);
PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name);
buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
analysis::AccuDims(dims.d, dims.nbDims);
......@@ -104,12 +111,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
return input;
}
void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
const std::string& name) {
void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
const std::string &name) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
name);
auto* output = layer->getOutput(offset);
auto *output = layer->getOutput(offset);
SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr);
output->setName(name.c_str());
......@@ -121,11 +128,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
buffer_sizes_[name] = 0;
}
void TensorRTEngine::DeclareOutput(const std::string& name) {
void TensorRTEngine::DeclareOutput(const std::string &name) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
name);
auto* output = TensorRTEngine::GetITensor(name);
auto *output = TensorRTEngine::GetITensor(name);
PADDLE_ENFORCE(output != nullptr);
output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput());
......@@ -135,38 +142,45 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
buffer_sizes_[name] = 0;
}
void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
return buffer(name).buffer;
}
void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
size_t max_size) {
// determine data size
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second);
auto& buf = buffer(name);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
cudaMemcpyDeviceToDevice, *stream_),
0);
}
void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
size_t max_size) {
VLOG(4) << "get output in cpu";
auto &buf = buffer(name);
// Update needed buffer size.
auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
auto dims = infer_engine_->getBindingDimensions(slot_offset);
buf.size = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims);
PADDLE_ENFORCE_LE(buf.size, buf.max_size);
// determine data size
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second);
auto& buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
cudaMemcpyDeviceToHost, *stream_));
// DEBUG
memset(dst, 0, buf.size);
PADDLE_ENFORCE_EQ(
0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
}
Buffer& TensorRTEngine::buffer(const std::string& name) {
Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
......@@ -174,19 +188,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
return buffers_[slot_offset];
}
void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
size_t size) {
auto& buf = buffer(name);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_NOT_NULL(data);
PADDLE_ENFORCE_NOT_NULL(stream_);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buf.size = size;
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyHostToDevice, *stream_));
}
void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
size_t size) {
auto& buf = buffer(name);
auto &buf = buffer(name);
buf.size = size;
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
......@@ -194,15 +212,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
cudaMemcpyDeviceToDevice, *stream_));
}
void TensorRTEngine::SetITensor(const std::string& name,
nvinfer1::ITensor* tensor) {
void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr);
PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
name);
itensor_map_[name] = tensor;
}
nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
return itensor_map_[name];
}
......
......@@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase {
: max_batch_(max_batch),
max_workspace_(max_workspace),
stream_(stream ? stream : &default_stream_),
logger_(logger) {}
logger_(logger) {
cudaStreamCreate(&default_stream_);
}
virtual ~TensorRTEngine();
......@@ -121,6 +123,9 @@ class TensorRTEngine : public EngineBase {
int max_batch_;
// the max memory size the engine uses
int max_workspace_;
// batch size of the current data, will be updated each Executation.
int batch_size_{-1};
cudaStream_t* stream_;
// If stream_ is not set from outside, hold its own stream.
cudaStream_t default_stream_;
......
......@@ -103,6 +103,10 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
LOG(INFO) << "to get output";
float y_cpu[2] = {-1., -1.};
auto dims = engine_->GetITensor("y")->getDimensions();
ASSERT_EQ(dims.nbDims, 3);
ASSERT_EQ(dims.d[0], 2);
ASSERT_EQ(dims.d[1], 1);
engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
ASSERT_EQ(y_cpu[0], 4.5);
ASSERT_EQ(y_cpu[1], 14.5);
......
......@@ -168,6 +168,8 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
......@@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND)
op_library(tensorrt_engine_op DEPS tensorrt_engine)
op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
DEPS tensorrt_engine_op
analysis)
else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
......
......@@ -98,7 +98,7 @@ The update equations are as follows:
$$
velocity = mu * velocity + gradient \\
if (use\_nesterov): \\
param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
param = param - (gradient + mu * velocity) * learning\_rate \\
else: \\
param = param - learning\_rate * velocity. \\
$$
......
......@@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v,
T g_val = g[i];
T v_new = v[i] * mu + g_val;
v_out[i] = v_new;
p_out[i] = p[i] - (g_val - v_new * mu) * lr;
p_out[i] = p[i] - (g_val + v_new * mu) * lr;
}
} else {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
......
......@@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
v_out = v * mu + g;
if (use_nesterov) {
p_out = p - (g - v_out * mu) * lr[0];
p_out = p - (g + v_out * mu) * lr[0];
} else {
p_out = p - lr[0] * v_out;
}
......
......@@ -24,6 +24,9 @@
#include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace paddle {
DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
namespace operators {
using inference::Singleton;
......@@ -52,7 +55,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
"TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions");
switch (shape.size()) {
case 2:
return nvinfer1::Dims2(shape[0], shape[1]);
......@@ -90,27 +92,36 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
engine->InitNetwork();
framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
VLOG(4) << "parsed var size " << block.AllVars().size();
// Add inputs
VLOG(4) << "declare inputs";
for (auto &input : context.Inputs("Xs")) {
VLOG(4) << "declare input " << input;
auto *var = block.FindVar(input);
// TensorRT engine need to create parameters. The parameter's description
// should be set in
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
auto shape = var->GetShape();
// For the special batch_size placeholder -1, drop it and pass the real
// shape of data.
// TODO(Superjomn) fix this with batch broadcast, or it can't handle
// variational batch size.
if (shape[0] == -1) {
shape[0] = FLAGS_tensorrt_engine_batch_size;
}
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(var->GetShape()));
Vec2TRT_Dims(shape));
}
inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
block_desc, parameters, context.scope(), engine);
// Add outputs
VLOG(4) << "declare outputs";
for (auto &output : context.Outputs("Ys")) {
VLOG(4) << "declare output " << output;
engine->DeclareOutput(output);
}
......@@ -151,4 +162,7 @@ REGISTER_OP_CPU_KERNEL(
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
// A trick to compile with the needed TensorRT op converter.
USE_TRT_CONVERTER(mul)
#endif // PADDLE_WITH_CUDA
......@@ -24,6 +24,9 @@
#include "paddle/fluid/inference/tensorrt/engine.h"
namespace paddle {
DECLARE_int32(tensorrt_engine_batch_size);
namespace operators {
using inference::Singleton;
......@@ -53,7 +56,6 @@ template <typename DeviceContext, typename T>
class TensorRTEngineKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
VLOG(4) << "TensorRTEngineKernel executing";
auto engine_name = context.Attr<std::string>("engine_uniq_key");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context);
......@@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
auto input_names = context.op().Inputs("Xs");
PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
// Try to determine a batch_size
auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
context.scope(), input_names.front());
int batch_size = tensor0.dims()[0];
PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
context.Attr<int>("max_batch"));
// Convert input tensor from fluid to engine.
for (const auto& x : context.Inputs("Xs")) {
......@@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
}
}
// Execute the engine.
PADDLE_ENFORCE_GT(batch_size, 0);
engine->Execute(batch_size);
PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
engine->Execute(FLAGS_tensorrt_engine_batch_size);
// Convert output tensor from engine to fluid
for (const auto& y : context.Outputs("Ys")) {
// convert output and copy to fluid.
......@@ -94,18 +93,21 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto* fluid_v = context.scope().FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
fluid_t->Resize(framework::make_ddim(ddim));
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size.
engine->GetOutputInCPU(
y, fluid_t->mutable_data<float>(platform::CPUPlace()),
size * sizeof(float));
} else {
engine->GetOutputInGPU(
y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
size * sizeof(float));
}
fluid_t->Resize(framework::make_ddim(ddim));
// TODO(Superjomn) find some way to determine which device to output the
// tensor.
// if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size.
engine->GetOutputInCPU(y,
fluid_t->mutable_data<float>(platform::CPUPlace()),
size * sizeof(float));
//} else {
// engine->GetOutputInGPU(
// y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
// size * sizeof(float));
//}
}
cudaStreamSynchronize(*engine->stream());
......
......@@ -333,8 +333,7 @@ function assert_api_not_changed() {
python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
deactivate
# Use git diff --name-only HEAD^ may not get file changes for update commits in one PR
API_CHANGE=`echo $CHANGED_FILES | grep "paddle/fluid/API.spec" || true`
API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true`
echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# TODO: curl -H 'Authorization: token ${TOKEN}'
......
......@@ -166,7 +166,8 @@ def fc(input,
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to None, no bias will be added to the output units.
of this layer. If it is set to False, no bias will be added to the output units.
If it is set to None, the bias is initialized zero. Default: None.
act (str, default None): Activation to be applied to the output of this layer.
is_test(bool): A flag indicating whether execution is in test phase.
use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
......
......@@ -324,7 +324,7 @@ class MomentumOptimizer(Optimizer):
& if (use\_nesterov):
&\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate
&\quad param = param - (gradient + mu * velocity) * learning\_rate
& else:
......
......@@ -48,6 +48,7 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dist_train)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
......@@ -60,3 +61,4 @@ if(WITH_DISTRIBUTE)
endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import sys
import signal
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers=50):
self.params = train_parameters
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
return out
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality,
reduction_ratio):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = self.shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act)
def squeeze_excitation(self, input, num_channels, reduction_ratio):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool,
size=num_channels / reduction_ratio,
act='relu')
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid')
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def get_model(batch_size):
# Input data
image = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant(
shape=[batch_size, 1], dtype='int64', value=0.0)
# Train program
model = SE_ResNeXt(layers=50)
out = model.net(input=image, class_dim=102)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
# Evaluator
test_program = fluid.default_main_program().clone(for_test=True)
# Optimization
total_images = 6149 # flowers
epochs = [30, 60, 90]
step = int(total_images / batch_size + 1)
bd = [step * e for e in epochs]
base_lr = 0.1
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
# Reader
train_reader = paddle.batch(
paddle.dataset.flowers.train(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.flowers.test(), batch_size=batch_size)
return test_program, avg_cost, train_reader, test_reader, acc_top1, out
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=trainer_id,
program=main_program,
pservers=pserver_endpoints,
trainers=trainers)
return t
class DistSeResneXt2x2:
def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
trainer_id):
get_model(batch_size=2)
t = get_transpiler(trainer_id,
fluid.default_main_program(), pserver_endpoints,
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
exe.run(pserver_prog)
def _wait_ps_ready(self, pid):
retry_times = 20
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(3)
print("waiting ps ready: ", pid)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
batch_size=20)
if is_dist:
t = get_transpiler(trainer_id,
fluid.default_main_program(), endpoints,
trainers)
trainer_prog = t.get_trainer_program()
else:
trainer_prog = fluid.default_main_program()
startup_exe = fluid.Executor(place)
startup_exe.run(fluid.default_startup_program())
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1
strategy.allow_op_delay = False
exe = fluid.ParallelExecutor(
True,
loss_name=avg_cost.name,
exec_strategy=strategy,
num_trainers=trainers,
trainer_id=trainer_id)
feed_var_list = [
var for var in trainer_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
reader_generator = train_reader()
first_loss, = exe.run(fetch_list=[avg_cost.name])
print(first_loss)
for i in xrange(5):
loss, = exe.run(fetch_list=[avg_cost.name])
last_loss, = exe.run(fetch_list=[avg_cost.name])
print(last_loss)
def main(role="pserver",
endpoints="127.0.0.1:9123",
trainer_id=0,
current_endpoint="127.0.0.1:9123",
trainers=1,
is_dist=True):
model = DistSeResneXt2x2()
if role == "pserver":
model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
else:
p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
if __name__ == "__main__":
if len(sys.argv) != 7:
print(
"Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
)
role = sys.argv[1]
endpoints = sys.argv[2]
trainer_id = int(sys.argv[3])
current_endpoint = sys.argv[4]
trainers = int(sys.argv[5])
is_dist = True if sys.argv[6] == "TRUE" else False
main(
role=role,
endpoints=endpoints,
trainer_id=trainer_id,
current_endpoint=current_endpoint,
trainers=trainers,
is_dist=is_dist)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import unittest
import os
import signal
import subprocess
class TestDistSeResneXt2x2(unittest.TestCase):
def setUp(self):
self._trainers = 2
self._pservers = 2
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
self._python_interp = "python"
def start_pserver(self):
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
(self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
(self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
ps0_proc = subprocess.Popen(
ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
ps1_proc = subprocess.Popen(
ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return ps0_proc, ps1_proc
def _wait_ps_ready(self, pid):
retry_times = 20
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(3)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
def non_test_with_place(self):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs = {
"PATH": os.getenv("PATH"),
"PYTHONPATH": os.getenv("PYTHONPATH"),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15"
}
# Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs)
local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
(self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
local_proc = subprocess.Popen(
local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local)
local_proc.wait()
local_ret = local_proc.stdout.read()
# Run dist train to compare with local results
ps0, ps1 = self.start_pserver()
self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
(self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
(self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
env0.update(required_envs)
env1.update(required_envs)
FNULL = open(os.devnull, 'w')
tr0_proc = subprocess.Popen(
tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0)
tr1_proc = subprocess.Popen(
tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1)
tr0_proc.wait()
tr1_proc.wait()
loss_data0 = tr0_proc.stdout.read()
lines = loss_data0.split("\n")
dist_first_loss = eval(lines[0].replace(" ", ","))[0]
dist_last_loss = eval(lines[1].replace(" ", ","))[0]
local_lines = local_ret.split("\n")
local_first_loss = eval(local_lines[0])[0]
local_last_loss = eval(local_lines[1])[0]
self.assertAlmostEqual(local_first_loss, dist_first_loss)
self.assertAlmostEqual(local_last_loss, dist_last_loss)
# check tr0_out
# FIXME: ensure the server process is killed
# replace with ps0.terminate()
os.kill(ps0.pid, signal.SIGKILL)
os.kill(ps1.pid, signal.SIGKILL)
FNULL.close()
if __name__ == "__main__":
unittest.main()
......@@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest):
velocity_out = mu * velocity + grad
if use_nesterov:
param_out = param - grad * learning_rate + \
param_out = param - grad * learning_rate - \
velocity_out * mu * learning_rate
else:
param_out = param - learning_rate * velocity_out
......@@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest):
velocity_out = mu * velocity + grad
if use_nesterov:
param_out = param - grad * learning_rate + \
param_out = param - grad * learning_rate - \
velocity_out * mu * learning_rate
else:
param_out = param - learning_rate * velocity_out
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册