提交 6d01f10d 编写于 作者: Y Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into dist_test_word2vec

......@@ -122,5 +122,9 @@ def parse_args():
type=str,
default="",
help='Directory that contains all the training recordio files.')
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
help='If set, uses inference transpiler to optimize the program.')
args = parser.parse_args()
return args
......@@ -131,6 +131,11 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe = fluid.Executor(place)
exe.run(startup_prog)
# Use inference_transpiler to speedup
if args.use_inference_transpiler:
t = fluid.InferenceTranspiler()
t.transpile(infer_prog, place)
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
......
......@@ -26,13 +26,15 @@ function(fetch_include_recursively root_dir)
endforeach()
endfunction()
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
endif()
if (WITH_ANAKIN)
message(STATUS "Anakin for inference is enabled")
......
......@@ -149,21 +149,33 @@ copy(memory_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
)
set(module "inference")
copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
set(inference_deps paddle_fluid_shared paddle_fluid)
if(WITH_CONTRIB)
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
copy(contrib_inference_lib DEPS paddle_inference_api
message(STATUS "installing contrib")
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
if (WITH_ANAKIN)
copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
list(APPEND inference_deps contrib_anakin_inference_lib)
endif()
copy(contrib_inference_lib DEPS paddle_inference_api
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
DSTS ${contrib_dst_dir} ${contrib_dst_dir}
)
DSTS ${contrib_dst_dir} ${contrib_dst_dir})
list(APPEND inference_deps contrib_inference_lib)
endif()
set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
......
......@@ -18,7 +18,7 @@ if(APPLE)
endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api)
set(inference_deps paddle_inference_api paddle_fluid_api paddle_inference_tensorrt_subgraph_engine)
function(inference_api_test TARGET_NAME)
if (WITH_TESTING)
......@@ -50,13 +50,24 @@ cc_test(test_paddle_inference_api
inference_api_test(test_paddle_inference_api_impl
ARGS test_word2vec test_image_classification)
if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS paddle_inference_api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api)
inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec)
endif()
if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api)
......
......@@ -15,6 +15,11 @@
inference_api_test(simple_on_word2vec ARGS test_word2vec)
option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
if(NOT WITH_INFERENCE_DEMO)
return()
endif()
set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
......
# Paddle 预测 API
为了更简单方便的预测部署,Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。
预测库包含:
- 头文件 `paddle_inference_api.h` 定义了所有的接口
- 库文件`libpaddle_fluid.so``libpaddle_fluid.a`
- 库文件 `libpaddle_inference_api.so``libpaddle_inference_api.a`
下面是详细的一些 API 概念介绍
## PaddleTensor
PaddleTensor 定义了预测最基本的输入输出的数据格式,其定义是
```c++
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
```
- `name` 用于指定输入数据对应的 模型中variable 的名字 (暂时没有用,但会在后续支持任意 target 时启用)
- `shape` 表示一个 Tensor 的 shape
- `data` 数据以连续内存的方式存储在`PaddleBuf` 中,`PaddleBuf` 可以接收外面的数据或者独立`malloc`内存,详细可以参考头文件中相关定义。
- `dtype` 表示 Tensor 的数据类型
## engine
高层 API 底层有多种优化实现,我们称之为 engine,目前有三种 engine
- 原生 engine,由 paddle 原生的 forward operator 组成,可以天然支持所有paddle 训练出的模型,
- Anakin engine,封装了 [Anakin](https://github.com/PaddlePaddle/Anakin) ,在某些模型上性能不错,但只能接受自带模型格式,无法支持所有 paddle 模型,
- TensorRT mixed engine,用子图的方式支持了 [TensorRT](https://developer.nvidia.com/tensorrt) ,支持所有paddle 模型,并自动切割部分计算子图到 TensorRT 上加速(WIP)
其实现为
```c++
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
};
```
## 预测部署过程
总体上分为以下步骤
1. 用合适的配置创建 `PaddlePredictor`
2. 创建输入用的 `PaddleTensor`,传入到 `PaddlePredictor`
3. 获取输出的 `PaddleTensor` ,将结果取出
下面完整演示一个简单的模型,部分细节代码隐去
```c++
#include "paddle_inference_api.h"
// 创建一个 config,并修改相关设置
paddle::NativeConfig config;
config.model_dir = "xxx";
config.use_gpu = false;
// 创建一个原生的 PaddlePredictor
auto predictor =
paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
// 创建输入 tensor
int64_t data[4] = {1, 2, 3, 4};
paddle::PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
// 创建输出 tensor,输出 tensor 的内存可以复用
std::vector<paddle::PaddleTensor> outputs;
// 执行预测
CHECK(predictor->Run(slots, &outputs));
// 获取 outputs ...
```
编译时,联编 `libpaddle_fluid.a/.so``libpaddle_inference_api.a/.so` 便可。
## 详细代码参考
- [inference demos](./demo)
- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc)
......@@ -73,12 +73,12 @@ struct PaddleTensor {
};
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
// TODO(Superjomn) support following engines latter.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
/*
......@@ -130,6 +130,11 @@ struct AnakinConfig : public PaddlePredictor::Config {
int max_batch_size{-1};
};
struct TensorRTConfig : public NativeConfig {
// Determine whether a subgraph will be executed by TRT.
int min_subgraph_size{1};
};
// A factory to help create different predictors.
//
// FOR EXTENSION DEVELOPER:
......
......@@ -89,6 +89,7 @@ bool NativePaddlePredictor::Init(
LOG(ERROR) << "fail to load inference model.";
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
executor_->CreateVariables(
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
......@@ -119,6 +120,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
return false;
}
for (size_t i = 0; i < feed_target_names_.size(); ++i) {
VLOG(4) << "setting " << i << "-th target";
feed_targets[feed_target_names_[i]] = &feeds[i];
}
// get fetch variable
......@@ -130,14 +132,16 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
}
// Run the inference program
// if share variables, we need not create variables
VLOG(4) << "Run prepared context";
executor_->RunPreparedContext(
ctx_.get(),
sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
&feed_targets,
&fetch_targets,
false /* don't create variable eatch time */);
VLOG(4) << "Finish prepared context";
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
LOG(ERROR) << "fail to get fetches";
return false;
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
......
......@@ -44,7 +44,7 @@ class NativePaddlePredictor : public PaddlePredictor {
~NativePaddlePredictor() override;
private:
protected:
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
using inference::analysis::Argument;
using inference::Singleton;
using inference::analysis::Analyzer;
using framework::proto::ProgramDesc;
class TensorRTSubgraphPredictor : public NativePaddlePredictor {
public:
explicit TensorRTSubgraphPredictor(const TensorRTConfig& config)
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
VLOG(3) << "Predictor::init()";
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
} else {
paddle::framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
}
executor_.reset(new paddle::framework::Executor(place_));
// Initialize the inference program
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
return false;
}
// Analyze inference_program
Argument argument;
argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
CHECK(argument.transformed_program_desc);
VLOG(5) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(5) << "to prepare executor";
*inference_program_->Proto() = *argument.transformed_program_desc;
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
executor_->CreateVariables(
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
private:
TensorRTConfig config_;
};
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
const TensorRTConfig& config) {
VLOG(3) << "create TensorRTSubgraphPredictor";
if (config.use_gpu) {
// 1. GPU memeroy
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory,
0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
}
std::unique_ptr<PaddlePredictor> predictor(
new TensorRTSubgraphPredictor(config));
if (!dynamic_cast<TensorRTSubgraphPredictor*>(predictor.get())
->Init(nullptr)) {
return nullptr;
}
return std::move(predictor);
}
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) {
//# 1. Create PaddlePredictor with a config.
TensorRTConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
auto predictor =
CreatePaddlePredictor<TensorRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config);
for (int batch_id = 0; batch_id < 3; batch_id++) {
//# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor);
//# 3. Run
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
}
}
}
TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
} // namespace paddle
\ No newline at end of file
......@@ -713,6 +713,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &var->Get<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value());
} else if (var->IsType<LoDTensorArray>()) {
const LoDTensorArray& arr = var->Get<LoDTensorArray>();
PADDLE_ENFORCE(arr.size() > 0);
t = &(arr[0]);
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
......
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
fluid_to_data_flow_graph_pass.cc
data_flow_graph_to_fluid_pass.cc
tensorrt_subgraph_pass.cc
dfg_graphviz_draw_pass.cc
DEPS framework_proto)
tensorrt_subgraph_pass.cc
tensorrt_subgraph_node_mark_pass.cc
analyzer.cc
helper.cc
DEPS framework_proto proto_desc)
cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
......@@ -28,5 +30,7 @@ inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
"Enable subgraph to TensorRT engine for acceleration");
DEFINE_string(inference_analysis_graphviz_log_root, "./",
"Graphviz debuger for data flow graphs.");
class DfgPassManagerImpl final : public DfgPassManager {
public:
DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs.
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [](const Node* node) {
if (!node->IsFunction()) return false;
return static_cast<const Function*>(node)->func_type() == "mul";
};
AddPass("tensorrt-subgraph-marker",
new TensorRTSubgraphNodeMarkPass(trt_teller));
AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
}
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
}
std::string repr() const override { return "dfg-pass-manager"; }
std::string description() const override { return "DFG pass manager."; }
private:
void AddPass(const std::string& name, Pass* pass) {
LOG(INFO) << "Adding pass " << name;
Register(name, pass);
AddGraphvizDebugerPass(pass);
}
// Add the graphviz debuger pass if the parent pass has one.
void AddGraphvizDebugerPass(Pass* pass) {
auto* debuger_pass = pass->CreateGraphvizDebugerPass();
if (debuger_pass) {
LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]";
Register(debuger_pass->repr(), debuger_pass);
}
}
};
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) {
for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument));
x->RunAll();
PADDLE_ENFORCE(x->Finalize());
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
\ No newline at end of file
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains Analyzer, an class that exposed as a library that analyze
* and optimize
* Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
* control whether
* an process is applied on the program.
*
* The processes are called Passes in analysis, the Passes are placed in a
* pipeline, the first
* Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
* a data flow
* graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
* graph to a
* Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
* which take a
* node or data flow graph as input.
*
* The Analyzer can be used in two methods, the first is a executable file which
* can be used to
* pre-process the inference model and can be controlled by passing difference
* command flags;
* the other way is to compose inside the inference API as a runtime pre-process
* phase in the
* inference service.
*/
#include <gflags/gflags.h>
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
namespace paddle {
namespace inference {
namespace analysis {
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
DECLARE_string(inference_analysis_graphviz_log_root);
class Analyzer : public OrderedRegistry<PassManager> {
public:
// Register all the pass-managers.
Analyzer();
void Run(Argument* argument);
DISABLE_COPY_AND_ASSIGN(Analyzer);
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST_F(DFG_Tester, main) {
Analyzer analyser;
analyser.Run(&argument);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -41,6 +41,9 @@ struct Argument {
// The original program desc.
std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc;
// The processed program desc.
std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
};
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
......
......@@ -20,7 +20,7 @@ namespace paddle {
namespace inference {
namespace analysis {
// It is a better idea that the inputs and outputs of this graph is set manully
// It is a better idea that the inputs and outputs of this graph is set manually
// before, but there must be a Pass that helps to prune the unnecessary ops that
// do not contribute to the given targets, so in this pass, analysis and get the
// inputs and outputs is OK.
......@@ -50,6 +50,25 @@ void DataFlowGraph::Build() {
outputs.push_back(out);
}
}
Clean();
}
void DataFlowGraph::Clean() {
for (auto &node : nodes.nodes()) {
std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
node->inlinks.end());
std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
node->outlinks.end());
if (inlinks_set.size() < node->inlinks.size()) {
LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
}
if (outlinks_set.size() < node->outlinks.size()) {
LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
}
}
}
std::string DataFlowGraph::DotString() const {
......
......@@ -47,6 +47,10 @@ struct DataFlowGraph {
// Output a DOT graph file for debug.
std::string DotString() const;
private:
// Remove duplicate edges and so on.
void Clean();
};
/*
......@@ -133,17 +137,24 @@ struct GraphTraits<DataFlowGraph> {
// Extract the inputs and outputs of a graph. The inputs and outputs of a
// sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph.
std::pair<
std::vector<Node *>,
std::vector<
Node *>> static ExtractInputAndOutputOfSubGraph(std::vector<Node *>
&graph) {
static std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {
std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs;
// Input a Value, check whether its inlink is in the subgraph.
auto inlink_in_subgraph = [&](Node *n) {
for (auto *in : n->inlinks) {
if (nodes.count(in)) return true;
}
return false;
};
for (auto &node : graph) {
for (auto *in : node->inlinks) {
if (!nodes.count(in) && in->type() == Node::Type::kValue) {
// The Value that is written by nodes inside a sub-graph shouldn't be the
// input of the sub-graph.
if (!nodes.count(in) && in->type() == Node::Type::kValue &&
!inlink_in_subgraph(in)) {
inputs.insert(in);
}
}
......
......@@ -13,21 +13,34 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::proto::ProgramDesc;
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes);
bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
desc_ = argument->origin_program_desc.get();
// Here some logic from program_desc.cc and will not add new interfaces into
// framework::ProgramDesc class, use some UT to assure the correctness.
auto* block = desc_->mutable_blocks()->Add();
block->set_idx(framework::kRootBlockIndex);
block->set_parent_idx(framework::kNoneBlockIndex);
PADDLE_ENFORCE(!argument->transformed_program_desc);
// The transformed_program_desc should inherit all the VarDesc and BlockDesc
// from the original program desc. The operators of the main block(the first
// block) should rewritten by data flow graph.
argument->transformed_program_desc.reset(
new ProgramDesc(*argument->origin_program_desc));
argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex)
->clear_ops();
desc_ = argument->transformed_program_desc.get();
argument_ = argument;
return true;
}
......@@ -37,14 +50,17 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
auto traits = GraphTraits<DataFlowGraph>(graph);
for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
if (it->deleted()) continue;
switch (it->type()) {
case Node::Type::kFunction:
LOG(INFO) << "add function " << it->name();
case Node::Type::kFunction: {
LOG(INFO) << "add function " << it->repr();
AddFluidOp(&(*it));
break;
case Node::Type::kFunctionBlock:
} break;
case Node::Type::kFunctionBlock: {
LOG(INFO) << "add engine op " << it->repr() << " , "
<< static_cast<FunctionBlock*>(&(*it))->subgraph.size();
AddEngineOp(&(*it));
break;
} break;
default:
continue;
}
......@@ -52,12 +68,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
}
void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
LOG(INFO) << "processing func " << node->name();
auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
// currently only the main block is analyzed.
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
LOG(INFO) << "to copy the op";
*op = *ori_op; // copy the attributes, by default, these will not be changed
// by analysis phrase.
// The inputs and outputs of the existing ops are not changed by tensorrt
......@@ -65,11 +79,89 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
// NOTE It might be changed by other passes in the long run.
}
void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
const framework::proto::BlockDesc& block) {
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
auto* func = static_cast<FunctionBlock*>(node);
// collect inputs
std::vector<std::string> io;
for (auto* x : func->inlinks) {
io.push_back(x->name());
}
desc.SetInput("Xs", io);
// collect outputs
io.clear();
for (auto* x : func->outlinks) {
io.push_back(x->name());
}
desc.SetOutput("Ys", io);
desc.SetType("tensorrt_engine");
// Set attrs
SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
SetAttr(desc.Proto(), "engine_unique_key",
"trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "max_workspace",
1024); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
node->SetPbMsg(desc.Proto()->SerializeAsString());
}
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes) {
std::vector<std::string> parameters;
for (const auto& node : nodes) {
if (!node->IsValue()) continue;
PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
framework::proto::VarDesc var;
var.ParseFromString(node->pb_msg());
if (var.persistable()) {
parameters.push_back(var.name());
}
}
return parameters;
}
void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
// auto* ori_op = static_cast<framework::proto::OpDesc*>(node->extra_info());
// auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
// auto* op = main_block->add_ops();
// TODO(Superjomn) Here need to expose some arguments for default setting.
PADDLE_ENFORCE(node->IsFunctionBlock());
auto* block_node = static_cast<FunctionBlock*>(node);
framework::proto::BlockDesc proto;
framework::BlockDesc block_desc(nullptr, &proto);
// copy ops.
for (auto* node : block_node->subgraph) {
auto* op = block_desc.AppendOp();
PADDLE_ENFORCE(!node->pb_msg().empty());
op->Proto()->ParseFromString(node->pb_msg());
}
CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
op->ParseFromString(node->pb_msg());
}
namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
bool Finalize() override { return true; }
};
}
Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger"));
}
} // namespace analysis
......
......@@ -40,10 +40,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
return "Transform a DFG to a Fluid ProgramDesc";
}
Pass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const override {
return nullptr;
}
Pass *CreateGraphvizDebugerPass() const override;
protected:
// Add a Fluid Op into the ProgramDesc.
......@@ -53,6 +50,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
private:
framework::proto::ProgramDesc *desc_;
Argument *argument_;
};
} // namespace analysis
} // namespace inference
......
......@@ -18,12 +18,19 @@ namespace paddle {
namespace inference {
namespace analysis {
int DFG_GraphvizDrawPass::counter_{0};
void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
auto content = Draw(graph);
std::ofstream file(GenDotPath());
auto dot_path = GenDotPath();
std::ofstream file(dot_path);
file.write(content.c_str(), content.size());
file.close();
LOG(INFO) << "draw dot to " << GenDotPath();
auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
std::string message;
LOG(INFO) << "draw to " << png_path;
ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
}
std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
......@@ -41,9 +48,7 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) {
if (!config_.display_deleted_node && in->deleted()) continue;
for (auto &in : node.inlinks) {
dot.AddEdge(in->repr(), node.repr(), {});
}
dot.AddEdge(in->repr(), node.repr(), {});
}
}
return dot.Build();
......
......@@ -50,20 +50,25 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
bool Initialize(Argument *argument) override { return true; }
void Run(DataFlowGraph *graph) override;
bool Finalize() override { return Pass::Finalize(); }
bool Finalize() override { return true; }
std::string repr() const override { return "DFG graphviz drawer"; }
std::string description() const override {
return "Debug a DFG by draw with graphviz";
}
private:
protected:
// A counter to add a number prefix to the debugger image output so that they
// will sort in the triggered order.
static int counter_;
// Path of the dot file to output.
std::string GenDotPath() const {
return config_.dir + "/" + "graph_" + config_.id + ".dot";
return config_.dir + "/" + std::to_string(counter_++) + "-graph_" +
config_.id + ".dot";
}
std::string Draw(DataFlowGraph *graph);
virtual std::string Draw(DataFlowGraph *graph);
Config config_;
};
......
......@@ -31,7 +31,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
pass.Run(&dfg);
// test content
std::ifstream file("./graph_test.dot");
std::ifstream file("./0-graph_test.dot");
ASSERT_TRUE(file.is_open());
std::string line;
......@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
no++;
}
// DFG is sensitive to ProgramDesc, be careful to change the existing models.
ASSERT_EQ(no, 112);
ASSERT_EQ(no, 82);
}
} // namespace analysis
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
namespace paddle {
......@@ -33,7 +35,7 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
return true;
}
bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); }
bool FluidToDataFlowGraphPass::Finalize() { return true; }
void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
PADDLE_ENFORCE(graph);
......@@ -46,6 +48,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
auto *v = graph->nodes.Create(Node::Type::kValue);
v->SetName(var.name());
v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
v->SetPbMsg(var.SerializeAsString());
var2id[var.name()] = v->id();
}
for (int i = 0; i < main_block.ops_size(); i++) {
......@@ -56,6 +59,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
// Link to the original protobuf message's memory, make it easier to
// generate from a data flow graph to fluid ProgramDesc.
o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
o->SetPbMsg(op.SerializeAsString());
// set inputs and outputs
// TODO(Superjomn) make sure the InputNames is the real variable name.
for (int j = 0; j < op.inputs_size(); j++) {
......@@ -79,9 +84,19 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
graph->Build();
}
Pass *FluidToDataFlowGraphPass::CreatePrinterPass(
std::ostream &os, const std::string &banner) const {
return nullptr;
namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
bool Finalize() override { return true; }
};
}
Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger"));
}
} // namespace analysis
......
......@@ -46,8 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
return "transform a fluid ProgramDesc to a data flow graph.";
}
Pass *CreatePrinterPass(std::ostream &os,
const std::string &banner) const override;
Pass *CreateGraphvizDebugerPass() const override;
private:
framework::proto::ProgramDesc const *desc_;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/framework/framework.pb.h"
namespace paddle {
namespace inference {
namespace analysis {
template <>
void SetAttr<std::string>(framework::proto::OpDesc *op, const std::string &name,
const std::string &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRING);
attr->set_s(data);
}
template <>
void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
const int &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
const int64_t &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::LONG);
attr->set_l(data);
}
template <>
void SetAttr<std::vector<std::string>>(framework::proto::OpDesc *op,
const std::string &name,
const std::vector<std::string> &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRINGS);
for (const auto &s : data) {
attr->add_strings(s.c_str());
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -14,10 +14,12 @@ limitations under the License. */
#pragma once
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -26,6 +28,10 @@ namespace paddle {
namespace inference {
namespace analysis {
template <typename T>
void SetAttr(framework::proto::OpDesc *op, const std::string &name,
const T &data);
template <typename Vec>
int AccuDims(Vec &&vec, int size) {
int res = 1;
......@@ -93,7 +99,7 @@ template <typename T>
class OrderedRegistry {
public:
T *Register(const std::string &name, T *x) {
PADDLE_ENFORCE(!dic_.count(name));
PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
dic_[name] = data_.size();
data_.emplace_back(std::unique_ptr<T>(x));
return data_.back().get();
......@@ -117,6 +123,20 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
return *var->GetMutable<T>();
}
static void ExecShellCommand(const std::string &cmd, std::string *message) {
char buffer[128];
std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
if (!pipe) {
LOG(ERROR) << "error running command: " << cmd;
return;
}
while (!feof(pipe.get())) {
if (fgets(buffer, 128, pipe.get()) != nullptr) {
*message += buffer;
}
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
......@@ -20,6 +20,17 @@ namespace paddle {
namespace inference {
namespace analysis {
template <>
std::string &NodeAttr::As<std::string>() {
if (data_.empty()) {
type_hash_ = typeid(std::string).hash_code();
}
PADDLE_ENFORCE_EQ(type_hash_, typeid(std::string).hash_code());
return data_;
}
std::string &NodeAttr::String() { return As<std::string>(); }
std::vector<Dot::Attr> Value::dot_attrs() const {
return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
Dot::Attr("shape", "box"),
......
......@@ -35,6 +35,44 @@ namespace analysis {
class NodeMap;
// A helper class to maintain the status from Pass.
struct NodeAttr {
// NOTE T should be a primary type or a struct combined by several primary
// types.
// NOTE the STL containers should not use here.
// Some usages
// Attr attr;
// attr.Bool() = true;
bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); }
std::string &String();
private:
template <typename T>
T &As() {
// init storage in the first usage.
if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T);
type_hash_ = typeid(T).hash_code();
data_.resize(sizeof(T));
}
PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_hash_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
}
private:
std::string data_;
size_t type_hash_{std::numeric_limits<size_t>::max()};
};
/*
* Node Representation.
*
......@@ -50,8 +88,6 @@ class Node {
Node() = default;
struct Attr;
// Cast to a subclass type, Function for example.
template <typename Subclass>
Subclass &As() {
......@@ -71,7 +107,7 @@ class Node {
// Get an additional attribute and convert it to T data type. NOTE this will
// silently create a new attribute if not exists.
Attr &attr(const std::string &name) const { return attrs_[name]; }
NodeAttr &attr(const std::string &name) const { return attrs_[name]; }
int id() const { return id_; }
......@@ -80,6 +116,9 @@ class Node {
void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; }
void *pb_desc() const { return attr("pb_desc").Pointer(); }
void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; }
const std::string &pb_msg() const { return attr("pb_msg").String(); }
void SetDeleted() { deleted_ = true; }
bool deleted() const { return deleted_; }
......@@ -94,43 +133,6 @@ class Node {
// Output links.
std::vector<Node *> outlinks;
// A helper class to maintain the status from Pass.
struct Attr {
// NOTE T should be a primary type or a struct combined by several primary
// types.
// NOTE the STL containers should not use here.
// Some usages
// Attr attr;
// attr.Bool() = true;
bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); }
private:
template <typename T>
T &As() {
// init storage in the first usage.
if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T);
type_hash_ = typeid(T).hash_code();
data_.resize(sizeof(T));
}
PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_hash_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
}
private:
std::string data_;
size_t type_hash_{std::numeric_limits<size_t>::max()};
};
// Type checks.
bool IsFunction() const { return type_ == Node::Type::kFunction; }
bool IsValue() const { return type_ == Node::Type::kValue; }
......@@ -150,7 +152,7 @@ class Node {
Type type_{Type::kNone};
// Mark this node is deleted by some pass.
bool deleted_{false};
mutable std::unordered_map<std::string, Attr> attrs_;
mutable std::unordered_map<std::string, NodeAttr> attrs_;
};
class Function;
......@@ -213,6 +215,10 @@ class Function : public Node {
struct FunctionBlock : public Node {
std::string repr() const override { return "block-" + std::to_string(id()); }
std::vector<Node *> subgraph;
protected:
FunctionBlock() { SetType(Node::Type::kFunctionBlock); }
friend class NodeMap;
};
class NodeMap {
......@@ -227,7 +233,7 @@ class NodeMap {
void Delete(size_t id);
const std::vector<std::unique_ptr<Node>> &nodes() { return nodes_; }
const std::vector<std::unique_ptr<Node>> &nodes() const { return nodes_; }
size_t size() const { return nodes_.size(); }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file contains all the flags that declared in Node::Attr.
*
* The Node::Attr is designed to share information between different passes, one
* can get other's attributes in a Node by the flags in this file.
*/
#pragma once
namespace paddle {
namespace inference {
namespace analysis {
#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__;
DECLARE_NODE_ATTR(supported_by_tensorrt) // bool
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -60,6 +60,9 @@ class Pass {
return nullptr;
}
// Create a debugger Pass that draw the DFG by graphviz toolkit.
virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
// Run on a single Node.
virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
// Run on a single Function.
......
......@@ -19,6 +19,18 @@ namespace paddle {
namespace inference {
namespace analysis {
bool PassManager::Initialize(Argument* argument) {
argument_ = argument;
for (auto& pass : data_) {
LOG(INFO) << "Initializing pass " << pass->repr();
if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false;
}
}
return true;
}
void DfgPassManager::RunAll() {
PADDLE_ENFORCE(argument_);
for (auto& pass : data_) {
......
......@@ -50,17 +50,7 @@ class PassManager : public OrderedRegistry<Pass> {
// globally shared, so pass them as the arguemnts for all the pass managers.
virtual bool Initialize(const Argument& argument) { return false; }
virtual bool Initialize(Argument* argument) {
argument_ = argument;
for (auto& pass : data_) {
LOG(INFO) << "Initializing pass " << pass->repr();
if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false;
}
}
return true;
}
virtual bool Initialize(Argument* argument);
// Call all the passes' Finalize methods.
virtual bool Finalize() {
......
......@@ -64,6 +64,7 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
ASSERT_TRUE(&argument);
ASSERT_TRUE(manager.Initialize(&argument));
manager.RunAll();
}
......
......@@ -119,10 +119,12 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) {
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph
// as deleted. 3. Replace the deleted node with the new Block Node.
auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock);
auto *block_node = static_cast<FunctionBlock *>(
graph_->nodes.Create(Node::Type::kFunctionBlock));
auto io = ExtractInputAndOutputOfSubGraph(subgraph);
block_node->inlinks = std::move(io.first);
block_node->outlinks = std::move(io.second);
......@@ -130,21 +132,25 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
// TODO(Superjomn) need a unified mechanism to treat deleted node in each
// pass.
node->SetDeleted();
block_node->subgraph.push_back(node);
}
std::unordered_map<Node *, Node *>
delelte_node_map; // deleted node to BlockNode
for (auto *n : block_node->inlinks) {
n->inlinks.clear();
}
for (auto *n : block_node->outlinks) {
n->outlinks.clear();
}
for (auto *n : block_node->inlinks) {
n->outlinks.push_back(block_node);
// Change all the sub-graph's inputs and outputs corresponding inlink and
// outlink to this sub-graph node.
auto inlink_or_outlink_cleaner = [&](std::vector<Node *> &nodes) {
for (auto *&n : nodes) {
if (subgraph_uniq.count(n)) {
n = block_node;
}
}
std::unordered_set<Node *> uniq(nodes.begin(), nodes.end());
nodes.assign(uniq.begin(), uniq.end());
};
for (auto *i : block_node->inlinks) {
inlink_or_outlink_cleaner(i->outlinks);
}
for (auto *n : block_node->outlinks) {
n->inlinks.push_back(n);
for (auto *&o : block_node->outlinks) {
inlink_or_outlink_cleaner(o->inlinks);
}
}
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/node_attr_flags.h"
namespace paddle {
namespace inference {
namespace analysis {
void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
for (auto &node : graph->nodes.nodes()) {
node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get());
}
}
class DfgDebuggerPass : public DFG_GraphvizDrawPass {
public:
DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override {
return "tensorrt-subgraph-node-mark-debugger";
}
bool Finalize() override { return true; }
protected:
std::string Draw(DataFlowGraph *graph) override {
Dot dot;
// Add nodes
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (config_.display_deleted_node || !node.deleted()) {
auto dot_attr = node.dot_attrs();
if (node.attr(ATTR_supported_by_tensorrt).Bool()) {
dot_attr.assign(
{Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}});
}
dot.AddNode(node.repr(), dot_attr);
}
}
// Add edges
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) {
if (!config_.display_deleted_node && in->deleted()) continue;
dot.AddEdge(in->repr(), node.repr(), {});
}
}
return dot.Build();
}
};
Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
DFG_GraphvizDrawPass::Config config(
FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node");
return new DfgDebuggerPass(config);
}
bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops
* that supported by TensorRT engine.
*/
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* Mark the operators that TensorRT engine supports.
*/
class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
public:
using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller;
TensorRTSubgraphNodeMarkPass(const teller_t& teller) : teller_(teller) {}
bool Initialize(Argument* argument) override { return true; }
// This class get a sub-graph as input and determine whether to transform this
// sub-graph into TensorRT.
void Run(DataFlowGraph* graph) override;
std::string repr() const { return "tensorrt-sub-subgraph-mark"; }
std::string description() const { return "tensorrt sub-graph mark pass"; }
Pass* CreateGraphvizDebugerPass() const override;
bool Finalize() override;
private:
teller_t teller_;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/node_attr_flags.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
// init
FluidToDataFlowGraphPass pass;
ASSERT_TRUE(pass.Initialize(&argument));
argument.main_dfg.reset(new DataFlowGraph);
pass.Run(argument.main_dfg.get());
TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
return node->IsFunction() &&
static_cast<const Function*>(node)->func_type() == "mul";
};
TensorRTSubgraphNodeMarkPass pass1(teller);
ASSERT_TRUE(pass1.Initialize(&argument));
pass1.Run(argument.main_dfg.get());
int counter{0};
for (auto& node : argument.main_dfg->nodes.nodes()) {
counter += node->attr(ATTR_supported_by_tensorrt).Bool();
}
LOG(INFO) << counter << " nodes marked";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
: node_inside_subgraph_teller_(teller) {}
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_);
SubGraphFuse(graph, node_inside_subgraph_teller_)();
}
} // namespace analysis
......
......@@ -38,6 +38,11 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
// sub-graph into TensorRT.
void Run(DataFlowGraph* graph) override;
bool Finalize() override { return true; }
std::string repr() const { return "tensorrt-sub-graph"; }
std::string description() const { return "tensorrt sub graph pass"; }
private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_;
};
......
......@@ -23,49 +23,48 @@ namespace paddle {
namespace inference {
namespace analysis {
DEFINE_string(model_dir, "", "inference test model dir");
DEFINE_string(dot_dir, "./", "");
TEST(TensorRTSubGraph, single_pass) {
auto desc = LoadProgramDesc();
auto dfg = ProgramDescToDFG(desc);
SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
TEST_F(DFG_Tester, tensorrt_single_pass) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "sigmoid"});
SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
if (node->type() != Node::Type::kFunction) return false;
const auto* func = static_cast<const Function*>(node);
if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
func->func_type() == "conv2d" || func->func_type() == "mul" ||
func->func_type() == "sigmoid" || func->func_type() == "softmax") {
LOG(INFO) << "sub-graph marked " << node->repr();
return true;
}
if (teller_set.count(func->func_type())) return true;
return false;
};
DFG_GraphvizDrawPass::Config config{"./", "test"};
DFG_GraphvizDrawPass dfg_pass(config);
dfg_pass.Initialize();
DFG_GraphvizDrawPass dfg_pass1(config);
dfg_pass1.Initialize();
dfg_pass.Run(&dfg);
LOG(INFO) << "init";
DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
DFG_GraphvizDrawPass dfg_pass(config);
DFG_GraphvizDrawPass dfg_pass1(config1);
FluidToDataFlowGraphPass pass0;
TensorRTSubGraphPass trt_pass(std::move(teller));
trt_pass.Initialize();
trt_pass.Run(&dfg);
LOG(INFO) << "Initialize";
dfg_pass.Initialize(&argument);
dfg_pass1.Initialize(&argument);
pass0.Initialize(&argument);
trt_pass.Initialize(&argument);
dfg_pass1.Run(&dfg);
LOG(INFO) << "Run";
argument.main_dfg.reset(new DataFlowGraph);
pass0.Run(argument.main_dfg.get());
dfg_pass.Run(argument.main_dfg.get());
trt_pass.Run(argument.main_dfg.get());
dfg_pass1.Run(argument.main_dfg.get());
// Check the TRT op's block desc
for (auto node : dfg.nodes.nodes()) {
for (auto& node : argument.main_dfg->nodes.nodes()) {
if (node->IsFunctionBlock()) {
LOG(INFO) << "get function block";
}
}
}
TEST(TensorRTSubGraph, pass_manager) {}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -226,7 +226,8 @@ op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND)
op_library(tensorrt_engine_op DEPS tensorrt_engine)
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter)
DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
analysis)
else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
endif()
......
......@@ -56,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel {
"Beta2 power accumulator should have 1 dimension");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of AdamOp should have same dimension");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of AdamOp should have same dimension");
}
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
"Param and Moment1 input of AdamOp should have same dimension");
......
......@@ -282,6 +282,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto& grad =
Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
if (grad.rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
// merge duplicated rows if any.
scatter::MergeAdd<DeviceContext, T> merge_func;
auto grad_merge =
......
......@@ -66,6 +66,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
......@@ -111,6 +112,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats;
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
auto src_memory =
......
......@@ -155,6 +155,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_with_relu",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Batch Normalization.
......
......@@ -302,6 +302,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators;
// conv2d_transpose
REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
......@@ -317,6 +318,7 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
// conv3d_transpose
REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
ops::Conv3DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
......@@ -331,3 +333,19 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
// depthwise conv2d_transpose
REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
......@@ -15,25 +15,28 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h"
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(
conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
double>);
REGISTER_OP_CUDA_KERNEL(
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
double>);
// conv2d
REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>,
ops::GemmConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<CUDA, double>);
// conv3d
REGISTER_OP_CUDA_KERNEL(conv3d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>,
ops::GemmConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<CUDA, double>);
// depthwise conv2d
REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose,
ops::DepthwiseConvTransposeKernel<CUDA, float>,
ops::DepthwiseConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad,
ops::DepthwiseConvTransposeGradKernel<CUDA, float>,
ops::DepthwiseConvTransposeGradKernel<CUDA, double>);
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
......@@ -316,5 +317,74 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
}
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(groups, filter.dims()[0]);
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1);
}
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
output);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
input_grad);
}
if (filter_grad) {
math::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
filter_grad);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -51,6 +51,12 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
}
};
template <class T>
bool DistPairDescend(std::tuple<int, int, T> pair1,
std::tuple<int, int, T> pair2) {
return std::get<2>(pair1) > std::get<2>(pair2);
}
template <typename T>
class BipartiteMatchKernel : public framework::OpKernel<T> {
public:
......@@ -58,46 +64,76 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
// The match_dist must be initialized to 0 at first.
void BipartiteMatch(const Tensor& dist, int* match_indices,
T* match_dist) const {
constexpr T kEPS = static_cast<T>(1e-6);
PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
int64_t row = dist.dims()[0];
int64_t col = dist.dims()[1];
auto* dist_data = dist.data<T>();
std::vector<int> row_pool;
for (int i = 0; i < row; ++i) {
row_pool.push_back(i);
}
while (row_pool.size() > 0) {
int max_idx = -1;
int max_row_idx = -1;
T max_dist = -1;
for (int64_t j = 0; j < col; ++j) {
if (match_indices[j] != -1) {
continue;
// Test result: When row==130 the speed of these two methods almost the same
if (row >= 130) {
std::vector<std::tuple<int, int, T>> match_pair;
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j]));
}
for (size_t k = 0; k < row_pool.size(); ++k) {
int m = row_pool[k];
// distance is 0 between m-th row and j-th column
if (dist_data[m * col + j] < kEPS) {
}
std::sort(match_pair.begin(), match_pair.end(), DistPairDescend<T>);
std::vector<int> row_indices(row, -1);
int64_t idx = 0;
for (int64_t k = 0; k < row * col; ++k) {
int64_t i = std::get<0>(match_pair[k]);
int64_t j = std::get<1>(match_pair[k]);
T dist = std::get<2>(match_pair[k]);
if (idx >= row) {
break;
}
if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) {
match_indices[j] = i;
row_indices[i] = j;
match_dist[j] = dist;
idx += 1;
}
}
} else {
constexpr T kEPS = static_cast<T>(1e-6);
std::vector<int> row_pool;
for (int i = 0; i < row; ++i) {
row_pool.push_back(i);
}
while (row_pool.size() > 0) {
int max_idx = -1;
int max_row_idx = -1;
T max_dist = -1;
for (int64_t j = 0; j < col; ++j) {
if (match_indices[j] != -1) {
continue;
}
if (dist_data[m * col + j] > max_dist) {
max_idx = j;
max_row_idx = m;
max_dist = dist_data[m * col + j];
for (size_t k = 0; k < row_pool.size(); ++k) {
int m = row_pool[k];
// distance is 0 between m-th row and j-th column
if (dist_data[m * col + j] < kEPS) {
continue;
}
if (dist_data[m * col + j] > max_dist) {
max_idx = j;
max_row_idx = m;
max_dist = dist_data[m * col + j];
}
}
}
}
if (max_idx == -1) {
// Cannot find good match.
break;
} else {
PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
match_indices[max_idx] = max_row_idx;
match_dist[max_idx] = max_dist;
// Erase the row index.
row_pool.erase(
std::find(row_pool.begin(), row_pool.end(), max_row_idx));
if (max_idx == -1) {
// Cannot find good match.
break;
} else {
PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
match_indices[max_idx] = max_row_idx;
match_dist[max_idx] = max_dist;
// Erase the row index.
row_pool.erase(
std::find(row_pool.begin(), row_pool.end(), max_row_idx));
}
}
}
}
......
......@@ -26,8 +26,12 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
"Input(X) of FillZerosLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillZerosLikeOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
if (ctx->IsRuntime() &&
ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR_ARRAY) {
return; // skip runtime infershape when is tensor array;
}
}
};
......@@ -39,7 +43,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
FillZerosLike Operator.
Fill up a variable with zeros.
Fill up a variable with zeros, supporting both LoDTensor and LoDTensorArray.
The output will have the same size as the input.
)DOC");
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
......@@ -23,12 +24,29 @@ template <typename DeviceContext, typename T>
class FillZerosLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), out,
static_cast<T>(0));
auto var = context.InputVar("X");
if (var->IsType<framework::LoDTensor>()) {
auto& input = *context.Input<framework::LoDTensor>("X");
auto& output = *context.Output<framework::LoDTensor>("Out");
output.Resize(input.dims());
output.set_lod(input.lod());
output.mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), &(output),
static_cast<T>(0));
} else if (var->IsType<framework::LoDTensorArray>()) {
auto& input = *context.Input<framework::LoDTensorArray>("X");
auto& output = *context.Output<framework::LoDTensorArray>("Out");
output.resize(input.size());
for (auto i = 0; i < input.size(); i++) {
output[i].Resize(input[i].dims());
output[i].set_lod(input[i].lod());
output[i].mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter;
setter(context.template device_context<DeviceContext>(), &(output[i]),
static_cast<T>(0));
}
}
}
};
......
......@@ -53,6 +53,7 @@ template <typename DeviceContext, typename T>
class TensorRTEngineKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
VLOG(4) << "TensorRTEngineKernel executing";
auto engine_name = context.Attr<std::string>("engine_uniq_key");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context);
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
......@@ -51,48 +52,10 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
*var = *desc.Proto();
}
template <typename T>
void SetAttr(framework::proto::OpDesc* op, const std::string& name,
const T& data);
template <>
void SetAttr<std::string>(framework::proto::OpDesc* op, const std::string& name,
const std::string& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRING);
attr->set_s(data);
}
template <>
void SetAttr<int>(framework::proto::OpDesc* op, const std::string& name,
const int& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
const int64_t& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::LONG);
attr->set_l(data);
}
template <>
void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
const std::string& name,
const std::vector<std::string>& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRINGS);
for (const auto& s : data) {
attr->add_strings(s.c_str());
}
}
} // namespace
using inference::analysis::SetAttr;
TEST(TensorRTEngineOp, manual) {
framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks();
......
......@@ -106,6 +106,7 @@ function cmake_gen() {
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
......@@ -133,7 +134,8 @@ EOF
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_ANAKIN=${WITH_ANAKIN:-ON}
-DWITH_ANAKIN=${WITH_ANAKIN:-ON} \
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
}
function abort(){
......
......@@ -111,7 +111,7 @@ def fetch():
paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
def convert(path):
......
......@@ -95,6 +95,7 @@ __all__ = [
'relu',
'log',
'crop',
'fill_zeros_like',
]
......@@ -1993,7 +1994,8 @@ def batch_norm(input,
name=None,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=False):
do_model_average_for_mean_and_var=False,
fuse_with_relu=False):
"""
**Batch Normalization Layer**
......@@ -2036,6 +2038,7 @@ def batch_norm(input,
moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
fuse_with_relu (bool): if True, this OP performs relu after batch norm.
Returns:
Variable: A tensor variable which is the result after applying batch normalization on the input.
......@@ -2121,7 +2124,8 @@ def batch_norm(input,
"momentum": momentum,
"epsilon": epsilon,
"is_test": is_test,
"use_mkldnn": use_mkldnn
"use_mkldnn": use_mkldnn,
"fuse_with_relu": fuse_with_relu
})
return helper.append_activation(batch_norm_out)
......@@ -2334,10 +2338,17 @@ def conv2d_transpose(input,
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
"""
helper = LayerHelper("conv2d_transpose", **locals())
input_channel = input.shape[1]
op_type = 'conv2d_transpose'
if (input_channel == groups and num_filters == input_channel and
not use_cudnn):
op_type = 'depthwise_conv2d_transpose'
helper = LayerHelper(op_type, **locals())
if not isinstance(input, Variable):
raise TypeError("Input of conv2d_transpose must be Variable")
input_channel = input.shape[1]
padding = utils.convert_to_list(padding, 2, 'padding')
stride = utils.convert_to_list(stride, 2, 'stride')
......@@ -2371,7 +2382,7 @@ def conv2d_transpose(input,
pre_bias = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op(
type='conv2d_transpose',
type=op_type,
inputs={'Input': [input],
'Filter': [img_filter]},
outputs={'Output': pre_bias},
......@@ -5174,3 +5185,40 @@ def crop(x, shape=None, offsets=None, name=None):
outputs={'Out': out},
attrs=None if len(attrs) == 0 else attrs)
return out
def fill_zeros_like(x):
"""
This layer takes an input and outputs a variable that has the same structure as
the input and with all the element values as zero. The variable can be a Tensor
or TensorArray.
.. code-block:: text
Given
X = [[0, 1, 2, 0],
[0, 3, 4, 0],
[0, 0, 0, 0]],
output is:
Out = [[0, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]].
Args:
x (Variable): The input variable, which could be a tensor or tensor array
Returns:
Variable: The zero-filled variable, which has the same type and shape as
the input variable.
Examples:
.. code-block:: python
y = fluid.layers.fill_zeros_like(x)
"""
helper = LayerHelper('fill_zeros_like', **locals())
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(
type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]})
return out
......@@ -51,3 +51,4 @@ py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
......@@ -52,5 +52,17 @@ class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference):
def init_kernel_type(self):
self.use_mkldnn = True
self.fuse_with_relu = True
def test_check_output(self):
place = core.CPUPlace()
data_format = "NCHW"
self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
if __name__ == '__main__':
unittest.main()
......@@ -159,6 +159,7 @@ class TestBatchNormOpInference(unittest.TestCase):
def setUp(self):
self.dtype = np.float32
self.use_mkldnn = False
self.fuse_with_relu = False
self.init_kernel_type()
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
......@@ -180,6 +181,8 @@ class TestBatchNormOpInference(unittest.TestCase):
scale_shape = [c]
x_val = np.random.random_sample(x_shape).astype(dtype)
# generate some negative values to test case with relu fused
x_val = x_val - 0.5
scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32)
......@@ -188,6 +191,8 @@ class TestBatchNormOpInference(unittest.TestCase):
y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
epsilon, data_layout).astype(dtype)
if self.fuse_with_relu:
y_out = np.maximum(y_out, 0)
scope = core.Scope()
......@@ -233,6 +238,7 @@ class TestBatchNormOpInference(unittest.TestCase):
is_test=True,
data_layout=data_layout,
use_mkldnn=self.use_mkldnn,
fuse_with_relu=self.fuse_with_relu,
epsilon=epsilon)
batch_norm_op.run(scope, place)
......@@ -265,6 +271,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def setUp(self):
self.dtype = np.float16
self.use_mkldnn = False
self.fuse_with_relu = False
self.init_kernel_type()
def test_check_output(self):
......@@ -284,6 +291,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
class TestBatchNormOpTraining(unittest.TestCase):
def setUp(self):
self.use_mkldnn = False
self.fuse_with_relu = False
self.data_formats = ["NCHW", "NHWC"]
self.init_kernel_type()
......@@ -367,7 +375,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
"epsilon": epsilon,
"is_test": False,
"data_layout": data_layout,
"use_mkldnn": self.use_mkldnn
"use_mkldnn": self.use_mkldnn,
"fuse_with_relu": self.fuse_with_relu
})
block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
......
......@@ -114,6 +114,23 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
self.check_output()
class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest):
def setUp(self):
self.op_type = 'bipartite_match'
lod = [[300]]
dist = np.random.random((300, 17)).astype('float32')
match_indices, match_dist = batch_bipartite_match(dist, lod[0])
self.inputs = {'DistMat': dist}
self.outputs = {
'ColToRowMatchIndices': match_indices,
'ColToRowMatchDist': match_dist,
}
def test_check_output(self):
self.check_output()
class TestBipartiteMatchOpWithPerPredictionType(OpTest):
def setUp(self):
self.op_type = 'bipartite_match'
......
......@@ -242,6 +242,19 @@ class TestCUDNNWithGroups(TestWithGroups):
self.op_type = "conv2d_transpose"
class TestDepthwiseConvTranspose(TestConv2dTransposeOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.dilations = [1, 1]
self.input_size = [2, 8, 16, 16] # NCHW
self.groups = 8
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [self.input_size[1], f_c, 4, 4]
self.op_type = "depthwise_conv2d_transpose"
# Please Don't remove the following code.
# Currently, CI use cudnn V5.0 which not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
SEED = 1
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# random seed must set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
# TODO(dzhwinter) : refine the initializer and random seed settting
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)))
return predict
def get_model(batch_size):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=trainer_id,
program=main_program,
pservers=pserver_endpoints,
trainers=trainers)
return t
def run_pserver(pserver_endpoints, trainers, current_endpoint):
get_model(batch_size=20)
t = get_transpiler(0,
fluid.default_main_program(), pserver_endpoints,
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
exe.run(pserver_prog)
class TestDistMnist(unittest.TestCase):
def setUp(self):
self._trainers = 1
self._pservers = 1
self._ps_endpoints = "127.0.0.1:9123"
def start_pserver(self, endpoint):
p = Process(
target=run_pserver,
args=(self._ps_endpoints, self._trainers, endpoint))
p.start()
return p.pid
def _wait_ps_ready(self, pid):
retry_times = 5
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(1)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
def stop_pserver(self, pid):
os.kill(pid, signal.SIGTERM)
def test_with_place(self):
p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
pserver_pid = self.start_pserver(self._ps_endpoints)
self._wait_ps_ready(pserver_pid)
self.run_trainer(p, 0)
self.stop_pserver(pserver_pid)
def run_trainer(self, place, trainer_id):
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
batch_size=20)
t = get_transpiler(trainer_id,
fluid.default_main_program(), self._ps_endpoints,
self._trainers)
trainer_prog = t.get_trainer_program()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
feed_var_list = [
var for var in trainer_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in xrange(10):
for batch_id, data in enumerate(train_reader()):
exe.run(trainer_prog, feed=feeder.feed(data))
if (batch_id + 1) % 10 == 0:
acc_set = []
avg_loss_set = []
for test_data in test_reader():
acc_np, avg_loss_np = exe.run(
program=test_program,
feed=feeder.feed(test_data),
fetch_list=[batch_acc, avg_cost])
acc_set.append(float(acc_np))
avg_loss_set.append(float(avg_loss_np))
# get test acc and loss
acc_val = np.array(acc_set).mean()
avg_loss_val = np.array(avg_loss_set).mean()
if float(acc_val
) > 0.8: # Smaller value to increase CI speed
return
else:
print(
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1,
float(avg_loss_val), float(acc_val)))
if math.isnan(float(avg_loss_val)):
assert ("got Nan loss, training failed.")
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid.core as core
import numpy
import paddle.fluid.layers as layers
from paddle.fluid.framework import Program, program_guard
from paddle.fluid.executor import Executor
import paddle.fluid as fluid
import paddle.fluid.core as core
class TestFillZerosLikeOpForTensorArray(unittest.TestCase):
def place(self):
return core.CPUPlace()
def test_zero_filling_lod_tensor_array(self):
tensor = core.LoDTensor()
tensor.set(
numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
expect = [
numpy.array(
[0, 0, 0, 0, 0], dtype='int32'), numpy.array(
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int32'),
numpy.array(
[0, 0, 0], dtype='int32')
]
lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
self.main(
tensor=tensor,
expect_array=expect,
expect_lod=lod,
expect_max_len=3)
def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
place = self.place()
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[10])
x.persistable = True
table = layers.lod_rank_table(x, level=level)
max_len = layers.max_sequence_len(table)
max_len.persistable = True
array = layers.lod_tensor_to_array(x, table)
array = layers.fill_zeros_like(array)
array.persistable = True
result = layers.array_to_lod_tensor(array, table)
result.persistable = True
exe = Executor(place)
scope = core.Scope()
exe.run(program, feed={'x': tensor}, scope=scope)
var = scope.find_var(array.name)
array = var.get_lod_tensor_array()
if expect_array is not None and expect_lod is not None:
self.check_array_same(array, expect_array, expect_lod)
self.assertEqual(
numpy.array(scope.find_var(max_len.name).get_tensor())[0],
expect_max_len)
def check_array_same(self, array, expect_tensor, expect_lod):
self.assertEqual(len(expect_tensor), len(array))
for i, exp in enumerate(zip(expect_tensor, expect_lod)):
exp_tensor, exp_lod = exp
exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
self.assertEqual(exp_lod, array[i].lod())
if __name__ == '__main__':
unittest.main()
......@@ -315,7 +315,7 @@ class Trainer(object):
for ip in worker_ips.split(","):
worker_endpoints.append(':'.join([ip, port]))
self.num_trainers = len(worker_endpoints)
current_endpoint = os.getenv("POD_IP") + ":" + port
current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
worker_endpoints.remove(current_endpoint)
# TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id
# in ParallelExecutor to start
......
......@@ -301,6 +301,7 @@ class DistributeTranspiler(object):
Program: trainer side program.
"""
# remove optimize ops and add a send op to main_program
# FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
delete_ops(self.origin_program.global_block(), self.optimize_ops)
self.origin_program.__str__()
return self.origin_program
......@@ -537,7 +538,6 @@ class DistributeTranspiler(object):
# 2. rename op outputs
for op in orig_s_prog.global_block().ops:
new_inputs = dict()
new_outputs = dict()
# do not append startup op if var is not on this pserver
op_on_pserver = False
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from .. import core
from ..framework import Program
......@@ -20,12 +21,15 @@ from ..executor import global_scope
class InferenceTranspiler:
'''
Convert the fluid program to optimized inference program.
There are several optimizations, only fuse batch normalization is supported now.
Convert the fluid program to optimized inference program.
There are several optimizations:
- fuse convolution and batch normalization
- fuse batch normalization and relu (MKLDNN only)
Examples:
.. code-block:: python
# As InferenceTranspiler will modify the original program,
......@@ -54,19 +58,64 @@ class InferenceTranspiler:
if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None")
self.fuse_batch_norm(program, place, scope)
self.fuse_relu_mkldnn(program)
def fuse_relu_mkldnn(self, program):
'''
Transpile the program by fused relu activation for MKLDNN program.
Relu activation following batch norm OP can be fused by adding
:math:`fuse_with_relu` attribute to batch norm OP.
The result of fuse is:
- before:
- batch_norm->relu->any_other_op
- after:
- batch_norm->any_other_op
:param program: program to transpile
:type program: Program
'''
use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
if not use_mkldnn:
return
self.block = program.block(0)
i = 0
while i < len(self.block.ops) - 1:
current_op = self.block.ops[i]
if current_op.type in ['batch_norm']:
next_op = self.block.ops[i + 1]
if next_op.type == 'relu':
# modify bnorm OP to include relu
current_op.set_attr("fuse_with_relu", True)
# remove relu OP
self.block.remove_op(i + 1)
i = i + 1
self._remove_unused_var()
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
def fuse_batch_norm(self, program, place, scope):
'''
Transpile the program by fused batch normalization.
The batch normalization followed the convolution or fully connected layer
can be integrated with them. Doing so will give us a forward acceleration,
The batch normalization followed the convolution or fully connected layer
can be integrated with them. Doing so will give us a forward acceleration,
especially in environments like mobile or embedded.
For input :math:`X`:
- Conv process: :math:`X = input * W + bias`
- Batch norm process: :math:`X' = (X - mean) / std`
- Conv process: :math:`X = input * W + bias`
- Batch norm process: :math:`X' = (X - mean) / std`
- Scale Process: :math:`Y = a * X' + b`
After fuse into one operation:
......@@ -76,17 +125,17 @@ class InferenceTranspiler:
Y &= (input * W + bias - mean) / std * a + b \\\\
&= input * a * W / std + ((bias - mean) / std * a + b)
The operator transformation is:
The operator transformation is:
- before:
- conv->batch_norm->any_other_op (bias == 0)
- conv->elementwise_add->batch_norm->any_other_op (bias != 0)
- after:
- after:
- conv->elementwise_add->any_other_op
The transpile stages are:
1. insert elementwise_add op when bias == 0.
......@@ -99,20 +148,20 @@ class InferenceTranspiler:
program (Program): program to transpile
place (Place): inference place
scope (Scope): inference Scope
'''
self.scope = scope
self.place = place
self.block = program.block(0)
self.input_map = {} # store the input names should be adjusted
self.input_map = {} # store the input names should be adjusted
i = 0
while i < len(self.block.ops):
while i < len(self.block.ops) - 2:
current_op = self.block.ops[i]
# TODO(luotao1): consider only conv2d now. fc would be delt later.
if current_op.type in ['conv2d']:
# TODO(luotao1): consider single chain network now.
# For branch network, we counldn't use block.ops[i + 1] as
# TODO(luotao1): consider single chain network now.
# For branch network, we counldn't use block.ops[i + 1] as
# the judgment condition.
next_op = self.block.ops[i + 1]
# conv2d without bias
......@@ -137,17 +186,17 @@ class InferenceTranspiler:
self._adjust_input()
self._remove_unused_var()
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
# ====================== private transpiler functions =====================
def _insert_bias_op(self, index, current_op, bn_op):
'''
Construct elementwise_add operator for adding bias
Construct elementwise_add operator for adding bias
and insert it into program.
:param index: insert location of bias_op
:type index: Int
:param current_op: current operator (conv or fc)
......@@ -175,14 +224,14 @@ class InferenceTranspiler:
def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
'''
fuse the batch_norm_op' parameters to current_op (conv or fc)
:param current_op: current operator (conv or fc)
:type current_op: Operator
:param bn_op: batch norm operator
:type bn_op: Operator
:param bias_op: elementwise_add operator for adding bias
:type bias_op: Operator
:param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
:param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
:type with_bias: Int
'''
......
......@@ -112,7 +112,7 @@ def fetch():
paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
def convert(path):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册