Commit 2dd331cc authored by: T tensor-tang

Merge remote-tracking branch 'ups/develop' into fuse/seqpool_concat

test=develop
......@@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
......@@ -261,6 +262,12 @@ if (WITH_PROFILER)
add_definitions(-DWITH_GPERFTOOLS)
endif()
if (WITH_JEMALLOC)
find_package(JeMalloc REQUIRED)
include_directories(${JEMALLOC_INCLUDE_DIR})
add_definitions(-DWITH_JEMALLOC)
endif()
include(generic) # simplify cmake module
include(package) # set paddle packages
include(ccache) # set ccache for compilation
......@@ -290,7 +297,7 @@ if(WITH_PSLIB)
list(APPEND EXTERNAL_LIBS pslib_brpc)
list(APPEND EXTERNAL_LIBS libmct)
endif(WITH_PSLIB)
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
......
# - Find JeMalloc library
# Find the native JeMalloc includes and library
#
# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc.
# JEMALLOC_LIBRARIES - List of libraries when using jemalloc.
# JEMALLOC_FOUND - True if jemalloc found.
find_path(JEMALLOC_INCLUDE_DIR
NAMES jemalloc/jemalloc.h
HINTS ${JEMALLOC_ROOT_DIR}/include)
find_library(JEMALLOC_LIBRARIES
NAMES jemalloc
HINTS ${JEMALLOC_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR)
mark_as_advanced(
JEMALLOC_LIBRARIES
JEMALLOC_INCLUDE_DIR)
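As a hedged illustration of what the new WITH_JEMALLOC plumbing enables (this helper is hypothetical and not part of the commit; jemalloc/jemalloc.h and malloc_stats_print() are standard jemalloc APIs):

#include <cstdio>
#ifdef WITH_JEMALLOC
#include <jemalloc/jemalloc.h>
#endif

// Hypothetical helper: dump allocator statistics when the build links jemalloc.
void DumpAllocatorStats() {
#ifdef WITH_JEMALLOC
  // With null callbacks, malloc_stats_print() writes human-readable stats to stderr.
  malloc_stats_print(nullptr, nullptr, nullptr);
#else
  std::fprintf(stderr, "Built without jemalloc (WITH_JEMALLOC=OFF).\n");
#endif
}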
......@@ -134,6 +134,7 @@ if(WITH_GPU)
message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
endif()
add_definitions(-DWITH_ANAKIN)
endif()
if(WITH_ANAKIN)
# NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR
......
......@@ -115,6 +115,10 @@ function(common_link TARGET_NAME)
if (WITH_PROFILER)
target_link_libraries(${TARGET_NAME} gperftools::profiler)
endif()
if (WITH_JEMALLOC)
target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES})
endif()
endfunction()
......@@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME)
# Get the file names of the libraries to be merged
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach()
# MSVC puts libraries under "/Release/xxxlib" by default
# COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
......@@ -86,10 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
if (strategy.memory_optimize_) {
auto analysis_var_pass = AppendPass("analysis_var_pass");
}
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
AppendMultiDevPass(strategy);
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
......@@ -115,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
}
}
// Convert graph to run on multi-devices.
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass;
if (strategy_.is_distribution_) {
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else {
PADDLE_THROW("Unknown reduce strategy.");
}
}
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
}
private:
BuildStrategy strategy_;
};
......@@ -131,6 +148,10 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
return pass_builder_;
}
bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
......@@ -145,22 +166,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") {
pass->Erase("places");
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
pass->Erase("loss_var_name");
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
pass->Erase("local_scopes");
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
if (IsMultiDevPass(pass->Type())) {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLossVarName);
pass->SetNotOwned<const std::string>(kLossVarName, &loss_var_name);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
pass->Erase("nranks");
pass->Set<size_t>("nranks", new size_t(nranks));
pass->Erase(kNRanks);
pass->Set<size_t>(kNRanks, new size_t(nranks));
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "analysis_var_pass") {
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
......@@ -201,7 +223,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(multi_devices_pass);
USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(allreduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(analysis_var_pass);
......
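A small sketch of how the new strategy-driven pass selection behaves (field and enum names come from the diff above; the snippet itself is illustrative, not part of the commit):

// Illustrative only: the builder picks the multi-devices pass from the strategy.
BuildStrategy strategy;
strategy.is_distribution_ = false;
strategy.reduce_ = BuildStrategy::ReduceStrategy::kReduce;
// CreatePassesFromStrategy() -> AppendMultiDevPass() now appends
// "reduce_mode_multi_devices_pass"; with is_distribution_ = true it would
// append "dist_multi_devices_pass", and kAllReduce selects
// "allreduce_mode_multi_devices_pass".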
......@@ -74,8 +74,6 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false};
bool memory_optimize_{false};
bool memory_early_delete_{false};
......@@ -84,6 +82,10 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy don't tell whether
// it is a distributed model.
bool is_distribution_{false};
int num_trainers_{1};
int trainer_id_{0};
std::vector<std::string> trainers_endpoints_;
......@@ -104,6 +106,8 @@ struct BuildStrategy {
bool IsFinalized() const { return is_finalized_; }
bool IsMultiDevPass(const std::string &pass_name) const;
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include <string>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
......@@ -21,68 +21,78 @@ namespace paddle {
namespace framework {
namespace details {
bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
std::unordered_set<VarHandleBase *> ready_vars;
std::unordered_set<OpHandleBase *> ready_ops;
class SSAGraghBuilderWithChecker : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
}
auto insert_pending_var = [&](VarHandleBase *var) {
pending_vars.insert(var);
if (var->GeneratedOp() == nullptr) {
ready_vars.emplace(var);
}
};
bool IsValidGraph(const ir::Graph *graph) const {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
std::unordered_set<VarHandleBase *> ready_vars;
std::unordered_set<OpHandleBase *> ready_ops;
for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
insert_pending_var(version_pair);
auto insert_pending_var = [&](VarHandleBase *var) {
pending_vars.insert(var);
if (var->GeneratedOp() == nullptr) {
ready_vars.emplace(var);
}
}
}
};
for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
insert_pending_var(var);
}
for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
insert_pending_var(version_pair);
}
}
}
for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
if (op->Inputs().empty()) {
ready_ops.insert(op);
} else {
pending_ops.insert({op, op->NoDupInputSize()});
for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
insert_pending_var(var);
}
}
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) {
for (auto out : op->Outputs()) {
ready_vars.emplace(out);
for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
if (op->Inputs().empty()) {
ready_ops.insert(op);
} else {
pending_ops.insert({op, op->NoDupInputSize()});
}
}
set.clear();
};
while (!pending_vars.empty()) {
run_all_ops(ready_ops);
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) {
for (auto out : op->Outputs()) {
ready_vars.emplace(out);
}
}
set.clear();
};
if (ready_vars.empty()) {
return false;
}
while (!pending_vars.empty()) {
run_all_ops(ready_ops);
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->PendingOps()) {
auto &deps = --pending_ops[op];
if (deps == 0) {
ready_ops.insert(op);
if (ready_vars.empty()) {
return false;
}
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->PendingOps()) {
auto &deps = --pending_ops[op];
if (deps == 0) {
ready_ops.insert(op);
}
}
}
ready_vars.clear();
}
ready_vars.clear();
return true;
}
return true;
}
};
} // namespace details
} // namespace framework
} // namespace paddle
......
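The IsValidGraph() check above is a Kahn-style readiness propagation: variables without a generating op start out ready, an op fires once all of its inputs are ready, and the graph is rejected if pending variables remain while nothing new becomes ready (e.g. a dependency cycle). A compact standalone sketch of the same idea, using simplified stand-in types rather than Paddle's handles:

#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Op;
struct Var {
  Op *generated_by{nullptr};      // nullptr: the variable is a graph input
  std::vector<Op *> pending_ops;  // ops consuming this variable
};
struct Op {
  std::vector<Var *> inputs;
  std::vector<Var *> outputs;
};

// True if every variable can eventually become ready (no cycles, no orphans).
bool IsValidGraph(const std::vector<Op *> &ops, const std::vector<Var *> &vars) {
  std::unordered_map<Op *, size_t> pending_ops;
  std::unordered_set<Var *> pending_vars, ready_vars;
  std::unordered_set<Op *> ready_ops;

  for (Var *v : vars) {
    pending_vars.insert(v);
    if (v->generated_by == nullptr) ready_vars.insert(v);
  }
  for (Op *op : ops) {
    if (op->inputs.empty()) ready_ops.insert(op);
    else pending_ops[op] = op->inputs.size();
  }

  while (!pending_vars.empty()) {
    for (Op *op : ready_ops)              // fire every ready op
      for (Var *out : op->outputs) ready_vars.insert(out);
    ready_ops.clear();

    if (ready_vars.empty()) return false; // stuck: cycle or missing producer
    for (Var *v : ready_vars) {
      pending_vars.erase(v);
      for (Op *op : v->pending_ops)
        if (--pending_ops[op] == 0) ready_ops.insert(op);
    }
    ready_vars.clear();
  }
  return true;
}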
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <string>
#include <utility>
#include <vector>
......@@ -30,78 +31,70 @@ namespace framework {
class Scope;
namespace details {
class MultiDevSSAGraphBuilder : public ir::Pass {
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private:
void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
size_t device_id) const;
void Init() const;
virtual void Init() const;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable platform::NCCLContextMap *nccl_ctxs_;
#endif
virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
int GetVarDeviceID(
const std::string &varname,
const std::unordered_map<std::string, int> &sharded_var_device) const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const = 0;
bool IsScaleLossOp(ir::Node *node) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
virtual void InsertPostprocessOps(ir::Graph *result) const = 0;
int CreateRPCOp(
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const;
int CreateDistTrainOp(
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const;
bool UseGPU() const;
bool NeedCollectiveOps() const;
bool IsScaleLossOp(ir::Node *node) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const;
void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name,
ir::Node *out_var_node,
ir::Node *out_var_node, size_t loss_scale,
proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const;
void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const;
int GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
bool IsSparseGradient(const std::string &og) const;
void InsertDataBalanceOp(ir::Graph *result,
const std::vector<std::string> &datas) const;
void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;
void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
size_t src_dev_id) const;
void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const;
void CreateFusedBroadcastOp(
ir::Graph *result,
const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;
bool IsSparseGradient(const std::string &og) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &) const;
void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
size_t device_id) const;
int GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &shared_var_device,
std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
const;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable platform::NCCLContextMap *nccl_ctxs_;
#endif
mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_;
......@@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
mutable BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_;
};
class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
protected:
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
return false;
}
virtual void InsertPostprocessOps(ir::Graph *result) const {}
};
class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
protected:
int GetVarDeviceID(const std::string &varname) const;
int GetOpDeviceID(ir::Node *node) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
virtual void ResetState() const;
mutable std::unordered_map<std::string, int> sharded_var_device_;
mutable std::vector<int64_t> balance_vars_;
};
class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
protected:
virtual void Init() const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const;
virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
virtual void ResetState() const;
int GetOpDeviceID(ir::Node *node,
std::unordered_map<std::string, std::vector<ir::Node *>>
*delay_ops) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &topo_ops) const;
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
};
class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
protected:
virtual void Init() const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual void ResetState() const;
int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
mutable bool need_broadcast_var_{false};
};
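The hierarchy above follows a template-method pattern: MultiDevSSAGraphBuilderBase drives the conversion, and each concrete builder only decides how to insert collective ops, whether a node needs special handling, and which post-processing to add. A hypothetical sketch of a further strategy (the class name and bodies are invented for illustration):

// Hypothetical: another strategy only needs to implement the three hooks.
class NoCollectiveSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
 protected:
  void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                          const std::string &g_name) const override {
    // e.g. intentionally skip gradient aggregation
  }
  bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override {
    return false;  // no node requires special placement in this strategy
  }
  void InsertPostprocessOps(ir::Graph *result) const override {}
};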
std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
} // namespace details
} // namespace framework
} // namespace paddle
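MultiDevSSAGraphBuilder() declared above acts as the name registry that BuildStrategy::IsMultiDevPass() consults. A plausible definition, inferred from the pass names registered in build_strategy.cc (a sketch, not necessarily the exact implementation):

std::unordered_set<std::string> &MultiDevSSAGraphBuilder() {
  // Names match the USE_PASS registrations in build_strategy.cc.
  static std::unordered_set<std::string> multi_dev_pass_names{
      "reduce_mode_multi_devices_pass", "allreduce_mode_multi_devices_pass",
      "dist_multi_devices_pass"};
  return multi_dev_pass_names;
}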
......@@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
......
......@@ -123,8 +123,6 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
std::function<bool(const framework::ir::Node*)>);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
......
......@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
for (const std::string &pass_name : passes) {
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
// Set some pass attributes.
if (pass_name == "ir_analysis_pass") {
pass->Set("tensorrt_node_teller",
new SubgraphDetector::NodeInsideSubgraphTeller(
argument->tensorrt_node_teller()));
}
if (pass_name == "graph_viz_pass") {
std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
(pre_pass.empty() ? "origin" : pre_pass) +
......@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
}
if (pass_name == "tensorrt_subgraph_pass") {
PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
pass->SetNotOwned("tensorrt_node_teller",
argument->tensorrt_node_teller_ptr());
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size",
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
if (TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif()
......@@ -20,6 +20,7 @@
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle {
namespace inference {
......@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
auto teller =
Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
auto teller = [](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false;
return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller,
Get<int>("min_subgraph_size") /*min subgraph size*/);
......@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(
REGISTER_PASS(tensorrt_subgraph_pass,
paddle::inference::analysis::TensorRtSubgraphPass)
.RequirePassAttr("tensorrt_node_teller")
.RequirePassAttr("max_batch_size")
.RequirePassAttr("workspace_size")
.RequirePassAttr("min_subgraph_size");
......@@ -27,9 +27,6 @@ namespace analysis {
void IrAnalysisComposePass::RunImpl(Argument *argument) {
ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
InitTensorRTAttrs(argument);
}
ApplyIrPasses(argument);
CollectFusionStatis(argument);
}
......@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
return "ir-analysis-compose-pass";
}
void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
LOG(INFO) << "Initing TensorRT pass";
argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"});
if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) {
return true;
} else {
return false;
}
});
}
}
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass",
......
......@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
std::string repr() const override;
private:
void InitTensorRTAttrs(Argument* argument);
void ApplyIrPasses(Argument* argument);
void CollectFusionStatis(Argument* argument);
......
......@@ -14,86 +14,101 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle_pass_builder.h" // NOLINT
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
PassStrategy *contrib::AnalysisConfig::pass_builder() const {
PADDLE_ENFORCE(
pass_builder_.get(),
"Should call constructor first, that will init the pass_builder_.");
if (!pass_builder_.get()) {
if (use_gpu_) {
LOG(INFO) << "Create GPU IR passes";
pass_builder_.reset(new GpuPassStrategy);
} else {
LOG(INFO) << "Create CPU IR passes";
pass_builder_.reset(new CpuPassStrategy);
}
} else if (pass_builder_->use_gpu() ^ use_gpu()) {
LOG(WARNING) << "The use_gpu flag is not compatible between Config and "
"PassBuilder, the flags are "
<< use_gpu() << " " << pass_builder_->use_gpu();
LOG(WARNING) << "Please make them compatible, still use the existing "
"PassBuilder.";
}
return pass_builder_.get();
}
contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) {
this->use_gpu = use_gpu;
if (use_gpu) {
pass_builder_.reset(new GpuPassStrategy);
} else {
pass_builder_.reset(new CpuPassStrategy);
}
contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
model_dir_ = model_dir;
}
contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
const std::string &params_file) {
prog_file_ = prog_file;
params_file_ = params_file;
}
void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
const std::string &params_file_path) {
prog_file_ = prog_file_path;
params_file_ = params_file_path;
}
void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) {
#ifdef PADDLE_WITH_CUDA
use_gpu_ = true;
memory_pool_init_size_mb_ = memory_pool_init_size_mb;
device_id_ = device_id;
#else
LOG(ERROR) << "Please compile with gpu to EnableGpu";
use_gpu_ = false;
#endif
}
void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }
contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
// fields from Config
model_dir = other.model_dir;
// fields from NativeConfig
use_gpu = other.use_gpu;
device = other.device;
fraction_of_gpu_memory = other.fraction_of_gpu_memory;
prog_file = other.prog_file;
param_file = other.param_file;
specify_input_name = other.specify_input_name;
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
// For mkldnn
use_mkldnn_ = other.use_mkldnn_;
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
use_feed_fetch_ops = other.use_feed_fetch_ops;
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_;
if (use_gpu) {
#define CP_MEMBER(member__) member__ = other.member__;
// Model related.
CP_MEMBER(model_dir_);
CP_MEMBER(prog_file_);
CP_MEMBER(params_file_);
CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and
// params_file_ fields.
// GPU related.
CP_MEMBER(use_gpu_);
CP_MEMBER(device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
// TensorRT related.
CP_MEMBER(use_tensorrt_);
CP_MEMBER(tensorrt_workspace_size_);
CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_);
// MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
// Ir related.
CP_MEMBER(enable_ir_optim_);
CP_MEMBER(use_feed_fetch_ops_);
CP_MEMBER(ir_debug_);
CP_MEMBER(specify_input_name_);
CP_MEMBER(cpu_math_library_num_threads_);
CP_MEMBER(serialized_info_cache_);
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(other.pass_builder())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(other.pass_builder())));
}
}
contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
// fields from Config
model_dir = other.model_dir;
// fields from NativeConfig
use_gpu = other.use_gpu;
device = other.device;
fraction_of_gpu_memory = other.fraction_of_gpu_memory;
prog_file = other.prog_file;
param_file = other.param_file;
specify_input_name = other.specify_input_name;
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
// For mkldnn
use_mkldnn_ = other.use_mkldnn_;
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
use_feed_fetch_ops = other.use_feed_fetch_ops;
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_;
pass_builder_ = std::move(other.pass_builder_);
#undef CP_MEMBER
}
void contrib::AnalysisConfig::EnableMKLDNN() {
......@@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
// Append after the conv+affine_channel fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
void contrib::AnalysisConfig::Update() {
auto info = SerializeInfoCache();
if (info == serialized_info_cache_) return;
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy);
} else {
pass_builder_.reset(new CpuPassStrategy);
}
if (use_tensorrt_) {
if (!use_gpu_) {
LOG(ERROR)
<< "TensorRT engine is not available when EnableGpu() not actived.";
} else {
// Append after the infer_clean pass.
pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
}
}
if (use_mkldnn_) {
if (!enable_ir_optim_) {
LOG(ERROR)
<< "EnableMKLDNN() only works when IR optimization is enabled.";
}
#ifdef PADDLE_WITH_MKLDNN
pass_builder()->EnableMKLDNN();
use_mkldnn_ = true;
#else
LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
use_mkldnn_ = false;
#endif
}
if (ir_debug_) {
pass_builder()->TurnOnDebug();
}
}
std::string contrib::AnalysisConfig::SerializeInfoCache() {
std::stringstream ss;
ss << use_gpu_;
ss << memory_pool_init_size_mb_;
ss << use_tensorrt_;
ss << tensorrt_workspace_size_;
ss << tensorrt_max_batchsize_;
ss << use_mkldnn_;
ss << enable_ir_optim_;
ss << use_feed_fetch_ops_;
ss << ir_debug_;
return ss.str();
}
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads;
}
float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#ifdef PADDLE_WITH_CUDA
// Get the GPU memory details and calculate the fraction of memory for the
// GPU memory pool.
size_t gpu_used, gpu_available;
platform::GpuMemoryUsage(&gpu_used, &gpu_available);
double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.;
float fraction_of_gpu_memory =
static_cast<double>(memory_pool_init_size_mb()) / total_gpu_memory;
return fraction_of_gpu_memory;
#else
return 0.;
#endif
}
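For example (an illustrative calculation, not taken from the source): with the default memory_pool_init_size_mb_ of 100 and a device reporting 8192 MB in total, the function returns about 100 / 8192 ≈ 0.012, i.e. roughly 1.2% of device memory is reserved for the initial pool.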
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
size_t param_buffer_size) {
prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size);
param_file = std::string(param_buffer, param_buffer + param_buffer_size);
prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
......
......@@ -33,6 +33,7 @@
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile);
......@@ -59,8 +60,8 @@ bool AnalysisPredictor::Init(
if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'";
auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU;
auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device);
}
......@@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram(
// Optimize the program, and load parameters and modify them in the
// scope_.
// This will change the scope_ address.
if (config_.enable_ir_optim) {
if (config_.ir_optim()) {
status_ir_optim_enabled_ = true;
OptimizeInferenceProgram();
} else {
......@@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram(
return true;
}
bool AnalysisPredictor::CreateExecutor() {
if (config_.use_gpu) {
if (config_.use_gpu_) {
status_use_gpu_ = true;
place_ = paddle::platform::CUDAPlace(config_.device);
place_ = paddle::platform::CUDAPlace(config_.device_id_);
} else {
place_ = paddle::platform::CPUPlace();
}
......@@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() {
}
bool AnalysisPredictor::PrepareExecutor() {
executor_->Prepare(sub_scope_, *inference_program_, 0,
config_.use_feed_fetch_ops);
config_.use_feed_fetch_ops_);
PADDLE_ENFORCE_NOT_NULL(sub_scope_);
......@@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
}
input.set_lod(lod);
int idx = -1;
if (config_.specify_input_name) {
if (config_.specify_input_name_) {
auto name = inputs[i].name;
if (feed_names_.find(name) == feed_names_.end()) {
LOG(ERROR) << "feed names from program do not have name: [" << name
......@@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
void AnalysisPredictor::OptimizeInferenceProgram() {
status_program_optimized_ = true;
argument_.SetUseGPU(config_.use_gpu);
argument_.SetGPUDeviceId(config_.device);
argument_.SetUseGPU(config_.use_gpu());
argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program
if (!config_.model_dir.empty()) {
argument_.SetModelDir(config_.model_dir);
if (!config_.model_dir().empty()) {
argument_.SetModelDir(config_.model_dir());
} else {
PADDLE_ENFORCE(
!config_.param_file.empty(),
!config_.params_file().empty(),
"Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file.empty());
argument_.SetModelProgramPath(config_.prog_file);
argument_.SetModelParamsPath(config_.param_file);
PADDLE_ENFORCE(!config_.prog_file().empty());
argument_.SetModelProgramPath(config_.prog_file());
argument_.SetModelParamsPath(config_.params_file());
}
if (config_.use_gpu && config_.use_tensorrt_) {
if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
argument_.SetUseTensorRT(true);
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
......@@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
auto passes = config_.pass_builder()->AllPasses();
if (!config_.enable_ir_optim) passes.clear();
if (!config_.ir_optim()) passes.clear();
argument_.SetIrAnalysisPasses(passes);
argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
Analyzer().Run(&argument_);
......@@ -358,18 +359,26 @@ template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) {
if (config.use_gpu()) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
config.gpu_device_id());
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
if (fraction_of_gpu_memory > 0.95f) {
LOG(ERROR)
<< "Allocate too much memory for the GPU memory pool, assigned "
<< config.memory_pool_init_size_mb() << " MB";
LOG(ERROR)
<< "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
}
if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory);
std::to_string(fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
......@@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() {
bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program
std::string filename;
if (!config_.model_dir.empty()) {
filename = config_.model_dir + "/__model__";
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
if (!config_.model_dir().empty()) {
filename = config_.model_dir() + "/__model__";
} else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
filename = config_.prog_file;
filename = config_.prog_file();
} else {
if (config_.model_dir.empty() && config_.prog_file.empty()) {
if (config_.model_dir().empty() && config_.prog_file().empty()) {
LOG(ERROR)
<< "Either model_dir or (prog_file, param_file) should be set.";
return false;
}
LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s'.", config_.model_dir,
config_.param_file);
"not valid model path '%s' or program path '%s'.", config_.model_dir(),
config_.params_file());
return false;
}
......@@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
proto.ParseFromString(pb_content);
} else {
proto.ParseFromString(config_.prog_file);
proto.ParseFromString(config_.prog_file());
}
inference_program_.reset(new framework::ProgramDesc(proto));
return true;
......@@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() {
new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true);
if (!config_.param_file.empty()) {
if (!config_.params_file().empty()) {
params.push_back(new_var->Name());
} else {
// append_op
framework::OpDesc *op = load_block->AppendOp();
op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()});
op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()});
op->CheckAttrs();
}
}
}
if (!config_.param_file.empty()) {
if (!config_.params_file().empty()) {
// sort paramlist to have consistent ordering
std::sort(params.begin(), params.end());
// append just the load_combine op
framework::OpDesc *op = load_block->AppendOp();
op->SetType("load_combine");
op->SetOutput("Out", params);
op->SetAttr("file_path", {config_.param_file});
op->SetAttr("file_path", {config_.params_file()});
op->CheckAttrs();
}
......
......@@ -25,9 +25,9 @@ namespace paddle {
using contrib::AnalysisConfig;
TEST(AnalysisPredictor, analysis_off) {
AnalysisConfig config(false);
config.model_dir = FLAGS_dirname;
config.enable_ir_optim = false;
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(false);
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
......@@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) {
}
TEST(AnalysisPredictor, analysis_on) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(true);
#ifdef PADDLE_WITH_CUDA
AnalysisConfig config(true);
config.fraction_of_gpu_memory = 0.15;
config.EnableUseGpu(100, 0);
#else
AnalysisConfig config;
config.DisableGpu();
#endif
config.model_dir = FLAGS_dirname;
config.enable_ir_optim = true;
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
......@@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) {
}
// compare with NativePredictor
auto naive_predictor = CreatePaddlePredictor<NativeConfig>(config);
auto naive_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
std::vector<PaddleTensor> naive_outputs;
ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs));
ASSERT_EQ(naive_outputs.size(), 1UL);
......@@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) {
TEST(AnalysisPredictor, ZeroCopy) {
AnalysisConfig config;
config.model_dir = FLAGS_dirname;
config.use_feed_fetch_ops = false;
config.SetModel(FLAGS_dirname);
config.SwitchUseFeedFetchOps(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto w0 = predictor->GetInputTensor("firstw");
......@@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) {
TEST(AnalysisPredictor, Clone) {
AnalysisConfig config;
config.model_dir = FLAGS_dirname;
config.use_feed_fetch_ops = true;
config.enable_ir_optim = true;
config.SetModel(FLAGS_dirname);
config.SwitchUseFeedFetchOps(true);
config.SwitchIrOptim(true);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor(config));
......
......@@ -19,8 +19,6 @@ limitations under the License. */
#pragma once
#define WITH_ANAKIN
#include <vector>
#include "framework/core/net/net.h"
......
......@@ -288,7 +288,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
PADDLE_ENFORCE_GE(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
......
......@@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) {
#endif
TEST(PassBuilder, Delete) {
contrib::AnalysisConfig config(false);
contrib::AnalysisConfig config;
config.DisableGpu();
config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
const auto& passes = config.pass_builder()->AllPasses();
auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass");
......
......@@ -36,12 +36,11 @@ namespace demo {
*/
void Main() {
std::unique_ptr<PaddlePredictor> predictor;
paddle::contrib::AnalysisConfig config(true);
config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__";
config.device = 0;
paddle::contrib::AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_modeldir + "/__params__",
FLAGS_modeldir + "/__model__");
config.EnableTensorRtEngine();
config.fraction_of_gpu_memory = 0.1; // set by yourself
predictor = CreatePaddlePredictor(config);
VLOG(3) << "begin to process data";
......
......@@ -40,15 +40,14 @@ using contrib::AnalysisConfig;
*/
void Main(bool use_gpu) {
std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
AnalysisConfig config(use_gpu);
config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__";
config.device = 0;
if (FLAGS_use_gpu) {
config.fraction_of_gpu_memory = 0.1; // set by yourself
AnalysisConfig config;
if (use_gpu) {
config.EnableUseGpu(100, 0);
}
config.SetModel(FLAGS_modeldir + "/__model__",
FLAGS_modeldir + "/__params__");
predictor = CreatePaddlePredictor<NativeConfig>(config);
predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
analysis_predictor = CreatePaddlePredictor(config);
// Just a single batch of data.
......
......@@ -34,26 +34,67 @@ class AnalysisPredictor;
namespace contrib {
// NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig {
explicit AnalysisConfig(bool use_gpu = false);
struct AnalysisConfig {
AnalysisConfig() = default;
explicit AnalysisConfig(const AnalysisConfig& other);
explicit AnalysisConfig(AnalysisConfig&& other);
explicit AnalysisConfig(const std::string& model_dir);
explicit AnalysisConfig(const std::string& prog_file,
const std::string& params_file);
// Model path related.
void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
void SetModel(const std::string& prog_file_path,
const std::string& params_file_path);
void SetProgFile(const std::string& x) { prog_file_ = x; }
void SetParamsFile(const std::string& x) { params_file_ = x; }
const std::string& model_dir() const { return model_dir_; }
const std::string& prog_file() const { return prog_file_; }
const std::string& params_file() const { return params_file_; }
// GPU related.
void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
void DisableGpu();
bool use_gpu() const { return use_gpu_; }
int gpu_device_id() const { return device_id_; }
int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
float fraction_of_gpu_memory_for_pool() const;
// Determine whether to perform graph optimization.
bool enable_ir_optim = true;
void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
bool ir_optim() const { return enable_ir_optim_; }
// Get a pass builder for customize the passes in IR analysis phase.
PassStrategy* pass_builder() const;
void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }
// NOT stable yet.
bool use_feed_fetch_ops{true};
void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
bool specify_input_name() const { return specify_input_name_; }
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3);
bool use_tensorrt() const { return use_tensorrt_; }
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
void SwitchIrDebug(int x = true) { ir_debug_ = x; }
void EnableMKLDNN();
bool use_mkldnn() const { return use_mkldnn_; }
bool mkldnn_enabled() const { return use_mkldnn_; }
// Set and get the number of cpu math library threads.
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
int cpu_math_library_num_threads() const {
return cpu_math_library_num_threads_;
}
NativeConfig ToNativeConfig() const {
NativeConfig config;
config.model_dir = model_dir_;
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
}
void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
mkldnn_enabled_op_types_ = op_list;
}
......@@ -65,10 +106,29 @@ struct AnalysisConfig : public NativeConfig {
friend class ::paddle::AnalysisPredictor;
// NOTE: just for developers; not an official API, easily broken.
// Get a pass builder to customize the passes in the IR analysis phase.
PassStrategy* pass_builder() const;
protected:
// Update the config.
void Update();
std::string SerializeInfoCache();
protected:
// Model paths.
std::string model_dir_;
std::string prog_file_;
std::string params_file_;
// GPU related.
bool use_gpu_{false};
int device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
// TensorRT related.
bool use_tensorrt_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// For workspace_size, refer to:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int tensorrt_workspace_size_;
......@@ -82,17 +142,24 @@ struct AnalysisConfig : public NativeConfig {
// We set this variable to control the minimum number of nodes in the
// subgraph, with 3 as the default value.
int tensorrt_min_subgraph_size_{3};
std::unique_ptr<PassStrategy> pass_builder_;
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
bool model_from_memory_{false};
};
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { NVGPU = 0, X86 };
int device;
std::string model_file;
int max_batch_size{-1};
TargetType target_type;
bool enable_ir_optim_{true};
bool use_feed_fetch_ops_{true};
bool ir_debug_{false};
bool specify_input_name_{false};
int cpu_math_library_num_threads_{1};
// A runtime cache, shouldn't be transferred to others.
std::string serialized_info_cache_;
mutable std::unique_ptr<PassStrategy> pass_builder_;
};
} // namespace contrib
......
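Putting the refactored AnalysisConfig surface together, a minimal usage sketch consistent with the tests and demos touched by this commit (paths and the function name are placeholders):

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch: configure and create predictors with the new setter/getter API.
void BuildPredictors() {
  paddle::contrib::AnalysisConfig config;
  config.SetModel("/path/to/__model__", "/path/to/__params__");  // or SetModel(model_dir)
  config.EnableUseGpu(100 /*MB pool*/, 0 /*device id*/);         // or config.DisableGpu();
  config.SwitchIrOptim(true);
  config.SwitchSpecifyInputNames(true);

  auto analysis_predictor = paddle::CreatePaddlePredictor(config);

  // The legacy path is still reachable through ToNativeConfig().
  auto native_predictor =
      paddle::CreatePaddlePredictor<paddle::NativeConfig>(config.ToNativeConfig());
  (void)analysis_predictor;
  (void)native_predictor;
}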
......@@ -26,9 +26,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle_api.h" // NOLINT
#ifndef WITH_ANAKIN
#include "paddle_analysis_config.h" // NOLINT
#else
#include "paddle_api.h" // NOLINT
#ifdef WITH_ANAKIN
#include "paddle_anakin_config.h" // NOLINT
#endif
......@@ -62,7 +62,12 @@ class PassStrategy : public PaddlePassBuilder {
// still some CPU kernels running in CPU mode.
virtual void EnableMKLDNN() = 0;
bool use_gpu() const { return use_gpu_; }
virtual ~PassStrategy() = default;
protected:
bool use_gpu_{false};
};
/*
......@@ -88,6 +93,7 @@ class CpuPassStrategy : public PassStrategy {
"conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
});
use_gpu_ = false;
}
virtual ~CpuPassStrategy() = default;
......@@ -126,10 +132,14 @@ class GpuPassStrategy : public PassStrategy {
"conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
});
use_gpu_ = true;
}
GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {}
: PassStrategy(other.AllPasses()) {
use_gpu_ = true;
}
void EnableMKLDNN() override;
......
nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(plugin)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Just tell by the op_types.
struct SimpleOpTypeSetTeller : public Teller {
SimpleOpTypeSetTeller() {}
bool operator()(const std::string& op_type,
const framework::OpDesc& desc) override {
return teller_set.count(op_type);
}
private:
std::unordered_set<std::string> teller_set{
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"}};
};
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
for (auto& teller : tellers_) {
if ((*teller)(op_type, desc)) return true;
}
return false;
}
OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* Single Op teller definition.
 * One can override this and define a more complex tell logic, considering more
* issues such as op_desc.
*/
struct Teller {
virtual bool operator()(const std::string& op_type,
const framework::OpDesc& desc) = 0;
virtual ~Teller() = default;
};
/*
* A real example:
*
* struct SomeTeller : public Teller {
* bool operator()(const std::string& op_type,
* const framework::OpDesc& desc) override {
* return op_type == "fc" && desc.Inputs().size() == 2;
* }
*};
*/
/*
* class OpTeller helps to tell whether a fluid
* operator can be transformed to a TensorRT layer.
*/
class OpTeller {
public:
static OpTeller& Global() {
static std::unique_ptr<OpTeller> x(new OpTeller);
return *x;
}
bool Tell(const std::string& op_type, const framework::OpDesc& desc);
private:
OpTeller();
private:
std::vector<std::unique_ptr<Teller>> tellers_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
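For reference, client code queries the teller through the singleton, mirroring the lambda installed in tensorrt_subgraph_pass.cc above (the wrapper function name is hypothetical):

#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"

namespace paddle {
namespace inference {

// True when the IR node is an operator that the TensorRT engine can take over.
bool CanConvertToTensorRT(const framework::ir::Node *node) {
  if (!node->IsOp() || !node->Op()) return false;
  return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
}

}  // namespace inference
}  // namespace paddle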
......@@ -165,12 +165,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim(true);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
......@@ -105,11 +105,10 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
cfg->SetModel(FLAGS_infer_model);
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
......@@ -76,11 +76,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
cfg->SetModel(FLAGS_infer_model);
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
......@@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
buffer_param.size());
} else {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
cfg->SetModel(FLAGS_infer_model + "/__model__",
FLAGS_infer_model + "/param");
}
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
......@@ -21,12 +21,10 @@ namespace inference {
namespace analysis {
void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/params";
cfg->prog_file = FLAGS_infer_model + "/model";
cfg->use_gpu = false;
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
......
......@@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
}
void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......@@ -225,10 +223,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently.
TEST(Analyzer_rnn1, profile) {
contrib::AnalysisConfig cfg(false);
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
cfg.fraction_of_gpu_memory = 0.1;
cfg.pass_builder()->TurnOnDebug();
cfg.DisableGpu();
cfg.SwitchIrDebug();
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
......@@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) {
TEST(Analyzer_rnn1, ZeroCopy) {
AnalysisConfig config;
SetConfig(&config);
config.use_feed_fetch_ops = false;
config.SwitchUseFeedFetchOps(false);
PaddlePlace place;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
config.use_feed_fetch_ops = true;
auto native_predictor = CreatePaddlePredictor<NativeConfig>(config);
config.SwitchUseFeedFetchOps(true);
auto native_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch.
config.SwitchUseFeedFetchOps(
true); // the analysis predictor needs feed/fetch.
auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
#define NEW_TENSOR(name__) \
......@@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) {
TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
AnalysisConfig config;
SetConfig(&config);
config.use_feed_fetch_ops = false;
config.SwitchUseFeedFetchOps(false);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
......
......@@ -105,12 +105,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
......@@ -89,11 +89,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
cfg->SetModel(FLAGS_infer_model);
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
......@@ -122,12 +122,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
}
void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/params";
cfg->prog_file = FLAGS_infer_model + "/model";
cfg->use_gpu = false;
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->pass_builder()->TurnOnDebug();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
......
......@@ -47,11 +47,10 @@ struct DataReader {
};
void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
cfg->SetModel(FLAGS_infer_model);
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
......@@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) {
}
void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/__params__";
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->use_gpu = false;
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetModel(FLAGS_infer_model + "/__model__",
FLAGS_infer_model + "/__params__");
cfg->DisableGpu();
cfg->SwitchIrDebug();
cfg->SwitchSpecifyInputNames();
// TODO(TJ): fix fusion gru
cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
}
......
......@@ -64,19 +64,23 @@ std::ostream &operator<<(std::ostream &os,
num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config);
if (!config.model_from_memory()) {
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
<< "\n";
} else {
os << GenSpaces(num_spaces)
<< "prog_file and param_file: load from memory \n";
}
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "\n";
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "\n";
os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt()
<< "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n";
os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled()
<< "\n";
os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
......
......@@ -328,7 +328,10 @@ void CompareNativeAndAnalysis(
const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(config, true);
std::vector<PaddleTensor> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false);
const auto *analysis_config =
reinterpret_cast<const contrib::AnalysisConfig *>(config);
auto native_config = analysis_config->ToNativeConfig();
TestOneThreadPrediction(&native_config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
CompareResult(analysis_outputs, native_outputs);
}
......
......@@ -46,22 +46,20 @@ void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
std::string model_dir, bool use_gpu,
bool use_tensorrt, int batch_size) {
if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
config->prog_file = model_dir + "/" + FLAGS_prog_filename;
config->param_file = model_dir + "/" + FLAGS_param_filename;
config->SetModel(model_dir + "/" + FLAGS_prog_filename,
model_dir + "/" + FLAGS_param_filename);
} else {
config->model_dir = model_dir;
config->SetModel(model_dir);
}
if (use_gpu) {
config->use_gpu = true;
config->device = 0;
config->fraction_of_gpu_memory = 0.15;
config->EnableUseGpu(100, 0);
if (use_tensorrt) {
config->EnableTensorRtEngine(1 << 10, batch_size);
config->pass_builder()->DeletePass("conv_bn_fuse_pass");
config->pass_builder()->DeletePass("fc_fuse_pass");
config->pass_builder()->TurnOnDebug();
} else {
config->enable_ir_optim = true;
config->SwitchIrOptim();
}
}
}
......@@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
std::vector<PaddleTensor> outputs;
if (use_analysis || use_tensorrt) {
contrib::AnalysisConfig config(true);
contrib::AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.pass_builder()->TurnOnDebug();
SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
FLAGS_batch_size);
......@@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) {
&native_outputs, false);
std::vector<PaddleTensor> analysis_outputs;
contrib::AnalysisConfig analysis_config(true);
contrib::AnalysisConfig analysis_config;
analysis_config.EnableUseGpu(50, 0);
SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
use_tensorrt, FLAGS_batch_size);
TestOneThreadPrediction(
......@@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) {
TEST(AnalysisPredictor, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
AnalysisConfig config(true);
config.model_dir = model_dir;
config.fraction_of_gpu_memory = 0.15;
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir);
config.pass_builder()->TurnOnDebug();
std::vector<std::vector<PaddleTensor>> inputs_all;
......
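The GPU paths get the same treatment: use_gpu, device and fraction_of_gpu_memory are folded into EnableUseGpu(memory_in_MB, device_id). Below is a hedged sketch combining the calls shown in the TensorRT hunks above; SetGpuConfig is a hypothetical wrapper, and model_dir and batch_size are assumed inputs:

// Illustrative only: GPU + TensorRT configuration as migrated in the hunks above.
void SetGpuConfig(contrib::AnalysisConfig *config, const std::string &model_dir,
                  int batch_size, bool use_tensorrt) {
  config->SetModel(model_dir);   // replaces config->model_dir = model_dir;
  config->EnableUseGpu(100, 0);  // replaces use_gpu/device/fraction_of_gpu_memory
  if (use_tensorrt) {
    config->EnableTensorRtEngine(1 << 10, batch_size);
  } else {
    config->SwitchIrOptim();     // replaces config->enable_ir_optim = true;
  }
}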
......@@ -319,6 +319,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool fuse_relu = ctx.Attr<bool>("fuse_relu");
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
bool is_conv3d = strides.size() == 3U;
......@@ -329,6 +331,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dilations[2] == 1
: dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently");
const T* input_data = input->data<T>();
......@@ -340,15 +343,24 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
GetWeightsTz(weights_tz, g, is_conv3d);
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
mkldnn::memory::data_type src_dt =
paddle::framework::ToMKLDNNDataType(input->type());
auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<uint8_t>::DataType)
: paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<int8_t>::DataType);
if (force_fp32_output) {
dst_dt = paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<float>::DataType);
}
// Get unique name for storing MKLDNN primitives
std::string key;
key.reserve(MaxKeyLength);
mkldnn::memory::data_type src_dt =
paddle::framework::ToMKLDNNDataType(input->type());
platform::ConvMKLDNNHandler::AppendKey(
&key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
input->format(), ctx.op().Output("Output"));
input->format(), dst_dt, ctx.op().Output("Output"));
const std::string key_conv_pd = key + "@conv_pd";
std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr;
......@@ -413,13 +425,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, memory::data_type::s8, chosen_memory_format);
auto dst_dt = force_fp32_output
? paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<float>::DataType)
: paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<int8_t>::DataType);
auto dst_md =
platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
......@@ -429,11 +434,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine,
output_shift_scale, is_test);
fuse_relu, output_shift_scale, is_test);
} else {
conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, output_shift_scale, is_test);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
paddings, mkldnn_engine, fuse_relu,
output_shift_scale, is_test);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
......@@ -459,7 +464,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mask_reorder);
if (!force_fp32_output) {
dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
if (fuse_relu) {
dst_memory_p = platform::SetDstMemory<uint8_t>(ctx, output, handler);
} else {
dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
}
} else {
dst_memory_p = platform::SetDstMemory<float>(ctx, output, handler);
}
......@@ -518,8 +527,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn_engine, key));
}
if (!force_fp32_output) {
dst_memory_p =
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler);
if (fuse_relu) {
dst_memory_p =
platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler);
} else {
dst_memory_p =
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler);
}
} else {
dst_memory_p =
platform::SetDstMemoryHandler<float>(ctx, output, handler);
......@@ -563,11 +577,18 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
mkldnn::primitive_attr CreatePostOps(
const std::vector<float> output_shift_scale) const {
bool fuse_relu, const std::vector<float> output_shift_scale) const {
mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations;
int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
conv_attr.set_output_scales(mask, output_shift_scale);
if (fuse_relu) {
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 1.0f; // beta
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
}
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
......@@ -600,7 +621,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine,
const mkldnn::engine& engine, const bool fuse_relu,
const std::vector<float> output_shift_scale,
bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]};
......@@ -613,7 +634,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims,
padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale);
mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, output_shift_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
......@@ -652,7 +674,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& bias, const memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine,
const mkldnn::engine& engine, const bool fuse_relu,
const std::vector<float> output_shift_scale,
bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]};
......@@ -665,7 +687,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
propagation, mkldnn::convolution_direct, src, weights, bias, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale);
mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, output_shift_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
......
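The ReLU fusion added above is expressed as an MKL-DNN eltwise post-op on the convolution primitive attributes. A minimal sketch of that construction, using only the calls visible in the hunk (the free function MakeConvAttr is hypothetical; the kernel keeps this logic in CreatePostOps):

// Illustrative sketch of the post-op setup introduced above.
mkldnn::primitive_attr MakeConvAttr(bool fuse_relu,
                                    const std::vector<float> &output_shift_scale) {
  mkldnn::primitive_attr conv_attr;
  mkldnn::post_ops post_operations;
  int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;  // per-channel vs. single scale
  conv_attr.set_output_scales(mask, output_shift_scale);
  if (fuse_relu) {
    constexpr float scale = 1.0f;
    constexpr float negative_slope = 0.0f;  // plain ReLU
    constexpr float placeholder = 1.0f;     // beta, unused by eltwise_relu
    post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
                                   negative_slope, placeholder);
  }
  conv_attr.set_post_ops(post_operations);
  return conv_attr;
}

With fuse_relu set, the kernel also switches the int8 destination type to uint8 (see the SetDstMemory<uint8_t> branches above), since a fused ReLU guarantees non-negative outputs.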
......@@ -62,27 +62,19 @@ struct CUBlas<float> {
cudaDataType_t Atype, int lda, const void *B,
cudaDataType_t Btype, int ldb, const float *beta, void *C,
cudaDataType_t Ctype, int ldc) {
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
auto cublas_call = [&]() {
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
#if CUDA_VERSION >= 8000
VLOG(5) << "use_tensor_op_math: "
<< (platform::TensorCoreAvailable() ? "True" : "False");
VLOG(5) << "use_tensor_op_math: "
<< (dev_ctx->tensor_core_available() ? "True" : "False");
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE(platform::dynload::cublasSgemmEx(
dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype,
lda, B, Btype, ldb, beta, C, Ctype, ldc));
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc));
});
#else
PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be hold by multi-thread.
dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
#else
cublas_call();
PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
#endif
}
};
......@@ -170,32 +162,24 @@ struct CUBlas<platform::float16> {
cudaDataType_t Btype, int ldb, const void *beta, void *C,
cudaDataType_t Ctype, int ldc,
cudaDataType_t computeType) {
auto cublas_call = [&]() {
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
bool use_tensor_op_math = platform::TensorCoreAvailable();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
bool use_tensor_op_math = dev_ctx->tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
#endif // CUDA_VERSION >= 9000
dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype,
lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo));
handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
beta, C, Ctype, ldc, computeType, algo));
});
#else
PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be hold by multi-thread.
dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
#else
cublas_call();
PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
#endif
}
};
......@@ -223,9 +207,10 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CUDA_R_32F, N);
} else {
#endif // CUDA_VERSION >= 8000
CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
&alpha, B, ldb, A, lda, &beta, C, N);
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, N);
});
#if CUDA_VERSION >= 8000
}
......@@ -266,9 +251,12 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
N, M, K, &h_alpha, h_B, ldb, h_A, lda,
&h_beta, h_C, N);
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K,
&h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C,
N);
});
#endif // CUDA_VERSION >= 8000
}
......@@ -292,8 +280,10 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
} else {
#endif // CUDA_VERSION >= 8000
CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
&alpha, B, ldb, A, lda, &beta, C, ldc);
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
lda, &beta, C, ldc);
});
#if CUDA_VERSION >= 8000
}
......@@ -311,16 +301,19 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
N, M, K, &alpha, B, ldb, A, lda, &beta, C,
ldc);
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<platform::float16>::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, A, lda, &beta, C, ldc);
});
}
template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
T *y) const {
CUBlas<T>::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1);
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::AXPY(handle, n, &alpha, x, 1, y, 1);
});
}
template <>
......@@ -330,8 +323,9 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
T beta, T *C) const {
cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
CUBlas<T>::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1,
&beta, C, 1);
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1);
});
}
template <>
......@@ -353,28 +347,28 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
#if CUDA_VERSION >= 9010
if (FLAGS_enable_cublas_tensor_op_math && std::is_same<T, float>::value) {
auto cublas_call = [&]() {
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = platform::TensorCoreAvailable();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: "
<< (use_tensor_op_math ? "True" : "False");
context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx(
context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B,
CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C,
CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo));
};
auto &dev_ctx = const_cast<platform::CUDADeviceContext &>(context_);
dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH);
handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb,
strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc,
strideC, batchCount, CUDA_R_32F, algo));
});
} else {
#endif // CUDA_VERSION >= 9010
CUBlas<T>::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA,
N, M, K, &alpha, B, ldb, strideB, A, lda,
strideA, &beta, C, ldc, strideC, batchCount);
context_.CublasCall([&](cublasHandle_t handle) {
CUBlas<T>::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha,
B, ldb, strideB, A, lda, strideA, &beta, C,
ldc, strideC, batchCount);
});
#if CUDA_VERSION >= 9010
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,25 +14,45 @@
#pragma once
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include <mutex> // NOLINT
#include <string>
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/macros.h"
#if CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif
namespace paddle {
namespace framework {
namespace details {
class SSAGraghBuilderWithChecker : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
namespace platform {
class CublasHandleHolder {
public:
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_ENFORCE(dynload::cublasCreate(&handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream));
#if CUDA_VERSION >= 9000
if (math_type == CUBLAS_TENSOR_OP_MATH) {
PADDLE_ENFORCE(
dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
}
#endif
}
~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); }
template <typename Callback>
inline void Call(Callback &&callback) const {
std::lock_guard<std::mutex> guard(mtx_);
callback(handle_);
}
bool IsValidGraph(const ir::Graph* graph) const;
private:
DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);
cublasHandle_t handle_;
mutable std::mutex mtx_;
};
} // namespace details
} // namespace framework
} // namespace platform
} // namespace paddle
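CublasHandleHolder::Call serializes access to the wrapped handle, so callers now pass a lambda instead of grabbing a raw cublasHandle_t from the device context. A minimal usage sketch, assuming dev_ctx is a platform::CUDADeviceContext and the cuBLAS arguments are already in scope (this mirrors the call sites rewritten in blas_impl.cu.h above and is not part of the commit itself):

// Illustrative only.
dev_ctx.CublasCall([&](cublasHandle_t handle) {
  // The holder's mutex is held while this lambda runs, serializing concurrent callers.
  CUBlas<float>::AXPY(handle, n, &alpha, x, 1, y, 1);
});

dev_ctx.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
  // Runs on the CUBLAS_TENSOR_OP_MATH handle when Tensor Cores are available,
  // otherwise falls back to the default-math handle.
  PADDLE_ENFORCE(platform::dynload::cublasSgemmEx(
      handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb,
      beta, C, Ctype, ldc));
});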
......@@ -245,8 +245,15 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH));
if (TensorCoreAvailable()) {
#if CUDA_VERSION >= 9000
cublas_tensor_core_handle_.reset(
new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH));
#endif
}
if (dynload::HasCUDNN()) {
cudnn_holder_.reset(new CudnnHolder(&stream_, place));
}
......@@ -306,7 +313,8 @@ CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
Wait();
WaitStreamCallback();
PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
cublas_handle_.reset();
cublas_tensor_core_handle_.reset();
eigen_stream_.reset();
eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
......@@ -335,8 +343,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get();
}
cublasHandle_t CUDADeviceContext::cublas_handle() const {
return cublas_handle_;
bool CUDADeviceContext::tensor_core_available() const {
return cublas_tensor_core_handle_ != nullptr;
}
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/temporary_allocator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_helper.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/gpu_info.h"
......@@ -209,39 +210,6 @@ class CudnnWorkspaceHandle {
std::unique_ptr<std::lock_guard<std::mutex>> guard_;
};
#if CUDA_VERSION >= 9000
class ScopedCublasMathMode {
public:
ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode)
: handle_(handle) {
need_reset = false;
PADDLE_ENFORCE(
platform::dynload::cublasGetMathMode(handle_, &old_math_mode_),
"Failed to get old cublas math mode");
if (old_math_mode_ != new_math_mode) {
PADDLE_ENFORCE(
platform::dynload::cublasSetMathMode(handle_, new_math_mode),
"Failed to set old cublas math mode");
need_reset = true;
}
}
~ScopedCublasMathMode() {
if (need_reset) {
PADDLE_ENFORCE(
platform::dynload::cublasSetMathMode(handle_, old_math_mode_),
"Failed to set old cublas math mode");
}
}
private:
cublasHandle_t handle_;
cublasMath_t old_math_mode_;
bool need_reset;
};
#endif
class CUDADeviceContext : public DeviceContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
......@@ -262,8 +230,25 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
/*! \brief Return cublas handle in the device context. */
cublasHandle_t cublas_handle() const;
/*! \brief Call cublas function safely. */
template <typename Callback>
inline void CublasCall(Callback&& callback) const {
cublas_handle_->Call(std::forward<Callback>(callback));
}
/*! \brief Check whether tensor core is supported */
bool tensor_core_available() const;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template <typename Callback>
inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const {
if (cublas_tensor_core_handle_) {
cublas_tensor_core_handle_->Call(std::forward<Callback>(callback));
} else {
cublas_handle_->Call(std::forward<Callback>(callback));
}
}
/*! \brief Return cudnn handle in the device context. */
cudnnHandle_t cudnn_handle() const;
......@@ -282,7 +267,6 @@ class CUDADeviceContext : public DeviceContext {
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) {
std::lock_guard<std::mutex> guard(mtx_);
callback();
PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
}
......@@ -294,18 +278,6 @@ class CUDADeviceContext : public DeviceContext {
void WaitStreamCallback() const { callback_manager_->Wait(); }
#if CUDA_VERSION >= 9000
/*! \brief CublasCall may need to change cublas's config,
* but the cublas may be hold by multi-thread, so we should
* add lock here. */
template <typename Callback>
void CublasCall(Callback callback, cublasMath_t new_math) {
std::lock_guard<std::mutex> guard(cublas_mtx_);
ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math);
callback();
}
#endif
private:
CUDAPlace place_;
......@@ -313,7 +285,9 @@ class CUDADeviceContext : public DeviceContext {
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<CudnnHolder> cudnn_holder_;
cudaStream_t stream_;
cublasHandle_t cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
int compute_capability_;
int runtime_version_;
......@@ -321,12 +295,10 @@ class CUDADeviceContext : public DeviceContext {
int multi_process_;
int max_threads_per_mp_;
mutable std::mutex mtx_;
// StreamCallbackManager is thread-safe
std::unique_ptr<StreamCallbackManager> callback_manager_;
mutable std::mutex cublas_mtx_;
DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
};
template <>
......
......@@ -43,9 +43,6 @@ TEST(Device, CUDADeviceContext) {
ASSERT_NE(nullptr, gpu_device);
cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
ASSERT_NE(nullptr, cudnn_handle);
cublasHandle_t cublas_handle = device_context->cublas_handle();
ASSERT_NE(nullptr, cublas_handle);
ASSERT_NE(nullptr, device_context->stream());
delete device_context;
}
}
......
......@@ -214,16 +214,18 @@ class MKLDNNHandler {
std::string* key, const mkldnn::memory::dims& input_dims,
const mkldnn::memory::dims& weights_dims, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& dilations,
const int& groups, const mkldnn::memory::data_type& type,
const mkldnn::memory::format& format, const std::string& suffix) {
const int& groups, const mkldnn::memory::data_type& srcdt,
const mkldnn::memory::format& format,
const mkldnn::memory::data_type& dstdt, const std::string& suffix) {
AppendKeyDims(key, input_dims);
AppendKeyDims(key, weights_dims);
AppendKeyVec(key, strides);
AppendKeyVec(key, paddings);
AppendKeyVec(key, dilations);
AppendKey(key, std::to_string(groups));
AppendKey(key, std::to_string(type));
AppendKey(key, std::to_string(srcdt));
AppendKey(key, std::to_string(format));
AppendKey(key, std::to_string(dstdt));
AppendKey(key, suffix);
}
......
......@@ -946,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(The type is STR, debug_graphviz_path indicate the path that
writing the SSA Graph to file in the form of graphviz, you.
It is useful for debugging. Default "")DOC")
.def_property(
"enable_data_balance",
[](const BuildStrategy &self) { return self.enable_data_balance_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
self.enable_data_balance_ = b;
}) // FIXME(chengudo): enable_data_balance seems not important
.def_property(
"enable_sequential_execution",
[](const BuildStrategy &self) {
......@@ -1007,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle.
"memory_optimize",
[](const BuildStrategy &self) { return self.memory_optimize_; },
[](BuildStrategy &self, bool b) { self.memory_optimize_ = b; })
.def_property(
"is_distribution",
[](const BuildStrategy &self) { return self.is_distribution_; },
[](BuildStrategy &self, bool b) { self.is_distribution_ = b; })
.def_property(
"memory_early_delete",
[](const BuildStrategy &self) { return self.memory_early_delete_; },
......
......@@ -199,6 +199,7 @@ function cmake_gen() {
-DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}
-DPY_VERSION=${PY_VERSION:-2.7}
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
-DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
......@@ -232,7 +233,8 @@ EOF
-DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\
-DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\
-DPY_VERSION=${PY_VERSION:-2.7} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \
-DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF}
}
......@@ -447,7 +449,7 @@ EOF
elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
......
......@@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy = core.ParallelExecutor.BuildStrategy
def _is_pserver_mode(main_program):
main = main_program if main_program \
else framework.default_main_program()
for op in main.global_block().ops:
if op.type in ["send", "recv"]:
return True
return False
class ParallelExecutor(object):
"""
ParallelExecutor is designed for data parallelism, which focuses on distributing
......@@ -128,6 +137,11 @@ class ParallelExecutor(object):
build_strategy = BuildStrategy()
build_strategy.num_trainers = num_trainers
build_strategy.trainer_id = trainer_id
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy.is_distribution = _is_pserver_mode(
main_program) or num_trainers > 1
# step4: get main_program, scope, local_scopes
main = main_program if main_program \
......
......@@ -47,7 +47,8 @@ class TestConv2dInt8Op(TestConv2dOp):
self.init_group()
self.init_dilation()
self.init_test_case()
self.init_dtype()
self.init_fuse_relu()
self.init_data_type()
conv2d_param = {
'stride': self.stride,
......@@ -78,7 +79,11 @@ class TestConv2dInt8Op(TestConv2dOp):
np.round((input_shift) * self.scale_in).astype(np.int32),
filter_int, self.groups,
conv2d_param).astype(np.float32) * scale_output_shift
output = np.round(output1 - output2).astype(self.dsttype)
if self.fuse_relu:
output = np.maximum(np.round(output1 - output2),
0).astype(self.dsttype)
else:
output = np.round(output1 - output2).astype(self.dsttype)
else:
filter_int = np.round(filter *
self.scale_weights[0]).astype(np.int32)
......@@ -87,7 +92,15 @@ class TestConv2dInt8Op(TestConv2dOp):
output1 = conv2d_forward_refer(
input.astype(np.int32), filter_int, self.groups,
conv2d_param).astype(np.float32)
output = np.round(output1 * scale_output_shift).astype(self.dsttype)
if self.fuse_relu:
output = np.maximum(
np.round(output1 * (self.scale_out / (
self.scale_in * self.scale_weights[0]))),
0).astype(self.dsttype)
else:
output = np.round(output1 * (self.scale_out / (
self.scale_in *
self.scale_weights[0]))).astype(self.dsttype)
self.inputs = {
'Input':
......@@ -106,6 +119,7 @@ class TestConv2dInt8Op(TestConv2dOp):
'Scale_in': self.scale_in,
'Scale_out': self.scale_out,
'Scale_weights': self.scale_weights,
'fuse_relu': self.fuse_relu
}
self.outputs = {'Output': output}
......@@ -129,12 +143,15 @@ class TestConv2dInt8Op(TestConv2dOp):
self.scale_out = 0.5
self.scale_weights = [10.0]
def init_dtype(self):
def init_data_type(self):
self.srctype = np.uint8
self.dsttype = np.int8
def init_fuse_relu(self):
self.fuse_relu = True
#--------------------test conv2d u8 in and s8 out--------------------
#--------------------test conv2d u8 in and u8 out--------------------
class TestConv2d(TestConv2dInt8Op):
......@@ -203,18 +220,43 @@ class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
self.groups = 3
#--------------------test conv2d s8 in and s8 out--------------------
def init_data_type_with_fusion(self, input_dt, fuse_relu):
self.srctype = input_dt
self.dsttype = np.uint8 if fuse_relu else np.int8
def init_fuse_relu(self):
self.fuse_relu = fuse_relu
def create_test_int8_class(parent):
class TestInt8Case(parent):
def init_dtype(self):
self.srctype = np.int8
self.dsttype = np.int8
cls_name = "{0}_{1}".format(parent.__name__, "s8s8")
TestInt8Case.__name__ = cls_name
globals()[cls_name] = TestInt8Case
#--------------------test conv2d s8 in and u8 out--------------------
class TestS8U8Case(parent):
def init_data_type(self):
init_data_type_with_fusion(self, np.int8, True)
#--------------------test conv2d s8 in and s8 out--------------------
class TestS8S8Case(parent):
def init_data_type(self):
init_data_type_with_fusion(self, np.int8, False)
#--------------------test conv2d u8 in and s8 out--------------------
class TestU8S8Case(parent):
def init_data_type(self):
init_data_type_with_fusion(self, np.uint8, False)
cls_name_s8u8 = "{0}_relu_{1}".format(parent.__name__, "1")
cls_name_s8s8 = "{0}_relu_{1}".format(parent.__name__, "0")
cls_name_u8s8 = "{0}_relu_{1}".format(parent.__name__, "0")
TestS8U8Case.__name__ = cls_name_s8u8
TestS8S8Case.__name__ = cls_name_s8s8
TestU8S8Case.__name__ = cls_name_u8s8
globals()[cls_name_s8u8] = TestS8U8Case
globals()[cls_name_s8s8] = TestS8S8Case
globals()[cls_name_u8s8] = TestU8S8Case
create_test_int8_class(TestConv2dInt8Op)
......
......@@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase):
exe.run(startup_prog)
build_strategy = fluid.BuildStrategy()
if with_double_buffer:
build_strategy.enable_data_balance = True
exec_strategy = fluid.ExecutionStrategy()
parallel_exe = fluid.ParallelExecutor(
use_cuda=self.use_cuda,
......