Commit b5670d4c authored by W wangguibao

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into async_executor

@@ -66,6 +66,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization." OFF)
......
@@ -58,19 +58,21 @@ ExternalProject_Add(
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
-DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN}
-DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
)
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
add_dependencies(extern_anakin protobuf mklml)
add_library(anakin_shared SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
-add_dependencies(anakin_shared extern_anakin protobuf mklml)
+add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
-add_dependencies(anakin_saber extern_anakin protobuf mklml)
+add_dependencies(anakin_saber extern_anakin)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
@@ -137,6 +137,10 @@ cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -164,7 +168,7 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
......
@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
BuildStrategy strategy_;
};
-std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
-    const {
+std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
+    bool finalize_strategy) const {
if (is_finalized_) {
return pass_builder_;
}
pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
if (finalize_strategy) {
is_finalized_ = true;
}
return pass_builder_;
}
@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
#else
const bool use_cuda) const {
#endif
-// Create a default one if not initialized by user.
-if (!pass_builder_) {
-  CreatePassesFromStrategy();
-}
+// Create a default one if not finalized by user.
+CreatePassesFromStrategy(false);
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
......
@@ -75,12 +75,20 @@ struct BuildStrategy {
bool remove_unnecessary_lock_{false};
// NOTE:
// Before you add new options, think if it's a general strategy that works
// with other strategy. If not, the strategy should be created through
// CreatePassesFromStrategy and the pass can be managed separately.
// User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes
// from python side.
// A new PassBuilder is created based on configs defined above and
// passes are owned by the PassBuilder.
-std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
+std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
+    bool finalize_strategy) const;
bool IsFinalized() const { return is_finalized_; }
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
@@ -97,6 +105,7 @@ struct BuildStrategy {
#endif
private:
mutable bool is_finalized_ = false;
mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
};
......
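To make the new two-step flow concrete, here is a minimal usage sketch implied by the finalize_strategy flag and the is_finalized_ guard above; the AppendPass call and the pass name are illustrative assumptions, not part of this commit:

// Sketch only: finalize the pass list once, then customize it before Apply().
paddle::framework::details::BuildStrategy strategy;
// finalize_strategy=true sets is_finalized_, so a later Apply() -- which calls
// CreatePassesFromStrategy(false) -- reuses this builder instead of rebuilding
// it and silently dropping the customization below.
auto builder = strategy.CreatePassesFromStrategy(/*finalize_strategy=*/true);
builder->AppendPass("fuse_elewise_add_act_pass");  // hypothetical extra pass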
@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
@@ -25,6 +26,7 @@ limitations under the License. */
DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
namespace paddle {
namespace framework {
@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
}
}
static void EnableFusedOp(ExecutorPrepareContext* ctx) {
#ifdef PADDLE_WITH_NGRAPH
VLOG(3) << "use_ngraph=True";
auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
for (auto& interval : intervals) {
auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
interval.at(0), interval.at(1));
*interval[0] = std::unique_ptr<OperatorBase>(fused_op);
}
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
ctx->ops_.erase(it->at(0) + 1, it->at(1));
}
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
return ctx;
}
@@ -486,6 +507,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
<< "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
#endif
}
} // namespace framework
} // namespace paddle
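For orientation, a minimal sketch of how this hook is exercised; it assumes a build with PADDLE_WITH_NGRAPH, plus an existing ProgramDesc named program and Scope named scope, which are placeholders here:

// Sketch: with the flag on, Prepare() runs EnableFusedOp and replaces every
// supported op interval between feed and fetch with one FusedOperator.
FLAGS_use_ngraph = true;  // or pass --use_ngraph=true on the command line
paddle::framework::Executor exe(paddle::platform::CPUPlace());
auto ctx = exe.Prepare(program, /*block_id=*/0);
exe.RunPreparedContext(ctx.get(), &scope, /*create_local_scope=*/true,
                       /*create_vars=*/true);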
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <functional>
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {};
void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map);
}
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <functional>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
class NgraphBridge {
public:
static std::map<
std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NG_NODE_MAP;
explicit NgraphBridge(
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map)
: ngb_node_map(var_node_map) {}
void build_graph(const std::shared_ptr<OperatorBase>& op);
private:
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map;
};
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <glog/logging.h>
#include <algorithm>
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
namespace paddle {
namespace framework {
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64},
{proto::VarType::INT32, ngraph::element::i32},
{proto::VarType::INT64, ngraph::element::i64},
{proto::VarType::BOOL, ngraph::element::boolean},
};
typedef enum { /* nGraph support state on ops */
FULL_TRAIN, /* Support full ops for train */
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
class NgraphOperator {
public:
explicit NgraphOperator(const Scope& scope, const platform::Place& place,
const std::vector<std::shared_ptr<OperatorBase>>& ops,
const std::unordered_map<
std::string, ngraph::element::Type>& var_type_map,
const std::unordered_set<std::string>& persist,
const std::unordered_set<std::string>& fetches,
const std::unordered_set<std::string>& post_op_inputs,
op_state ng_op_state)
: scope_(scope),
place_(place),
fused_ops_(ops),
var_type_map_(var_type_map),
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {}
void Run(const Scope& scope, const platform::Place& place) const;
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
FusedOperator::FusedOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
intervals;
if (ops->empty()) {
return intervals;
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left;
}
if (left == size) {
return intervals;
}
while (left < size && ops->at(left)->Type() == kFeedOpType) {
++left;
}
size_t right = left;
while (right < size && ops->at(right)->Type() != kFetchOpType) {
++right;
}
if (right == size) {
return intervals;
}
if (left >= right) return intervals;
// (left, right - 1) represents indices between feed and fetch
size_t pivot = left;
while (pivot < right) {
auto op_type = ops->at(pivot)->Type();
if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
++pivot;
} else {
size_t start = pivot, end = start;
while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops->at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot;
++end;
}
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
interval = {ops->begin() + start, ops->begin() + end};
intervals.push_back(interval);
}
} // end while
return intervals;
}
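For intuition, a small worked example of what this routine returns (the op sequence is hypothetical, not from the source):

// Suppose *ops = {feed, feed, mul, elementwise_add, some_unsupported_op, fetch}
// and NgraphBridge::NG_NODE_MAP has converters for mul and elementwise_add only.
// Then left stops at index 2, right at index 5, and the single interval returned
// is {ops->begin() + 2, ops->begin() + 4}: the half-open run covering mul and
// elementwise_add, which EnableFusedOp later replaces with one FusedOperator.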
FusedOperator::FusedOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), pdesc_(prog), block_(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
}
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
(*it)->Type() != kFetchOpType; ++it) {
for (auto& var_name_item : (*it)->Inputs()) {
for (auto& var_name : var_name_item.second) {
post_op_inputs_.insert(var_name);
}
}
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_full_ = true;
}
Process();
}
void FusedOperator::Process() {
auto& bdesc = pdesc_.Block(block_);
for (auto& var : bdesc.AllVars()) {
if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
var->GetType() == proto::VarType::LOD_TENSOR ||
var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
continue;
}
auto var_name = var->Name();
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var_name != "fetch" && var_name != "feed") {
auto pd_type = var->GetDataType();
if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
var_name);
}
var_type_map_[var_name] = pd2ng_type_map[pd_type];
}
if (var->Persistable()) {
persistables_.insert(var->Name());
}
}
for (auto* op : bdesc.AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
fetches_.insert(fetch_target_name);
}
}
}
void FusedOperator::RunImpl(const Scope& scope,
const platform::Place& place) const {
op_state ng_op_state = PARTIAL_TEST;
auto& bdesc = pdesc_.Block(block_);
for (auto* op : bdesc.AllOps()) {
if (op->Type().find("_grad") != std::string::npos) {
ng_op_state = PARTIAL_TRAIN;
break;
}
}
if (is_full_) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
}
NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
persistables_, fetches_, post_op_inputs_,
ng_op_state);
ngraph_op.Run(scope, place);
}
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <atomic>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
class FusedOperator : public OperatorBase {
public:
static std::vector<
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
FusedOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
explicit FusedOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type = "fused_op", const VariableNameMap& inputs = {},
const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
void RunImpl(const Scope& scope, const platform::Place& place) const final;
private:
const ProgramDesc pdesc_;
size_t block_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
bool is_full_ = false;
void Process();
};
} // namespace framework
} // namespace paddle
#endif
@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-"elementwise_add", "dropout"});
+"elementwise_add", "dropout", "split"});
if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) {
......
@@ -548,4 +548,5 @@ USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER(concat);
USE_TRT_CONVERTER(dropout);
USE_TRT_CONVERTER(pad);
USE_TRT_CONVERTER(split);
#endif
@@ -15,7 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
-#include <thread>
+#include <thread>  // NOLINT
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
......
@@ -23,7 +23,7 @@ limitations under the License. */
#include <memory>
#include <thread> //NOLINT
-#include "utils.h"
+#include "utils.h"  // NOLINT
DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_bool(use_gpu, false, "Whether use gpu.");
......
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......
@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig {
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1);
bool use_tensorrt() const { return use_tensorrt_; }
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
void EnableMKLDNN();
......
@@ -91,7 +91,7 @@ class CpuPassStrategy : public PassStrategy {
virtual ~CpuPassStrategy() = default;
-virtual void EnableMKLDNN() override {
+void EnableMKLDNN() override {
// TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifdef PADDLE_WITH_MKLDNN
passes_.insert(passes_.begin(), "mkldnn_placement_pass");
@@ -123,7 +123,7 @@ class GpuPassStrategy : public PassStrategy {
GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {}
-virtual void EnableMKLDNN() override;
+void EnableMKLDNN() override;
virtual ~GpuPassStrategy() = default;
};
......
nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(plugin)
add_subdirectory(convert)
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc
-DEPS tensorrt_engine operator scope framework_proto op_registry)
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
+pad_op.cc split_op.cc
+DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
@@ -28,6 +29,8 @@ nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
split_op concat_op SERIAL)
@@ -19,7 +19,7 @@ namespace inference {
namespace tensorrt {
/*
-* MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
+* ConcatOp
*/
class ConcatOpConverter : public OpConverter {
public:
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* SplitOp.
*/
class SplitOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(40) << "convert a fluid split op to tensorrt split layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
auto input_dims = input->getDimensions();
int input_num = op_desc.Input("X").size();
size_t output_num = op_desc.Output("Out").size();
// Get Attrs
PADDLE_ENFORCE(input_num == 1);
int axis = boost::get<int>(op_desc.GetAttr("axis"));
std::vector<int> output_lengths =
boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
PADDLE_ENFORCE(axis != 0);
if (axis < 0) {
axis += input_dims.nbDims;
} else {
axis -= 1;
}
PADDLE_ENFORCE(output_lengths.size() == output_num);
//
SplitPlugin* plugin = new SplitPlugin(axis, output_lengths);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
std::string layer_name = "split (Output: ";
for (size_t i = 0; i < output_num; i++) {
auto output_name = op_desc.Output("Out")[i];
layer->getOutput(i)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(i));
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
}
layer->setName((layer_name + ")").c_str());
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(split, SplitOpConverter);
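For intuition on the axis handling in the converter above, a worked example with illustrative values:

// Paddle input : shape [N, C, H, W], split attribute axis = 1 (the channel axis)
// TRT ITensor  : shape [C, H, W], because TensorRT keeps the batch dim implicit
// => the converter passes axis - 1 = 0 to SplitPlugin; a negative Paddle axis is
//    instead wrapped by adding input_dims.nbDims, as in the branch above.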
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(split_op, test) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2));
validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("split");
desc.SetInput("X", {"split_input"});
desc.SetOutput("Out", {"split_out1", "split_out2"});
int num = 0;
int axis = 1;
std::vector<int> output_lengths = {2, 1};
desc.SetAttr("axis", axis);
desc.SetAttr("num", num);
desc.SetAttr("sections", output_lengths);
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(split);
@@ -255,6 +255,12 @@ void TensorRTEngine::freshDeviceId() {
cudaSetDevice(device_);
}
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) {
owned_plugin_.emplace_back(plugin);
return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
namespace paddle { namespace paddle {
...@@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase { ...@@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase {
void SetRuntimeBatch(size_t batch_size); void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch(); int GetRuntimeBatch();
int GetDevice() { return device_; } int GetDevice() { return device_; }
nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
int nbInputs, PluginTensorRT*);
// A pointer to CPU memory is needed of the TRT weight.
// Before TRT runs, fluid loads weight into GPU storage.
@@ -164,8 +167,10 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
itensor_map_;
// The specific GPU id that the TensorRTEngine bounded to.
int device_;
std::vector<std::unique_ptr<PluginTensorRT>> owned_plugin_;
// TensorRT related internal members
template <typename T>
......
nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <cstring>
#include <type_traits>
#include <vector>
template <typename T>
inline void SerializeValue(void** buffer, T const& value);
template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size,
T* value);
namespace {
template <typename T, class Enable = void>
struct Serializer {};
template <typename T>
struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
static size_t SerializedSize(T const& value) { return sizeof(T); }
static void Serialize(void** buffer, T const& value) {
std::memcpy(*buffer, &value, sizeof(T));
reinterpret_cast<char*&>(*buffer) += sizeof(T);
}
static void Deserialize(void const** buffer, size_t* buffer_size, T* value) {
assert(*buffer_size >= sizeof(T));
std::memcpy(value, *buffer, sizeof(T));
reinterpret_cast<char const*&>(*buffer) += sizeof(T);
*buffer_size -= sizeof(T);
}
};
template <>
struct Serializer<const char*> {
static size_t SerializedSize(const char* value) { return strlen(value) + 1; }
static void Serialize(void** buffer, const char* value) {
std::strcpy(static_cast<char*>(*buffer), value);
reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
}
static void Deserialize(void const** buffer, size_t* buffer_size,
const char** value) {
*value = static_cast<char const*>(*buffer);
size_t data_size = strnlen(*value, *buffer_size) + 1;
assert(*buffer_size >= data_size);
reinterpret_cast<char const*&>(*buffer) += data_size;
*buffer_size -= data_size;
}
};
template <typename T>
struct Serializer<std::vector<T>,
typename std::enable_if<std::is_arithmetic<T>::value ||
std::is_enum<T>::value ||
std::is_pod<T>::value>::type> {
static size_t SerializedSize(std::vector<T> const& value) {
return sizeof(value.size()) + value.size() * sizeof(T);
}
static void Serialize(void** buffer, std::vector<T> const& value) {
SerializeValue(buffer, value.size());
size_t nbyte = value.size() * sizeof(T);
std::memcpy(*buffer, value.data(), nbyte);
reinterpret_cast<char*&>(*buffer) += nbyte;
}
static void Deserialize(void const** buffer, size_t* buffer_size,
std::vector<T>* value) {
size_t size;
DeserializeValue(buffer, buffer_size, &size);
value->resize(size);
size_t nbyte = value->size() * sizeof(T);
assert(*buffer_size >= nbyte);
std::memcpy(value->data(), *buffer, nbyte);
reinterpret_cast<char const*&>(*buffer) += nbyte;
*buffer_size -= nbyte;
}
};
} // namespace
template <typename T>
inline size_t SerializedSize(T const& value) {
return Serializer<T>::SerializedSize(value);
}
template <typename T>
inline void SerializeValue(void** buffer, T const& value) {
return Serializer<T>::Serialize(buffer, value);
}
template <typename T>
inline void DeserializeValue(void const** buffer, size_t* buffer_size,
T* value) {
return Serializer<T>::Deserialize(buffer, buffer_size, value);
}
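A small self-contained sketch of how these helpers round-trip data, mirroring the way SplitPlugin::serialize and its deserializing constructor use them; the values are illustrative and assume this header is included:

// Serialize an int and a std::vector<int> into a byte buffer, then read them back.
inline void SerializeRoundTripExample() {
  int axis = 0;
  std::vector<int> sections = {2, 1};
  std::vector<char> storage(SerializedSize(axis) + SerializedSize(sections));
  void* write_ptr = storage.data();
  SerializeValue(&write_ptr, axis);
  SerializeValue(&write_ptr, sections);
  void const* read_ptr = storage.data();
  size_t remaining = storage.size();
  int axis_out = -1;
  std::vector<int> sections_out;
  DeserializeValue(&read_ptr, &remaining, &axis_out);
  DeserializeValue(&read_ptr, &remaining, &sections_out);
  // Now axis_out == 0, sections_out == {2, 1}, and remaining == 0.
}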
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
nvinfer1::Dims SplitPlugin::getOutputDimensions(int index,
const nvinfer1::Dims* inputDims,
int nbInputs) {
assert(nbInputs == 1);
assert(index < this->getNbOutputs());
nvinfer1::Dims const& input_dims = inputDims[0];
nvinfer1::Dims output_dims = input_dims;
output_dims.d[axis_] = output_length_.at(index);
return output_dims;
}
int SplitPlugin::initialize() {
std::vector<int> segment_offsets(1, 0);
for (int i = 0; i < this->getNbOutputs(); ++i) {
segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
}
segment_offsets_ = segment_offsets;
nvinfer1::Dims dims = this->getInputDims(0);
nx_ = 1;
for (int i = dims.nbDims - 1; i > axis_; --i) {
nx_ *= dims.d[i];
}
ny_ = dims.d[axis_];
nz_ = 1;
for (int i = axis_ - 1; i >= 0; --i) {
nz_ *= dims.d[i];
}
return 0;
}
int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
void** outputs, void* workspace, cudaStream_t stream) {
auto const& input_dims = this->getInputDims(0);
int input_size = 0;
float const* idata = reinterpret_cast<float const*>(inputs[0]);
float** odatas = reinterpret_cast<float**>(outputs);
// kernel impl here.
int inputBatchOffset = nx_ * ny_ * nz_;
for (size_t i = 0; i < this->getNbOutputs(); i++) {
for (size_t j = 0; j < batchSize; j++) {
cudaMemcpyAsync(
odatas[i] +
j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ *
sizeof(float),
inputs[0] +
(inputBatchOffset * j + segment_offsets_[i] * nx_) *
sizeof(float),
(segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float),
cudaMemcpyDeviceToDevice, stream);
}
}
return cudaGetLastError() != cudaSuccess;
}
} // tensorrt
} // inference
} // paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class SplitPlugin : public PluginTensorRT {
int axis_;
std::vector<int> output_length_;
int nx_, ny_, nz_;
std::vector<int> segment_offsets_;
protected:
virtual size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(output_length_) +
getBaseSerializationSize();
}
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
// It should not be called by users.
virtual void serialize(void *buffer) override {
serializeBase(buffer);
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, output_length_);
}
public:
SplitPlugin(int axis, std::vector<int> const &output_lengths)
: axis_(axis), output_length_(output_lengths) {
assert(axis <= nvinfer1::Dims::MAX_DIMS);
}
// It was used for tensorrt deserialization.
// It should not be called by users.
SplitPlugin(void const *serialData, size_t serialLength) {
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &axis_);
DeserializeValue(&serialData, &serialLength, &output_length_);
}
SplitPlugin *clone() const override {
return new SplitPlugin(axis_, output_length_);
}
virtual const char *getPluginType() const override { return "split"; }
virtual int getNbOutputs() const override { return output_length_.size(); }
virtual nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *inputs,
int nbInputDims) override;
virtual int initialize() override;
virtual int enqueue(int batchSize, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream) override;
};
} // tensorrt
} // inference
} // paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
void PluginTensorRT::serializeBase(void*& buffer) {
SerializeValue(&buffer, input_dims_);
SerializeValue(&buffer, max_batch_size_);
SerializeValue(&buffer, data_type_);
SerializeValue(&buffer, data_format_);
}
void PluginTensorRT::deserializeBase(void const*& serialData,
size_t& serialLength) {
DeserializeValue(&serialData, &serialLength, &input_dims_);
DeserializeValue(&serialData, &serialLength, &max_batch_size_);
DeserializeValue(&serialData, &serialLength, &data_type_);
DeserializeValue(&serialData, &serialLength, &data_format_);
}
size_t PluginTensorRT::getBaseSerializationSize() {
return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) +
SerializedSize(data_type_) + SerializedSize(data_format_));
}
bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
nvinfer1::PluginFormat format) const {
return ((type == nvinfer1::DataType::kFLOAT) &&
(format == nvinfer1::PluginFormat::kNCHW));
}
void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims,
int nbInputs,
const nvinfer1::Dims* outputDims,
int nbOutputs, nvinfer1::DataType type,
nvinfer1::PluginFormat format,
int maxBatchSize) {
data_type_ = type;
data_format_ = format;
input_dims_.assign(inputDims, inputDims + nbInputs);
max_batch_size_ = maxBatchSize;
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <cstring>
#include <iostream>
#include <unordered_map>
#include <vector>
#include "NvInfer.h"
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class PluginTensorRT : public nvinfer1::IPluginExt {
public:
PluginTensorRT() {}
PluginTensorRT(const void* serialized_data, size_t length) {}
nvinfer1::Dims const& getInputDims(int index) const {
return input_dims_.at(index);
}
size_t getMaxBatchSize() const { return max_batch_size_; }
nvinfer1::DataType getDataType() const { return data_type_; }
nvinfer1::PluginFormat getDataFormat() const { return data_format_; }
virtual const char* getPluginVersion() const { return "1"; }
size_t getWorkspaceSize(int) const override { return 0; }
void terminate() override {}
virtual ~PluginTensorRT() {}
// Check format support. The default is FLOAT32 and NCHW.
bool supportsFormat(nvinfer1::DataType type,
nvinfer1::PluginFormat format) const override;
void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
const nvinfer1::Dims* outputDims, int nbOutputs,
nvinfer1::DataType type,
nvinfer1::PluginFormat format,
int maxBatchSize) override;
// *NOTE* The following functions need to be overrided in the subclass.
virtual nvinfer1::IPluginExt* clone() const = 0;
virtual const char* getPluginType() const = 0;
// Initialize the layer for execution. This is called when the engine is
// created.
int initialize() override { return 0; }
// Serialize the layer config to buffer.
virtual void serialize(void* buffer) = 0;
virtual size_t getSerializationSize() = 0;
virtual int enqueue(int batchSize, const void* const* inputs, void** outputs,
void* workspace, cudaStream_t stream) = 0;
protected:
// Deserialize input_dims, max_batch_size, data_type, data_format
void deserializeBase(void const*& serialData, size_t& serialLength);
size_t getBaseSerializationSize();
// Serialize input_dims, max_batch_size, data_type, data_format
void serializeBase(void*& buffer);
std::vector<nvinfer1::Dims> input_dims_;
size_t max_batch_size_;
nvinfer1::DataType data_type_;
nvinfer1::PluginFormat data_format_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
@@ -108,8 +108,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif()
inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
-ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
+ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
endif()
@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) {
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+    input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
@@ -216,7 +217,9 @@ TEST(Analyzer_dam, compare) {
SetInput(&input_slots_all);
if (FLAGS_use_analysis) {
-CompareNativeAndAnalysis(cfg, input_slots_all);
+CompareNativeAndAnalysis(
+    reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+    input_slots_all);
}
}
......
@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+    input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result
@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-CompareNativeAndAnalysis(cfg, input_slots_all);
+CompareNativeAndAnalysis(
+    reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
} // namespace analysis
......
@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+    input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result
@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-CompareNativeAndAnalysis(cfg, input_slots_all);
+CompareNativeAndAnalysis(
+    reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
} // namespace inference
......
@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+    input_slots_all, &outputs, FLAGS_num_threads);
}
TEST(Analyzer_resnet50, profile) { profile(); }
@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-CompareNativeAndAnalysis(cfg, input_slots_all);
+CompareNativeAndAnalysis(
+    reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
TEST(Analyzer_resnet50, compare) { compare(); }
......
@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-LOG(INFO) << "to test prediction";
-TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+    input_slots_all, &outputs, FLAGS_num_threads);
}
// Check the fuse status
@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-CompareNativeAndAnalysis(cfg, input_slots_all);
+CompareNativeAndAnalysis(
+    reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Test Multi-Thread.
@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */);
+TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+    input_slots_all, &outputs, 4 /* multi_thread */);
}
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
......
@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+    input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result
@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
-CompareNativeAndAnalysis(cfg, input_slots_all);
+CompareNativeAndAnalysis(
+    reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
} // namespace inference
......
...@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) { ...@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) { ...@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) { ...@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1) { if (FLAGS_num_threads == 1) {
// Get output // Get output
...@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) { ...@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
...@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { ...@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -94,7 +94,8 @@ void profile(bool use_mkldnn = false) { ...@@ -94,7 +94,8 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
const float ocr_result_data[] = { const float ocr_result_data[] = {
...@@ -136,7 +137,8 @@ void compare(bool use_mkldnn = false) { ...@@ -136,7 +137,8 @@ void compare(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_vis, compare) { compare(); } TEST(Analyzer_vis, compare) { compare(); }
......
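All of the analyzer-test hunks above make the same mechanical change: instead of handing a concrete contrib::AnalysisConfig to the helpers, each test now passes a pointer to the common PaddlePredictor::Config base via reinterpret_cast, and the helper casts back to the concrete type. Below is a minimal standalone sketch of that calling pattern; Config, NativeCfg, AnalysisCfg and TestPredictionSketch are made-up stand-ins (assuming, as the diff suggests, a single-inheritance chain of config types), not Paddle's real API.

#include <iostream>
#include <string>

struct Config { std::string model_dir; };                  // stand-in for PaddlePredictor::Config
struct NativeCfg : Config { bool use_gpu = false; };        // stand-in for NativeConfig
struct AnalysisCfg : NativeCfg { bool ir_optim = true; };   // stand-in for contrib::AnalysisConfig

// Mirrors the helper side: one entry point takes the base pointer and a flag,
// then recovers the concrete config exactly as CreateTestPredictor does.
void TestPredictionSketch(const Config *config, bool use_analysis) {
  if (use_analysis) {
    const auto *cfg = reinterpret_cast<const AnalysisCfg *>(config);
    std::cout << "analysis config, ir_optim=" << cfg->ir_optim << "\n";
  } else {
    const auto *cfg = reinterpret_cast<const NativeCfg *>(config);
    std::cout << "native config, use_gpu=" << cfg->use_gpu << "\n";
  }
}

int main() {
  AnalysisCfg cfg;
  cfg.model_dir = "/path/to/model";  // hypothetical path
  // Same shape as the calls in the hunks above.
  TestPredictionSketch(reinterpret_cast<const Config *>(&cfg), true);
}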
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ostream>
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
namespace inference {
thread_local int num_spaces = 0;
static std::string GenSpaces(int num_spaces) {
std::ostringstream os;
for (int i = 0; i < num_spaces; ++i) {
os << " ";
}
return os.str();
}
std::ostream &operator<<(std::ostream &os,
const PaddlePredictor::Config &config) {
os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n";
num_spaces++;
os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os << GenSpaces(num_spaces) << "NativeConfig {\n";
num_spaces++;
os << *reinterpret_cast<const PaddlePredictor::Config *>(&config);
os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n";
os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
os << GenSpaces(num_spaces)
<< "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces)
<< "specify_input_name: " << config.specify_input_name << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
std::ostream &operator<<(std::ostream &os,
const contrib::AnalysisConfig &config) {
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config);
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
<< "\n";
os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt()
<< "\n";
os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
} // namespace inference
} // namespace paddle
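The new config_printer.h above is a small pretty-printer: each operator<< prints its own fields, forwards to the overload of the config it extends, and a shared thread_local counter drives the indentation level. The standalone sketch below reproduces that scheme with made-up BaseConfig/DerivedConfig structs so it compiles on its own; it illustrates the design only and is not Paddle code.

#include <iostream>
#include <sstream>
#include <string>

namespace sketch {

thread_local int num_spaces = 0;

static std::string GenSpaces(int n) {
  std::ostringstream os;
  for (int i = 0; i < n; ++i) os << "  ";
  return os.str();
}

struct BaseConfig { std::string model_dir; };
struct DerivedConfig : BaseConfig { bool use_gpu = true; };

std::ostream &operator<<(std::ostream &os, const BaseConfig &c) {
  os << GenSpaces(num_spaces) << "BaseConfig {\n";
  num_spaces++;
  os << GenSpaces(num_spaces) << "model_dir: " << c.model_dir << "\n";
  num_spaces--;
  os << GenSpaces(num_spaces) << "}\n";
  return os;
}

std::ostream &operator<<(std::ostream &os, const DerivedConfig &c) {
  os << GenSpaces(num_spaces) << "DerivedConfig {\n";
  num_spaces++;
  os << static_cast<const BaseConfig &>(c);  // reuse the base printer, one level deeper
  os << GenSpaces(num_spaces) << "use_gpu: " << c.use_gpu << "\n";
  num_spaces--;
  os << GenSpaces(num_spaces) << "}\n";
  return os;
}

}  // namespace sketch

int main() {
  sketch::DerivedConfig cfg;
  cfg.model_dir = "/path/to/model";  // hypothetical path
  std::cout << cfg;
}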
@@ -19,13 +19,16 @@
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 DEFINE_bool(use_analysis, true,
             "Running the inference program in analysis mode.");
+DECLARE_bool(profile);
 namespace paddle {
 namespace inference {
-using contrib::AnalysisConfig;
+void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
+  if (use_analysis) {
+    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
+    return;
+  }
+  LOG(INFO) << *config;
+}
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
 }
 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
-    const AnalysisConfig &config, bool use_analysis = true) {
+    const PaddlePredictor::Config *config, bool use_analysis = true) {
   if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
-  } else {
-    return CreatePaddlePredictor<NativeConfig>(config);
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(
+        *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
   }
+  return CreatePaddlePredictor<NativeConfig>(
+      *(reinterpret_cast<const NativeConfig *>(config)));
 }
 size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
 }
 void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
-                       const std::string &dirname) {
+                       const std::string &dirname, bool is_combined = true,
+                       std::string model_filename = "model",
+                       std::string params_filename = "params") {
   // Set fake_image_data
   PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
-  std::vector<std::vector<int64_t>> feed_target_shapes =
-      GetFeedTargetShapes(dirname, true, "model", "params");
+  std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
+      dirname, is_combined, model_filename, params_filename);
+  std::ostringstream os;
+  for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
+    os << "feed target " << i << ": {" << feed_target_shapes[i][0];
+    for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) {
+      os << ", " << feed_target_shapes[i][j];
+    }
+    os << "}\n";
+  }
+  LOG(INFO) << os.str();
   int dim1 = feed_target_shapes[0][1];
   int dim2 = feed_target_shapes[0][2];
   int dim3 = feed_target_shapes[0][3];
@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
 }
 void TestOneThreadPrediction(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   auto predictor = CreateTestPredictor(config, use_analysis);
-  Timer timer;
-  timer.tic();
+  // warmup run
+  LOG(INFO) << "Warm up run...";
+  {
+    Timer warmup_timer;
+    warmup_timer.tic();
+    predictor->Run(inputs[0], outputs, batch_size);
+    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
+#if !defined(_WIN32)
+    if (FLAGS_profile) {
+      paddle::platform::ResetProfiler();
+    }
+#endif
+  }
+  LOG(INFO) << "Run " << num_times << " times...";
+  {
+    Timer run_timer;
+    run_timer.tic();
     for (int i = 0; i < num_times; i++) {
       for (size_t j = 0; j < inputs.size(); j++) {
-        predictor->Run(inputs[j], outputs);
+        predictor->Run(inputs[j], outputs, batch_size);
       }
     }
-  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
-            inputs.size());
+    PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times,
+              inputs.size());
+  }
 }
 void TestMultiThreadPrediction(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<PaddleTensor> *outputs, int num_threads,
     bool use_analysis = true) {
@@ -200,12 +242,11 @@ void TestMultiThreadPrediction(
   }
 }
-void TestPrediction(const AnalysisConfig &config,
+void TestPrediction(const PaddlePredictor::Config *config,
                     const std::vector<std::vector<PaddleTensor>> &inputs,
                     std::vector<PaddleTensor> *outputs, int num_threads,
                     bool use_analysis = FLAGS_use_analysis) {
-  LOG(INFO) << "use_analysis: " << use_analysis
-            << ", use_mkldnn: " << config.use_mkldnn();
+  PrintConfig(config, use_analysis);
   if (num_threads == 1) {
     TestOneThreadPrediction(config, inputs, outputs, use_analysis);
   } else {
@@ -215,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config,
 }
 void CompareNativeAndAnalysis(
-    const AnalysisConfig &config,
+    const PaddlePredictor::Config *config,
     const std::vector<std::vector<PaddleTensor>> &inputs) {
-  LOG(INFO) << "use_mkldnn: " << config.use_mkldnn();
+  PrintConfig(config, true);
   std::vector<PaddleTensor> native_outputs, analysis_outputs;
   TestOneThreadPrediction(config, inputs, &native_outputs, false);
   TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
......
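The tester_helper.h changes above do two things: every helper now takes const PaddlePredictor::Config * and casts back to the concrete config, and TestOneThreadPrediction splits an untimed warm-up run from the averaged timing loop. The sketch below shows only that warm-up/timing split, with a generic callable standing in for predictor->Run; TimeMs and BenchmarkSketch are hypothetical names, not part of the Paddle helpers.

#include <chrono>
#include <functional>
#include <iostream>

// Wall-clock time of a single callable invocation, in milliseconds.
double TimeMs(const std::function<void()> &fn) {
  auto start = std::chrono::high_resolution_clock::now();
  fn();
  auto end = std::chrono::high_resolution_clock::now();
  return std::chrono::duration<double, std::milli>(end - start).count();
}

void BenchmarkSketch(const std::function<void()> &run_once, int repeat) {
  // One warm-up run, reported separately: lazy allocations, cache warm-up and
  // algorithm selection happen here instead of polluting the averaged numbers.
  std::cout << "warm up: " << TimeMs(run_once) << " ms\n";

  double total_ms = TimeMs([&] {
    for (int i = 0; i < repeat; ++i) run_once();
  });
  std::cout << "average over " << repeat << " runs: " << total_ms / repeat
            << " ms\n";
}

int main() {
  BenchmarkSketch([] { /* predictor->Run(inputs, &outputs, batch_size); */ }, 10);
}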
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 namespace paddle {
-using paddle::contrib::AnalysisConfig;
-DEFINE_string(dirname, "", "Directory of the inference model.");
-NativeConfig GetConfigNative() {
-  NativeConfig config;
-  config.model_dir = FLAGS_dirname;
-  // LOG(INFO) << "dirname " << config.model_dir;
-  config.fraction_of_gpu_memory = 0.15;
-  config.use_gpu = true;
-  config.device = 0;
-  return config;
-}
-void PrepareTRTConfig(AnalysisConfig *config) {
-  config->model_dir = FLAGS_dirname + "/" + "mobilenet";
-  config->fraction_of_gpu_memory = 0.15;
-  config->EnableTensorRtEngine(1 << 10, 5);
-  config->pass_builder()->DeletePass("conv_bn_fuse_pass");
-  config->pass_builder()->DeletePass("fc_fuse_pass");
-  config->pass_builder()->TurnOnDebug();
-}
-void PrepareInputs(std::vector<PaddleTensor> *tensors, int batch_size) {
-  PADDLE_ENFORCE_EQ(tensors->size(), 1UL);
-  auto &tensor = tensors->front();
-  int height = 224;
-  int width = 224;
-  float *data = new float[batch_size * 3 * height * width];
-  memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
-  data[0] = 1.0f;
-  // Prepare inputs
-  tensor.name = "input_0";
-  tensor.shape = std::vector<int>({batch_size, 3, height, width});
-  tensor.data = PaddleBuf(static_cast<void *>(data),
-                          sizeof(float) * (batch_size * 3 * height * width));
-  tensor.dtype = PaddleDType::FLOAT32;
-}
-void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
-  auto config0 = GetConfigNative();
-  config0.model_dir = model_dirname;
-  AnalysisConfig config1(true);
-  PrepareTRTConfig(&config1);
-  config1.model_dir = model_dirname;
-  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-  auto predictor1 = CreatePaddlePredictor(config1);
-  // Prepare inputs
-  std::vector<PaddleTensor> paddle_tensor_feeds(1);
-  PrepareInputs(&paddle_tensor_feeds, batch_size);
-  // Prepare outputs
-  std::vector<PaddleTensor> outputs0;
-  std::vector<PaddleTensor> outputs1;
-  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
-  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-  EXPECT_EQ(num_elements, num_elements1);
-  auto *data0 = static_cast<float *>(outputs0.front().data.data());
-  auto *data1 = static_cast<float *>(outputs1.front().data.data());
-  ASSERT_GT(num_elements, 0UL);
-  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-    EXPECT_NEAR(data0[i], data1[i], 1e-3);
-  }
-}
-TEST(trt_models_test, mobilenet) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "mobilenet");
-}
-TEST(trt_models_test, resnet50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnet50");
-}
-TEST(trt_models_test, resnext50) {
-  CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnext50");
-}
-TEST(trt_models_test, raw_gpu) {
-  std::string model_dir = FLAGS_dirname + "/" + "mobilenet";
-  auto config0 = GetConfigNative();
-  config0.model_dir = model_dir;
-  int batch_size = 2;
-  AnalysisConfig config1(true);
-  config1.fraction_of_gpu_memory = 0.1;
-  config1.enable_ir_optim = true;
-  config1.model_dir = model_dir;
-  auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
-  auto predictor1 = CreatePaddlePredictor(config1);
-  // Prepare inputs
-  std::vector<PaddleTensor> paddle_tensor_feeds(1);
-  PrepareInputs(&paddle_tensor_feeds, batch_size);
-  // Prepare outputs
-  std::vector<PaddleTensor> outputs0;
-  std::vector<PaddleTensor> outputs1;
-  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
-  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
-  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
-  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
-  EXPECT_EQ(num_elements, num_elements1);
-  auto *data0 = static_cast<float *>(outputs0.front().data.data());
-  auto *data1 = static_cast<float *>(outputs1.front().data.data());
-  ASSERT_GT(num_elements, 0UL);
-  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
-    EXPECT_NEAR(data0[i], data1[i], 1e-3);
-  }
-}
+namespace inference {
+DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine.");
+DEFINE_string(prog_filename, "", "Name of model file.");
+DEFINE_string(param_filename, "", "Name of parameters file.");
+template <typename ConfigType>
+void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu,
+               bool use_tensorrt = false, int batch_size = -1) {
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
+    config->param_file = model_dir + "/" + FLAGS_param_filename;
+  } else {
+    config->model_dir = model_dir;
+  }
+  if (use_gpu) {
+    config->use_gpu = true;
+    config->device = 0;
+    config->fraction_of_gpu_memory = 0.15;
+  }
+}
+template <>
+void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
+                                        std::string model_dir, bool use_gpu,
+                                        bool use_tensorrt, int batch_size) {
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    config->prog_file = model_dir + "/" + FLAGS_prog_filename;
+    config->param_file = model_dir + "/" + FLAGS_param_filename;
+  } else {
+    config->model_dir = model_dir;
+  }
+  if (use_gpu) {
+    config->use_gpu = true;
+    config->device = 0;
+    config->fraction_of_gpu_memory = 0.15;
+    if (use_tensorrt) {
+      config->EnableTensorRtEngine(1 << 10, batch_size);
+      config->pass_builder()->DeletePass("conv_bn_fuse_pass");
+      config->pass_builder()->DeletePass("fc_fuse_pass");
+      config->pass_builder()->TurnOnDebug();
+    } else {
+      config->enable_ir_optim = true;
+    }
+  }
+}
+void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                      FLAGS_param_filename);
+  } else {
+    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+  }
+  std::vector<PaddleTensor> outputs;
+  if (use_analysis || use_tensorrt) {
+    contrib::AnalysisConfig config(true);
+    SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
+                                       FLAGS_batch_size);
+    TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
+                   inputs_all, &outputs, FLAGS_num_threads, true);
+  } else {
+    NativeConfig config;
+    SetConfig<NativeConfig>(&config, model_dir, true, false);
+    TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
+                   inputs_all, &outputs, FLAGS_num_threads, false);
+  }
+}
+void compare(std::string model_dir, bool use_tensorrt) {
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+    SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                      FLAGS_param_filename);
+  } else {
+    SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+  }
+  std::vector<PaddleTensor> native_outputs;
+  NativeConfig native_config;
+  SetConfig<NativeConfig>(&native_config, model_dir, true, false,
+                          FLAGS_batch_size);
+  TestOneThreadPrediction(
+      reinterpret_cast<PaddlePredictor::Config*>(&native_config), inputs_all,
+      &native_outputs, false);
+  std::vector<PaddleTensor> analysis_outputs;
+  contrib::AnalysisConfig analysis_config(true);
+  SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
+                                     use_tensorrt, FLAGS_batch_size);
+  TestOneThreadPrediction(
+      reinterpret_cast<PaddlePredictor::Config*>(&analysis_config), inputs_all,
+      &analysis_outputs, true);
+  CompareResult(native_outputs, analysis_outputs);
+}
+TEST(TensorRT_mobilenet, compare) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  compare(model_dir, /* use_tensorrt */ true);
+}
+TEST(TensorRT_resnet50, compare) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  compare(model_dir, /* use_tensorrt */ true);
+}
+TEST(TensorRT_resnext50, compare) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  compare(model_dir, /* use_tensorrt */ true);
+}
+TEST(TensorRT_resnext50, profile) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
+}
+TEST(TensorRT_mobilenet, analysis) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  compare(model_dir, /* use_tensorrt */ false);
+}
+}  // namespace inference
 }  // namespace paddle
 USE_PASS(tensorrt_subgraph_pass);
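The rewritten trt_models_tester.cc above drops the hand-rolled input and comparison code in favour of the shared helpers, and centralises configuration in a SetConfig function template whose full specialization adds the TensorRT-specific knobs for contrib::AnalysisConfig. Below is a standalone sketch of that template-plus-specialization shape with simplified stand-in structs and fields; it is not the real SetConfig signature.

#include <iostream>
#include <string>

struct NativeCfg { std::string model_dir; bool use_gpu = false; };
struct AnalysisCfg : NativeCfg { bool tensorrt = false; int trt_batch = 1; };

// Generic version: fills only the fields every config type shares.
template <typename ConfigType>
void SetCfg(ConfigType *config, const std::string &model_dir, bool use_gpu) {
  config->model_dir = model_dir;
  config->use_gpu = use_gpu;
}

// Full specialization: layers engine-specific switches on top for AnalysisCfg.
template <>
void SetCfg<AnalysisCfg>(AnalysisCfg *config, const std::string &model_dir,
                         bool use_gpu) {
  config->model_dir = model_dir;
  config->use_gpu = use_gpu;
  config->tensorrt = true;
  config->trt_batch = 8;
}

int main() {
  NativeCfg n;
  AnalysisCfg a;
  SetCfg(&n, "/models/mobilenet", true);  // generic template
  SetCfg(&a, "/models/mobilenet", true);  // specialization is selected
  std::cout << "tensorrt enabled: " << a.tensorrt << "\n";
}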
@@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
-static constexpr size_t kNUM_CUDNN_FWD_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
+#if CUDNN_VERSION_MIN(6, 0, 5)
+static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
 static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
     CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
 static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
+#else
+// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc.
+static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
+static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
+static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
+#endif
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
......
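The conv_cudnn hunk above corrects kNUM_CUDNN_FWD_ALGS (previously sized with the backward-filter count) and guards the *_ALGO_COUNT enumerators behind CUDNN_VERSION_MIN, falling back to hard-coded counts for cuDNN 5, which lacks those enumerators. A compilable sketch of the same guard idiom is below; LIB_VERSION is a made-up macro standing in for the real cuDNN version check.

#include <cstddef>
#include <iostream>

#define LIB_VERSION 7000  // pretend this comes from the library's header

#if LIB_VERSION >= 6005
// Newer headers export a *_COUNT enumerator, so size arrays with it directly.
static constexpr size_t kNumFwdAlgos = 8;  // e.g. the library's FWD_ALGO_COUNT
#else
// Older headers have no counter, so hard-code the number of algorithms that
// version actually shipped.
static constexpr size_t kNumFwdAlgos = 7;
#endif

int main() { std::cout << "forward algorithms: " << kNumFwdAlgos << "\n"; }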
@@ -19,8 +19,10 @@ namespace paddle {
 namespace operators {
 namespace math {
-template class SoftmaxFunctor<platform::CPUDeviceContext, float>;
-template class SoftmaxFunctor<platform::CPUDeviceContext, double>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float, true>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, float, false>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double, true>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double, false>;
 template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;
......
@@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<platform::float16>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
-template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
+                              false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16,
+                              true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float, false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double, false>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, float, true>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double, true>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext,
......
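The softmax.cc/softmax.cu hunks above have to spell out every <DeviceContext, T, is_test> combination because SoftmaxFunctor now carries a third template parameter and those files provide the explicit instantiations that other translation units link against. A minimal standalone example of that explicit-instantiation pattern is below; Functor is a made-up type, not Paddle's.

#include <iostream>

template <typename T, bool kIsTest>
struct Functor {
  // Trivial body standing in for the real softmax implementation.
  T Apply(T x) const { return kIsTest ? x : x + static_cast<T>(1); }
};

// Explicit instantiation definitions: force the compiler to emit code for
// these combinations in this translation unit, mirroring the lists in
// softmax.cc and softmax.cu. Any combination not listed here would be missing
// at link time for users that only see the declaration.
template struct Functor<float, true>;
template struct Functor<float, false>;
template struct Functor<double, true>;
template struct Functor<double, false>;

int main() {
  Functor<float, true> f;
  std::cout << f.Apply(1.0f) << "\n";
}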
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 namespace math {
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, bool is_test>
 class SoftmaxFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor* X,
......
@@ -32,9 +32,9 @@ struct ValueClip {
   }
 };
-template <typename DeviceContext, typename T>
-void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
-                                                  const framework::Tensor* X,
-                                                  framework::Tensor* Y) {
+template <typename DeviceContext, typename T, bool is_test>
+void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
+    const DeviceContext& context, const framework::Tensor* X,
+    framework::Tensor* Y) {
   auto logits = EigenMatrix<T>::From(*X);
   auto softmax = EigenMatrix<T>::From(*Y);
@@ -65,6 +65,39 @@ void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
           .broadcast(one_by_class));
 }
+template <typename DeviceContext, typename T>
+class SoftmaxFunctor<DeviceContext, T, true> {
+  void operator()(const DeviceContext& context, const framework::Tensor* X,
+                  framework::Tensor* Y) {
+    auto logits = EigenMatrix<T>::From(*X);
+    auto softmax = EigenMatrix<T>::From(*Y);
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+    auto shifted_logits = (logits -
+                           logits.maximum(along_class)
+                               .eval()
+                               .reshape(batch_by_one)
+                               .broadcast(one_by_class));
+    softmax.device(*context.eigen_device()) = shifted_logits.exp();
+    softmax.device(*context.eigen_device()) = (softmax *
+                                               softmax.sum(along_class)
+                                                   .inverse()
+                                                   .eval()
+                                                   .reshape(batch_by_one)
+                                                   .broadcast(one_by_class));
+  }
+};
 template <typename DeviceContext, typename T>
 void SoftmaxGradFunctor<DeviceContext, T>::operator()(
     const DeviceContext& context, const framework::Tensor* y,
......
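The new partial specialization above is the inference-only path: it still subtracts the row maximum, but skips the ValueClip step the training path applies before exponentiation. A plain-C++ (non-Eigen) sketch of the two paths is below; kClipMin is a stand-in for ValueClip's threshold, not a value taken from the source.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

std::vector<float> Softmax(const std::vector<float>& logits, bool is_test) {
  const float kClipMin = -64.0f;  // stand-in for the ValueClip threshold
  float max_v = *std::max_element(logits.begin(), logits.end());
  std::vector<float> out(logits.size());
  float sum = 0.0f;
  for (size_t i = 0; i < logits.size(); ++i) {
    float shifted = logits[i] - max_v;              // shared max-shift
    if (!is_test) shifted = std::max(shifted, kClipMin);  // training-only clip
    out[i] = std::exp(shifted);
    sum += out[i];
  }
  for (float& v : out) v /= sum;  // normalize, as both paths do
  return out;
}

int main() {
  std::vector<float> logits = {1.0f, 2.0f, 3.0f};
  auto p = Softmax(logits, /*is_test=*/true);
  std::cout << p[0] << " " << p[1] << " " << p[2] << "\n";
}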
@@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
     Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
-    math::SoftmaxFunctor<DeviceContext, T>()(
+#ifdef ON_INFER
+    math::SoftmaxFunctor<DeviceContext, T, true>()(
+        context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
+#else
+    math::SoftmaxFunctor<DeviceContext, T, false>()(
         context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
+#endif
   }
 };
......
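The ON_INFER hunk above selects the is_test template argument at compile time, so inference-only builds instantiate just the clip-free functor. A tiny compilable sketch of that dispatch is below; RunSoftmax is a made-up stand-in for the real functor call.

#include <iostream>

template <bool kIsTest>
void RunSoftmax() {
  std::cout << (kIsTest ? "inference softmax (no clip)\n"
                        : "training softmax (with clip)\n");
}

void SoftmaxKernelSketch() {
#ifdef ON_INFER
  RunSoftmax<true>();   // compiled only in inference builds
#else
  RunSoftmax<false>();  // default: training path
#endif
}

int main() { SoftmaxKernelSketch(); }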
@@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
     auto& dev_ctx =
         context.template device_context<platform::CPUDeviceContext>();
-    math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits,
-                                                          softmax);
+    math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
+        dev_ctx, logits, softmax);
     math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
         dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
         context.Attr<int>("ignore_index"));
......
@@ -651,9 +651,9 @@ All parameter, weight, gradient are variables in Paddle.
            [](ir::Pass &self, const std::string &name, const std::string &attr) {
              self.Set<std::string>(name, new std::string(attr));
            })
-      .def("set_int", [](ir::Pass &self, const std::string &name, int val) {
-        self.Set<const int>(name, new int(val));
-      });
+      .def("set_int", [](ir::Pass &self, const std::string &name,
+                         int val) { self.Set<const int>(name, new int(val)); })
+      .def("type", &ir::Pass::Type);
   py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
       m, "PassBuilder");
@@ -792,6 +792,7 @@ All parameter, weight, gradient are variables in Paddle.
           "reduce_strategy",
           [](const BuildStrategy &self) { return self.reduce_; },
           [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.reduce_ = strategy;
           },
           R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
@@ -805,6 +806,7 @@ All parameter, weight, gradient are variables in Paddle.
           [](const BuildStrategy &self) { return self.gradient_scale_; },
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.gradient_scale_ = strategy;
           },
           R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
@@ -816,6 +818,7 @@ All parameter, weight, gradient are variables in Paddle.
           "debug_graphviz_path",
           [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
           [](BuildStrategy &self, const std::string &path) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.debug_graphviz_path_ = path;
           },
           R"DOC(The type is STR, debug_graphviz_path indicate the path that
@@ -825,6 +828,7 @@ All parameter, weight, gradient are variables in Paddle.
           "enable_data_balance",
           [](const BuildStrategy &self) { return self.enable_data_balance_; },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.enable_data_balance_ = b;
           })  // FIXME(chengudo): enable_data_balance seems not important
       .def_property(
@@ -833,6 +837,7 @@ All parameter, weight, gradient are variables in Paddle.
             return self.enable_sequential_execution_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.enable_sequential_execution_ = b;
           },
           R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC")
@@ -842,6 +847,7 @@ All parameter, weight, gradient are variables in Paddle.
             return self.remove_unnecessary_lock_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.remove_unnecessary_lock_ = b;
           },
           R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC")
@@ -851,15 +857,19 @@ All parameter, weight, gradient are variables in Paddle.
             return self.fuse_elewise_add_act_ops_;
           },
           [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
             self.fuse_elewise_add_act_ops_ = b;
           },
           R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
                 to fuse elementwise_add_op and activation_op,
                 it may make the execution faster. Default False)DOC")
-      .def("_create_passes_from_strategy",
+      .def("_finalize_strategy_and_create_passes",
           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
-            return self.CreatePassesFromStrategy();
-          });
+            return self.CreatePassesFromStrategy(true);
+          },
+          R"DOC(Allow user to customized passes. Normally model-specific
+                optimization passes should be defined in this way. BuildStrategy
+                cannot be updated after being finalized.)DOC");
   pe.def(py::init<const std::vector<platform::Place> &,
                   const std::unordered_set<std::string> &,
......
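The pybind hunk above adds an IsFinalized() guard to every BuildStrategy setter and renames the pass-creation binding to _finalize_strategy_and_create_passes, so a strategy can no longer be mutated once its pass list has been materialized. The standalone sketch below mirrors that contract with a made-up StrategySketch class and illustrative pass names; PADDLE_ENFORCE is replaced by a plain exception.

#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

class StrategySketch {
 public:
  void SetFuseElewiseAddAct(bool b) {
    EnforceNotFinalized();
    fuse_elewise_add_act_ = b;
  }
  // Mirrors _finalize_strategy_and_create_passes: build the pass list once,
  // then lock the strategy against further changes.
  std::shared_ptr<std::vector<std::string>> FinalizeAndCreatePasses() {
    finalized_ = true;
    auto passes = std::make_shared<std::vector<std::string>>();
    if (fuse_elewise_add_act_) passes->push_back("fuse_elewise_add_act_pass");
    passes->push_back("multi_devices_pass");  // illustrative default pass
    return passes;
  }
  bool IsFinalized() const { return finalized_; }

 private:
  void EnforceNotFinalized() const {
    if (finalized_) throw std::runtime_error("BuildStrategy is finalized.");
  }
  bool fuse_elewise_add_act_ = false;
  bool finalized_ = false;
};

int main() {
  StrategySketch s;
  s.SetFuseElewiseAddAct(true);
  auto passes = s.FinalizeAndCreatePasses();
  std::cout << passes->size() << " passes\n";
  try {
    s.SetFuseElewiseAddAct(false);  // rejected after finalization
  } catch (const std::exception &e) {
    std::cout << e.what() << "\n";
  }
}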
@@ -156,6 +156,8 @@ function cmake_gen() {
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
+        -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}
+        -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}
         -DPY_VERSION=${PY_VERSION:-2.7}
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
     ========================================
@@ -188,6 +190,8 @@ EOF
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
+        -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\
+        -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\
         -DPY_VERSION=${PY_VERSION:-2.7} \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
@@ -777,6 +781,17 @@ function main() {
       test_fluid_lib
       assert_api_spec_approvals
       ;;
+    assert_api)
+      assert_api_not_changed ${PYTHON_ABI:-""}
+      ;;
+    test_inference)
+      gen_capi_package
+      gen_fluid_lib
+      test_fluid_lib
+      ;;
+    assert_api_approvals)
+      assert_api_spec_approvals
+      ;;
     maccheck)
       cmake_gen ${PYTHON_ABI:-""}
       build_mac
......
@@ -117,10 +117,10 @@ def __bootstrap__():
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
-        'reader_queue_speed_test_mode'
+        'eager_delete_scope', 'use_mkldnn', 'use_ngraph',
+        'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
+        'paddle_num_threads', 'dist_threadpool_size', 'cpu_deterministic',
+        'eager_delete_tensor_gb', 'reader_queue_speed_test_mode'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
......
@@ -6822,7 +6822,7 @@ def prelu(x, mode, param_attr=None, name=None):
     alpha_shape = x.shape
     dtype = helper.input_dtype(input_param_name='x')
     alpha = helper.create_parameter(
-        attr=param_attr,
+        attr=helper.param_attr,
         shape=alpha_shape,
         dtype='float32',
         is_bias=False,
......
@@ -105,7 +105,7 @@ class TestDistRunnerBase(object):
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
         if args.batch_merge_repeat > 1:
-            pass_builder = build_stra._create_passes_from_strategy()
+            pass_builder = build_stra._finalize_strategy_and_create_passes()
             mypass = pass_builder.insert_pass(
                 len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
             mypass.set_int("num_repeats", args.batch_merge_repeat)
......
@@ -94,7 +94,12 @@ class TestPassBuilder(unittest.TestCase):
     def test_parallel_testing_with_new_strategy(self):
         build_strategy = fluid.BuildStrategy()
-        pass_builder = build_strategy._create_passes_from_strategy()
+        self.assertFalse(build_strategy.fuse_elewise_add_act_ops)
+        build_strategy.fuse_elewise_add_act_ops = True
+        pass_builder = build_strategy._finalize_strategy_and_create_passes()
+        self.assertTrue("fuse_elewise_add_act_pass" in
+                        [p.type() for p in pass_builder.all_passes()])
         origin_len = len(pass_builder.all_passes())
         viz_pass = pass_builder.append_pass("graph_viz_pass")
......