提交 9dbd3d5a 编写于 作者: D Dang Qingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantize_transpiler_update

......@@ -69,6 +69,7 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(WITH_INFERENCE "Compile fluid inference library" ON)
option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
......
......@@ -140,5 +140,11 @@ def parse_args():
'--use_lars',
action='store_true',
help='If set, use lars for optimizers, ONLY support resnet module.')
parser.add_argument(
'--reduce_strategy',
type=str,
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
args = parser.parse_args()
return args
......@@ -170,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
strategy = fluid.ExecutionStrategy()
strategy.num_threads = args.cpus
strategy.allow_op_delay = False
build_strategy = fluid.BuildStrategy()
if args.reduce_strategy == "reduce":
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
avg_loss = train_args[0]
if args.update_method == "pserver":
......@@ -184,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
avg_loss.name,
main_program=train_prog,
exec_strategy=strategy,
build_strategy=build_strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
......
......@@ -67,11 +67,14 @@ def cnn_model(data):
def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet.
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
opt = None
data_file_handle = None
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f)
for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
......@@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog):
if is_train:
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize()
opt.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
......
......@@ -207,7 +207,7 @@ def get_model(args, is_train, main_prog, startup_prog):
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
step = int(total_images / (args.batch_size * args.gpus) + 1)
epochs = [30, 60, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
......
......@@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
DOC "Path to TensorRT library.")
# Decide whether TensorRT is usable.
# TENSORRT_FOUND is switched ON only when both the include directory and the
# library were located AND WITH_DSO is enabled. It is forced OFF when either
# path is missing. When both paths exist but WITH_DSO is disabled, the
# variable is left untouched.
if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
  if(WITH_DSO)
    set(TENSORRT_FOUND ON)
  # The optional endif() argument must repeat the opening condition verbatim.
  endif(WITH_DSO)
else()
  set(TENSORRT_FOUND OFF)
endif()
......
# PaddlePaddle Releasing Process
PaddlePaddle manages its branches using "git-flow branching model", and [Semantic Versioning](http://semver.org/) as its version number semantics.
PaddlePaddle manages its branches using Trunk Based Development, and [Semantic Versioning](http://semver.org/) as its version number semantics.
Each time we release a new PaddlePaddle version, we should follow the below steps:
......
......@@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
......@@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg
paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
......@@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -28,10 +28,20 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
if(WITH_GPU)
cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
endif()
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
if(WITH_GPU)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
else()
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
endif()
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
......
......@@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr);
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {
......
......@@ -15,12 +15,15 @@
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1) return;
// The input and output may have dummy vars.
......
......@@ -32,6 +32,10 @@ struct ComputationOpHandle : public OpHandleBase {
std::string Name() const override;
const Scope *GetScope() const { return scope_; }
const platform::Place &GetPlace() const { return place_; }
protected:
void RunImpl() override;
......
......@@ -127,6 +127,9 @@ static const char kLocalScopes[] = "local_scopes";
static const char kStrategy[] = "strategy";
void MultiDevSSAGraphBuilder::Init() const {
all_vars_.clear();
balance_vars_.clear();
loss_var_name_ = Get<const std::string>(kLossVarName);
places_ = Get<const std::vector<platform::Place>>(kPlaces);
local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
......@@ -348,14 +351,31 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
size_t cur_device_id = 0;
bool is_forwarding = true;
bool is_dist_train = false;
for (ir::Node *node : sorted_ops) {
if (boost::get<int>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) {
CreateRPCOp(&result, node);
int op_dev_id = CreateRPCOp(&result, node);
PADDLE_ENFORCE(op_dev_id != -1,
"Can not schedule the RPC operator to the right place.");
if (node->Op()->Type() == "recv") {
auto recv_vars_attr =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient]
if (recv_vars_attr[0].find(".block") == std::string::npos) {
bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
}
}
is_dist_train = true;
} else if (IsDistTrainOp(node, send_vars, recv_vars)) {
CreateDistTrainOp(&result, node);
int op_dev_id = CreateDistTrainOp(&result, node);
if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0];
bcast_var_name_set[op_dev_id].emplace(origin_param_name);
}
} else if (IsScaleLossOp(node)) {
// user can customize loss@grad if not use_default_grad_scale_
if (strategy_.gradient_scale_ !=
......@@ -414,7 +434,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateReduceOp(&result, g_name, cur_device_id);
graph->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(g_name, cur_device_id);
if (!is_dist_train) {
bcast_var_name_set[cur_device_id].emplace(p_name);
}
break;
case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(g_name)) {
......@@ -436,14 +458,19 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
}
}
}
bool use_gpu = false;
#ifdef PADDLE_WITH_CUDA
use_gpu = nccl_ctxs_ != nullptr;
#endif
if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
// Insert BCast Ops
// Principles for inserting broadcast operators:
// 1. Broadcast optimized parameters in the Reduce strategy;
// 2. No need to broadcast optimized parameters in the AllReduce strategy,
// because the optimization sub-graph is run on every GPU;
// 3. Always broadcast received parameters in distributed training.
if ((use_gpu &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
is_dist_train) {
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) {
......@@ -675,7 +702,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var;
}
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const {
int op_dev_id = -1;
std::vector<std::string> input_var_names;
......@@ -719,6 +746,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node->Op()->Type());
CreateComputationalOp(result, node, op_dev_id);
return op_dev_id;
}
void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
......@@ -737,7 +765,7 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
}
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
ir::Node *node) const {
int op_dev_id = -1;
if (node->Op()->Type() == "send") {
......@@ -824,6 +852,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
}
}
return op_dev_id;
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
......
......@@ -40,12 +40,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
size_t device_id) const;
void Init() const;
private:
mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_;
mutable std::vector<Scope *> local_scopes_;
mutable std::unordered_set<std::string> grad_names_;
#ifdef PADDLE_WITH_CUDA
mutable platform::NCCLContextMap *nccl_ctxs_;
#endif
......@@ -54,8 +48,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
bool IsScaleLossOp(ir::Node *node) const;
void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
/**
* Is this operator as the end-point operator before/after send operator.
......@@ -95,13 +89,17 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
private:
void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const;
mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_;
mutable std::vector<Scope *> local_scopes_;
mutable std::unordered_set<std::string> grad_names_;
mutable BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_;
mutable std::vector<int64_t> balance_vars_;
void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const;
};
} // namespace details
} // namespace framework
......
......@@ -27,7 +27,8 @@ namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr);
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1) return;
// the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
namespace details {
// Variable name -> reference count.
using ReferenceCountMap = std::unordered_map<std::string, int>;
// Variable name -> reference count that can be decremented atomically
// (ReferenceCountOpHandle::RunImpl uses fetch_sub on the entries).
using AtomicReferenceCountMap =
std::unordered_map<std::string, std::atomic<int>>;
// Per-device tables. The integer key is indexed with a CUDAPlace's `device`
// field by ReferenceCountPass.
using DeviceReferenceCountMap =
std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
using AtomicDeviceReferenceCountMap =
std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
// Per-device garbage collectors for tensors, keyed the same way.
using DeviceGarbageCollectorMap =
std::unordered_map<int,
std::unique_ptr<GarbageCollector<framework::Tensor>>>;
class ReferenceCountOpHandle : public OpHandleBase {
public:
ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
const platform::CUDAPlace &place,
const std::vector<std::string> &var_names,
GarbageCollector<Tensor> *gc,
AtomicReferenceCountMap *ref_cnts)
: OpHandleBase(node),
scope_(scope),
var_names_(var_names),
gc_(gc),
ref_cnts_(ref_cnts) {
dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
if (IsStreamGarabageCollector()) {
PADDLE_ENFORCE(cudaSetDevice(place.device));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
}
}
~ReferenceCountOpHandle() {
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventDestroy(event_));
}
}
std::string Name() const override { return "reference_count"; }
protected:
void RunImpl() override {
auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
std::vector<LoDTensor *> tensors;
for (auto &name : var_names_) {
auto it = ref_cnts_->find(name);
if (it == ref_cnts_->end()) continue;
auto *var = exec_scope->FindVar(name);
if (var == nullptr || !var->IsType<LoDTensor>()) continue;
if (it->second.fetch_sub(1) <= 1) {
tensors.emplace_back(var->GetMutable<LoDTensor>());
}
}
if (!tensors.empty()) {
ClearTensors(tensors);
}
}
private:
void ClearTensors(const std::vector<LoDTensor *> &tensors) {
auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
if (gc != nullptr) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream = gc->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(tensors, callback_func);
} else {
gc_->Add(tensors);
}
}
bool IsStreamGarabageCollector() const {
return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
}
const Scope *scope_;
platform::CUDADeviceContext *dev_ctx_;
std::vector<std::string> var_names_;
GarbageCollector<Tensor> *gc_; // not own
AtomicReferenceCountMap *ref_cnts_; // not own
cudaEvent_t event_;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass.h"
namespace paddle {
namespace framework {
namespace details {
// Rewrites `graph` for reference-count based tensor release:
//  * fills the per-device table stored under kGlobalReferenceCount with the
//    number of op handles that touch each eligible variable;
//  * appends a ReferenceCountOpHandle right after every GPU computation op
//    that touches at least one eligible variable.
// The tables stored under kCurReferenceCount and kGarbageCollector are only
// forwarded to the created handles; this pass does not modify them.
// Returns the same graph object, modified in place.
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
auto &cur_ref_cnts = Get<AtomicDeviceReferenceCountMap>(kCurReferenceCount);
auto &gcs = Get<DeviceGarbageCollectorMap>(kGarbageCollector);
// It is not easy to find the right reference counts of variables in the graph
// Step 1: Find all variables in computation ops
// Step 2: Find all variables in non-computation ops which refer to variables
// in computation ops
// Names of every variable counted in step 1 (shared across devices).
std::unordered_set<std::string> names;
// Step 1 helper: for a GPU computation op, bumps the count of each eligible
// variable in `vars` and returns their names. Returns an empty list for
// non-computation ops and for computation ops that are not on a GPU place.
auto get_ref_cnts_from_compute_op = [&](
const std::unique_ptr<OpHandleBase> &op,
const std::vector<VarHandleBase *> &vars) {
std::vector<std::string> var_names_in_op;
auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
if (compute_op == nullptr ||
!platform::is_gpu_place(compute_op->GetPlace()))
return var_names_in_op;
auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
for (VarHandleBase *var_handle_base : vars) {
auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
// Only variables that live on the same CUDA place as the op are counted.
if (!platform::is_gpu_place(var_handle->place_) ||
boost::get<platform::CUDAPlace>(var_handle->place_) != place)
continue;
VarDesc *var_desc = var_handle->Node()->Var();
auto var_name = var_handle->Node()->Name();
// This is weird but there really are some variables without var_desc
// in computation_op
if (var_desc == nullptr) {
// Without a VarDesc, accept the variable only if the op's block knows it.
if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
continue;
} else {
// Persistable variables and non-LoDTensor variables are never counted.
if (var_desc->Persistable() ||
var_desc->Proto()->type().type() != proto::VarType::LOD_TENSOR)
continue;
}
// compute op only runs in one device
// NOTE(review): ref_cnts[place.device] is dereferenced without checking
// that an entry exists for this device (the step 2 helper below does check)
// - presumably the caller pre-populates every device; confirm.
if (ref_cnts[place.device]->count(var_name))
++(*ref_cnts[place.device])[var_name];
else
(*ref_cnts[place.device])[var_name] = 1;
names.insert(var_name);
var_names_in_op.push_back(var_name);
}
return var_names_in_op;
};
// Step 2 helper: for a non-computation op, adds one more reference to every
// GPU variable in `vars` that step 1 already counted on that device.
auto update_ref_cnts_from_non_compute_op = [&](
const std::unique_ptr<OpHandleBase> &op,
const std::vector<VarHandleBase *> &vars) {
if (dynamic_cast<ComputationOpHandle *>(op.get()) != nullptr) return;
for (VarHandleBase *var_handle_base : vars) {
auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
auto var_name = var_handle->Node()->Name();
auto var_place = var_handle->place_;
if (!platform::is_gpu_place(var_place)) continue;
auto place = boost::get<platform::CUDAPlace>(var_place);
if (names.count(var_name) == 0) continue;
if (ref_cnts.count(place.device) &&
ref_cnts[place.device]->count(var_name)) {
++(*ref_cnts[place.device])[var_name];
}
}
};
// Computation op -> the reference count handle created for it.
std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
compute_ref_cnt_map;
auto &all_ops = graph->Get<GraphOps>(kGraphOps);
for (auto &op : all_ops) {
auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
if (in_var_names.empty() && out_var_names.empty()) continue;
// From here on in_var_names holds inputs followed by outputs.
in_var_names.insert(in_var_names.end(), out_var_names.begin(),
out_var_names.end());
// A non-empty result implies `op` is a GPU computation op (see the helper).
auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
ir::Node *ref_cnt_node =
graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
auto *ref_cnt_handle = new ReferenceCountOpHandle(
ref_cnt_node, compute_op->GetScope(), place, in_var_names,
gcs[place.device].get(), cur_ref_cnts[place.device].get());
// Link the new handle to the computation op through a control dependency
// variable: output of the computation op, input of the new handle.
auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
compute_op->AddOutput(dep_var);
ref_cnt_handle->AddInput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
compute_ref_cnt_map[compute_op] = ref_cnt_handle;
}
// Step 2 runs only after step 1 has seen every op, because it depends on the
// complete `names` set.
for (auto &op : all_ops) {
update_ref_cnts_from_non_compute_op(op, op->Inputs());
update_ref_cnts_from_non_compute_op(op, op->Outputs());
}
// Rebuild the op list with each reference count handle placed immediately
// after its computation op.
std::vector<std::unique_ptr<OpHandleBase>> new_all_ops;
new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
for (auto &op : all_ops) {
new_all_ops.emplace_back(std::move(op));
auto it = compute_ref_cnt_map.find(new_all_ops.back().get());
if (it != compute_ref_cnt_map.end()) {
// The unique_ptr constructed here takes ownership of the raw handle.
new_all_ops.emplace_back(it->second);
}
}
all_ops.swap(new_all_ops);
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(reference_count_pass,
paddle::framework::details::ReferenceCountPass)
.RequirePassAttr(paddle::framework::details::kGlobalReferenceCount)
.RequirePassAttr(paddle::framework::details::kCurReferenceCount)
.RequirePassAttr(paddle::framework::details::kGarbageCollector);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
// Names of the pass attributes that ReferenceCountPass reads. All three are
// declared as required when the pass is registered.
// kGlobalReferenceCount: DeviceReferenceCountMap filled by the pass.
constexpr char kGlobalReferenceCount[] = "reference_count";
// kCurReferenceCount: AtomicDeviceReferenceCountMap handed to the created
// ReferenceCountOpHandle instances.
constexpr char kCurReferenceCount[] = "current_reference_count";
// kGarbageCollector: DeviceGarbageCollectorMap handed to the created
// ReferenceCountOpHandle instances.
constexpr char kGarbageCollector[] = "garbage_collector";
// Graph pass that computes per-device variable reference counts and inserts
// ReferenceCountOpHandle nodes into the graph (see ApplyImpl).
class ReferenceCountPass : public ir::Pass {
protected:
// Applies the pass to `graph` and returns the modified graph.
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(1) << place_ << "RUN Scale loss grad op";
VLOG(10) << place_ << "RUN Scale loss grad op";
});
#endif
}
......
......@@ -18,6 +18,9 @@
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#endif
namespace paddle {
namespace framework {
......@@ -65,12 +68,28 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
#ifdef PADDLE_WITH_CUDA
const std::string gc_name = "garbage_collector";
DeviceGarbageCollectorMap *gc =
Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
: nullptr;
#endif
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
drop_scope_counter_ = 0;
// Wait All computational streams
for (auto p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
#ifdef PADDLE_WITH_CUDA
if (gc != nullptr && platform::is_gpu_place(p)) {
auto gpu_place = boost::get<platform::CUDAPlace>(p);
auto &gc_at_place = gc->at(gpu_place.device);
gc_at_place->Wait();
gc_at_place->Reset();
}
#endif
}
for (auto &scope : local_scopes_) {
auto &local_scope =
......
......@@ -37,7 +37,11 @@ int kProgramId = -1;
ExecutorPrepareContext::ExecutorPrepareContext(
const framework::ProgramDesc& prog, size_t block_id)
: prog_(prog), block_id_(block_id) {}
: prog_(prog), block_id_(block_id) {
if (GetEagerDeletionThreshold() >= 0) {
ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
}
}
ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext";
......@@ -329,15 +333,81 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
}
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc;
if (max_memory_size >= 0) {
ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
gc.reset(new DefaultStreamGarbageCollector<Tensor>(
boost::get<platform::CUDAPlace>(place_), max_memory_size));
} else {
#endif
gc.reset(new CPUGarbageCollector<Tensor>(
boost::get<platform::CPUPlace>(place_), max_memory_size));
#ifdef PADDLE_WITH_CUDA
}
#endif
}
for (auto& op : ctx->ops_) {
op->Run(*local_scope, place_);
if (gc != nullptr) {
std::vector<std::string> erase_vars;
for (auto& input : op->Inputs()) {
for (auto& input_name : input.second) {
auto it = ctx->cur_ref_cnts_.find(input_name);
if (it == ctx->cur_ref_cnts_.end()) continue;
if (it->second == 1) { // should delete it
erase_vars.emplace_back(input_name);
ctx->cur_ref_cnts_.erase(input_name);
} else {
--(it->second);
}
}
}
for (auto& output : op->Outputs()) {
for (auto& output_name : output.second) {
auto it = ctx->cur_ref_cnts_.find(output_name);
if (it == ctx->cur_ref_cnts_.end()) continue;
if (it->second == 1) {
erase_vars.emplace_back(output_name);
ctx->cur_ref_cnts_.erase(output_name);
} else {
--(it->second);
}
}
}
if (!erase_vars.empty()) {
std::vector<framework::LoDTensor*> erase_tensors;
for (auto& name : erase_vars) {
auto* var = local_scope->FindVar(name);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
erase_tensors.push_back(tensor);
}
}
if (!erase_tensors.empty()) gc->Add(erase_tensors);
}
}
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
}
if (gc != nullptr) {
gc->Wait();
} else {
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
if (local_scope != scope) {
scope->DeleteScope(local_scope);
} else {
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
......@@ -27,13 +28,58 @@ namespace paddle {
namespace framework {
extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
// Counts how many times each variable name of block `block_id` in `prog` is
// listed as an input or an output of the block's ops.
//
// Names of variables declared in the block that are persistable, or whose
// type is not LOD_TENSOR, are excluded from the result. Every other name that
// appears in an op's input/output lists is counted once per appearance.
//
// T is the counter type; it must support operator++ and assignment from 1.
template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
    const ProgramDesc& prog, size_t block_id) {
  auto& block = prog.Block(block_id);
  std::unordered_set<std::string> ignored_vars;
  std::unordered_map<std::string, T> ref_cnts;

  for (auto var_desc : block.AllVars()) {
    auto type = var_desc->Proto()->type().type();
    if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
      // ignore persistable vars and vars that are not LoD tensors
      ignored_vars.insert(var_desc->Name());
    }
  }

  // Bumps the counter of `name` unless it is ignored. A single find() replaces
  // the original count() + operator[] pair on the already-present path.
  auto add_reference = [&ignored_vars, &ref_cnts](const std::string& name) {
    if (ignored_vars.count(name)) return;
    auto it = ref_cnts.find(name);
    if (it != ref_cnts.end()) {
      ++(it->second);
    } else {
      ref_cnts[name] = 1;
    }
  };

  for (auto op_desc : block.AllOps()) {
    for (auto& input : op_desc->Inputs()) {
      for (const auto& input_name : input.second) {
        add_reference(input_name);
      }
    }

    for (auto& output : op_desc->Outputs()) {
      // Iterate by const reference: the original copied every output name.
      for (const auto& output_name : output.second) {
        add_reference(output_name);
      }
    }
  }
  return ref_cnts;
}
// State prepared for running one block of a program with Executor.
// The executor's run loop iterates over ops_ and, when garbage collection is
// enabled, decrements/erases entries of cur_ref_cnts_ after each op.
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
~ExecutorPrepareContext();
// Restores the working counters by copying ref_cnts_ over cur_ref_cnts_.
void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
// Program and block index this context was prepared for.
const framework::ProgramDesc& prog_;
size_t block_id_;
// Operators executed, in order, by the executor's run loop.
std::vector<std::unique_ptr<OperatorBase>> ops_;
// Reference count per variable name; presumably filled by the constructor
// (definition not shown here) — TODO confirm.
std::unordered_map<std::string, int> ref_cnts_;
// Working copy of ref_cnts_ that is consumed while the ops run.
std::unordered_map<std::string, int> cur_ref_cnts_;
};
class Executor {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <deque>
#include <functional>
#include <memory>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
// Base class of the garbage collectors that batch up objects and clear them
// once enough memory has accumulated.
// T must provide memory_size() and clear().
template <typename T>
class GarbageCollector {
 public:
  // `max_memory_size` is the accumulated-size threshold that triggers a
  // clear; it is clamped to at least 1.
  GarbageCollector(const platform::Place &place, size_t max_memory_size)
      : dev_ctx_(platform::DeviceContextPool::Instance().Get(place)),
        garbages_(new std::deque<T *>()),
        max_memory_size_(std::max(max_memory_size, static_cast<size_t>(1))) {}

  virtual ~GarbageCollector() {}

  // Drops every queued object (without clearing it) and zeroes the
  // accumulated size.
  void Reset() {
    std::lock_guard<std::mutex> lock(mutex_);
    garbages_.reset(new std::deque<T *>());
    cur_memory_size_ = 0;
  }

  // Queues `objs` with a no-op callback.
  template <typename Container>
  void Add(const Container &objs) {
    Add(objs, []() {});
  }

  // Queues every pointer in `objs`. If the accumulated memory_size() reaches
  // the threshold, the whole queue is detached, `callback` is invoked, and a
  // closure clearing each detached object is handed to ClearCallback().
  template <typename Container, typename Callback>
  void Add(const Container &objs, Callback &&callback) {
    std::shared_ptr<std::deque<T *>> detached;
    {
      std::lock_guard<std::mutex> lock(mutex_);
      for (auto *item : objs) {
        garbages_->push_back(item);
        cur_memory_size_ += item->memory_size();
      }
      if (cur_memory_size_ >= max_memory_size_) {
        cur_memory_size_ = 0;
        detached = garbages_;
        garbages_.reset(new std::deque<T *>());
      }
    }

    // Threshold not reached: nothing to clear yet.
    if (detached == nullptr) return;

    callback();
    // The closure keeps the detached queue alive until it has run.
    ClearCallback([detached]() {
      for (auto *item : *detached) item->clear();
    });
  }

  // Default: nothing to wait for. Subclasses may override.
  virtual void Wait() const {}

 protected:
  // Runs (or schedules) `callback`, which performs the actual clearing.
  virtual void ClearCallback(const std::function<void()> &callback) = 0;

  platform::DeviceContext *dev_ctx_;
  std::shared_ptr<std::deque<T *>> garbages_;
  mutable std::mutex mutex_;
  const size_t max_memory_size_;
  size_t cur_memory_size_ = 0;
};
// Garbage collector for CPU places: the clearing closure is executed
// immediately on the calling thread.
template <typename T>
class CPUGarbageCollector : public GarbageCollector<T> {
public:
CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
: GarbageCollector<T>(place, max_memory_size) {}
protected:
// Invokes `callback` synchronously.
void ClearCallback(const std::function<void()> &callback) override {
callback();
}
};
#ifdef PADDLE_WITH_CUDA
// Garbage collector for a CUDA place that hands its clearing closures to the
// CUDADeviceContext obtained for that place.
template <typename T>
class DefaultStreamGarbageCollector : public GarbageCollector<T> {
 public:
  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
                                size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {}

  // Stream of the underlying CUDA device context.
  cudaStream_t stream() const {
    const auto *cuda_ctx =
        static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_);
    return cuda_ctx->stream();
  }

  // Waits on the device context first, then on its stream callbacks.
  void Wait() const override {
    this->dev_ctx_->Wait();
    const auto *cuda_ctx =
        static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_);
    cuda_ctx->WaitStreamCallback();
  }

 protected:
  // Registers `callback` as a stream callback on the device context.
  void ClearCallback(const std::function<void()> &callback) override {
    auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(this->dev_ctx_);
    cuda_ctx->AddStreamCallback(callback);
  }
};
// Garbage collector for a CUDA place that creates its own CUDA stream and
// schedules clearing closures through a StreamCallbackManager bound to it.
template <typename T>
class StreamGarbageCollector : public GarbageCollector<T> {
public:
// Selects the device of `place`, creates stream_, and builds the callback
// manager on top of that stream.
StreamGarbageCollector(const platform::CUDAPlace &place,
size_t max_memory_size)
: GarbageCollector<T>(place, max_memory_size) {
PADDLE_ENFORCE(cudaSetDevice(place.device));
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
}
// Synchronizes and destroys stream_ on the device it was created for.
// NOTE(review): if PADDLE_ENFORCE throws on failure, an exception escaping
// this destructor would terminate the program — confirm this is intended.
~StreamGarbageCollector() {
auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
PADDLE_ENFORCE(cudaSetDevice(place.device));
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
}
// Synchronizes stream_, then waits on the callback manager while holding
// the base-class mutex.
void Wait() const override {
PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
std::lock_guard<std::mutex> guard(this->mutex_);
callback_manager_->Wait();
}
// The stream owned by this collector.
cudaStream_t stream() const { return stream_; }
protected:
// Adds `callback` to the callback manager under the base-class mutex.
void ClearCallback(const std::function<void()> &callback) override {
std::lock_guard<std::mutex> guard(this->mutex_);
callback_manager_->AddCallback(callback);
}
private:
cudaStream_t stream_;
std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
};
#endif
} // namespace framework
} // namespace paddle
......@@ -58,7 +58,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
desc.SetOutput("Out", std::vector<std::string>({conv_relu_out}));
desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
desc.SetType("conv2d");
for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second);
......
......@@ -94,6 +94,14 @@ class Graph {
};
}
// Stores `attr` under `attr_name` and records a no-op deleter for it.
// The PADDLE_ENFORCE check fails if an attribute with that name is already
// present in attrs_.
// NOTE(review): the no-op deleter presumably means the graph never frees
// `attr`, so the caller must keep it alive — confirm against the code that
// invokes attr_dels_.
template <typename AttrType>
void SetNotOwned(const std::string &attr_name, AttrType *attr) {
PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
attr_name);
attrs_[attr_name] = attr;
attr_dels_[attr_name] = []() {};
}
const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }
// Create a normal variable with non-null VarDesc.
......
......@@ -429,7 +429,7 @@ struct LSTM : public PatternBase {
struct GRU : public PatternBase {
GRU(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "lstm") {}
: PatternBase(pattern, name_scope, "gru") {}
PDNode* operator()(PDNode* x);
......
......@@ -188,6 +188,30 @@ ParallelExecutor::ParallelExecutor(
main_program, member_->places_, loss_var_name, params,
member_->local_scopes_, member_->use_cuda_, build_strategy,
member_->nccl_ctxs_.get());
auto max_memory_size = GetEagerDeletionThreshold();
if (max_memory_size >= 0) {
for (auto &place : member_->places_) {
if (!platform::is_gpu_place(place)) continue;
auto gpu_place = boost::get<platform::CUDAPlace>(place);
if (gcs_[gpu_place.device] == nullptr) {
ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap());
cur_ref_cnts_[gpu_place.device].reset(
new details::AtomicReferenceCountMap());
gcs_[gpu_place.device].reset(
new StreamGarbageCollector<Tensor>(gpu_place, max_memory_size));
}
}
if (!gcs_.empty()) {
auto ref_cnt_pass =
ir::PassRegistry::Instance().Get("reference_count_pass");
ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
graph = ref_cnt_pass->Apply(std::move(graph));
graph->SetNotOwned("garbage_collector", &gcs_);
}
}
#else
std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
main_program, member_->places_, loss_var_name, params,
......@@ -209,30 +233,9 @@ ParallelExecutor::ParallelExecutor(
void ParallelExecutor::BCastParamsToDevices(
const std::unordered_set<std::string> &vars) const {
// the initializing bcast, all vars would be bcast from device(0),
// otherwise
// bcast from the specified device.
bool initializing = member_->executor_ ? false : true;
// the initializing bcast, all vars would be bcast from device(0).
for (auto &var : vars) {
int var_dev_id = -1;
if (member_->executor_) {
auto &sharded_var_device =
member_->executor_->Graph().Get<details::ShardedVarDevice>(
details::kShardedVarDevice);
if (sharded_var_device.find(var) != sharded_var_device.end()) {
var_dev_id = sharded_var_device.at(var);
}
}
if (!initializing && var_dev_id == -1) continue;
framework::Variable *main_var = nullptr;
if (initializing) {
main_var = member_->local_scopes_[0]->FindVar(var);
} else {
main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
}
framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
continue;
}
......@@ -248,8 +251,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto place = member_->places_[i];
void *buffer;
if ((initializing && i == 0) ||
(!initializing && static_cast<int>(i) == var_dev_id)) {
if (i == 0) {
buffer = const_cast<void *>(main_tensor.data<void>());
} else {
auto local_scope = member_->local_scopes_[i];
......@@ -266,29 +268,18 @@ void ParallelExecutor::BCastParamsToDevices(
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
if (initializing) {
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
} else {
if (var_dev_id >= 0) {
platform::dynload::ncclBcast(buffers[i], numel, data_type,
var_dev_id, nccl_ctx.comm_,
nccl_ctx.stream());
}
}
}
member_->nccl_ctxs_->WaitAll();
}
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {
platform::CPUPlace cpu;
for (size_t i = 0; i < member_->places_.size(); ++i) {
if ((initializing && i == 0) ||
(!initializing && static_cast<int>(i) == var_dev_id))
continue;
if (i == 0) continue;
auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
......@@ -310,6 +301,11 @@ void ParallelExecutor::BCastParamsToDevices(
void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
platform::RecordBlock b(0);
#ifdef PADDLE_WITH_CUDA
if (!gcs_.empty()) {
ResetReferenceCount();
}
#endif
auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data;
......@@ -367,3 +363,6 @@ USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
#ifdef PADDLE_WITH_CUDA
USE_PASS(reference_count_pass);
#endif
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <paddle/fluid/framework/details/build_strategy.h>
#include <atomic>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/execution_strategy.h"
......@@ -27,6 +29,10 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/reference_count_pass.h"
#endif
namespace paddle {
namespace framework {
......@@ -66,10 +72,27 @@ class ParallelExecutor {
void Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name);
private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
private:
ParallelExecutorPrivate *member_;
#ifdef PADDLE_WITH_CUDA
// ref_cnts_ is initialized only once, when the ParallelExecutor is
// constructed, and stays unchanged afterwards.
// Before each iteration, cur_ref_cnts_ is reset to ref_cnts_.
details::DeviceReferenceCountMap ref_cnts_;
details::AtomicDeviceReferenceCountMap cur_ref_cnts_;
details::DeviceGarbageCollectorMap gcs_;
// Copies every (variable, count) entry of ref_cnts_ into the map stored in
// cur_ref_cnts_ under the same device key.
void ResetReferenceCount() {
  for (auto &device_entry : ref_cnts_) {
    const auto &device_key = device_entry.first;
    for (auto &var_entry : *(device_entry.second)) {
      auto &working_counts = *(cur_ref_cnts_[device_key]);
      working_counts[var_entry.first] = var_entry.second;
    }
  }
}
#endif
};
} // namespace framework
......
......@@ -31,9 +31,21 @@ DEFINE_bool(
"Delete local scope eagerly. It will reduce GPU memory usage but "
"slow down the destruction of variables.(around 1% performance harm)");
// Threshold flag read by GetEagerDeletionThreshold(). The two adjacent string
// literals are concatenated by the compiler, so the first one must end with a
// space to keep the sentences apart in the help text.
DEFINE_double(
    eager_delete_tensor_gb, -1.0,
    "Memory size threshold (GB) when the garbage collector clear tensors. "
    "Disabled when this value is less than 0");
namespace paddle {
namespace framework {
// Converts FLAGS_eager_delete_tensor_gb (gigabytes) into a byte count.
// Returns -1 when the flag is negative.
int64_t GetEagerDeletionThreshold() {
  if (FLAGS_eager_delete_tensor_gb < 0) {
    return -1;
  }
  const int64_t bytes_per_gb = static_cast<int64_t>(1) << 30;
  return static_cast<int64_t>(FLAGS_eager_delete_tensor_gb * bytes_per_gb);
}
Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const {
......
......@@ -26,6 +26,8 @@ limitations under the License. */
namespace paddle {
namespace framework {
// Returns the eager-deletion threshold in bytes, computed from
// FLAGS_eager_delete_tensor_gb, or -1 when that flag is negative.
int64_t GetEagerDeletionThreshold();
class Scope;
/**
......
......@@ -151,6 +151,8 @@ class Tensor {
void set_layout(const DataLayout layout) { layout_ = layout; }
// Resets holder_ to nullptr. Presumably this releases the tensor's memory
// once no other owner of the holder remains — TODO confirm holder_'s type.
void clear() { holder_ = nullptr; }
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a template
......
......@@ -17,9 +17,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
# paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
#if(APPLE)
add_subdirectory(api)
#endif()
add_subdirectory(api)
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
......@@ -57,5 +55,7 @@ endif()
if(WITH_TESTING)
# tests/book depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book)
if(WITH_INFERENCE_API_TEST)
add_subdirectory(tests/api)
endif()
endif()
......@@ -69,8 +69,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
if (FLAGS_IA_enable_tensorrt_subgraph_engine) {
auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
"depthwise_conv2d", "batch_norm", "concat"});
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh",
"elementwise_add", "dropout"});
if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node);
......
......@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN
"conv_relu_mkldnn_fuse_pass", //
#endif
}};
std::unordered_set<std::string> disabled_ir_passes_;
......
......@@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
}
return false;
};
for (auto &node : graph) {
for (auto *in : node->inlinks) {
// The Value that is written by nodes inside a sub-graph shouldn't be the
......@@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::vector<Node *>(outputs.begin(), outputs.end()));
}
// Filter the Intermediate results of the subgraph node.
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
std::vector<Node *> op_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
......@@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
for (auto *out : op_nodes[i]->outlinks) {
if (follow_up_input_names.count(out->name())) {
filtered_subgraph_outlinks.push_back(out);
} else {
out->SetDeleted();
}
}
PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
// The filtered_subgraph_outlinks may be empty.
op_nodes[i]->outlinks = filtered_subgraph_outlinks;
}
}
......
......@@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
// collect inputs
std::unordered_set<std::string> input_names;
std::unordered_set<std::string> input_names_with_id;
for (auto *x : func->inlinks) {
input_names.insert(x->name());
input_names_with_id.insert(x->name() + std::to_string(x->id()));
}
desc.SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::unordered_set<std::string> output_names;
std::unordered_set<std::string> output_names_with_id;
for (auto *x : func->outlinks) {
output_names.insert(x->name());
output_names_with_id.insert(x->name() + std::to_string(x->id()));
}
std::vector<std::string> output_temp(output_names.begin(),
output_names.end());
desc.SetOutput("Ys", output_temp);
desc.SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
desc.SetType("tensorrt_engine");
std::unordered_map<std::string, std::string> output_name_map;
......@@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) {
std::string arg_value = in_var->arguments(k);
if (input_names.count(arg_value)) {
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value +
std::to_string(var2id[arg_value]));
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
......@@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
if (output_names.count(arg_value)) {
output_name_map[arg_value] =
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id.count(arg_value_with_id)) {
output_name_map[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
......
......@@ -74,13 +74,141 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
}
// Lightweight mirror of a graph Node used during TRT sub-graph analysis.
// A BriefNode only points at the Node it stands for and keeps its own
// inlinks/outlinks, so edges can be rewired without touching the original
// graph.
struct BriefNode {
  explicit BriefNode(Node *n) : node(n) {}

  Node *node;
  std::vector<BriefNode *> inlinks;
  std::vector<BriefNode *> outlinks;
};
// Contracts two adjacent BriefNodes, src -> dst, into src.
//
// Steps:
//   1. add all inputs of dst (except src) to src's inlinks;
//   2. add all outputs of dst to src's outlinks (dst itself is dropped);
//   3. clear dst's inlinks and outlinks;
//   4. in every neighbour of the merged node, redirect links that pointed at
//      dst so that they point at src.
//
// `node_map` maps node id -> BriefNode; both ids must be present
// (std::unordered_map::at throws std::out_of_range otherwise).
void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
                          int src_id, int dst_id) {
  BriefNode *src_node = node_map.at(src_id);
  BriefNode *dst_node = node_map.at(dst_id);

  // Sets de-duplicate neighbours shared by src and dst.
  std::unordered_set<BriefNode *> inputs(src_node->inlinks.begin(),
                                         src_node->inlinks.end());
  std::unordered_set<BriefNode *> outputs;

  for (auto *n : src_node->outlinks) {
    if (n != dst_node) outputs.insert(n);
  }

  // Add the inlinks and outlinks of dst node to src node. dst's vectors are
  // only read here, so no defensive copies are needed.
  for (BriefNode *node : dst_node->inlinks) {
    if (node != src_node) {
      inputs.insert(node);
    }
  }
  for (BriefNode *node : dst_node->outlinks) {
    outputs.insert(node);
  }

  // Update the dst and src node's inlinks and outlinks. assign() replaces the
  // former "std::move(temporary)" form, which needed a separate
  // #ifdef __clang__ branch with otherwise identical code.
  src_node->inlinks.assign(inputs.begin(), inputs.end());
  src_node->outlinks.assign(outputs.begin(), outputs.end());
  dst_node->inlinks.clear();
  dst_node->outlinks.clear();

  // Rewrites every reference to dst in `nodes` to reference src instead.
  auto redirect_to_src = [src_node, dst_node](std::vector<BriefNode *> &nodes) {
    for (auto *&n : nodes) {
      if (n == dst_node) {
        n = src_node;
      }
    }
  };

  // Change all the dst inputs and outputs corresponding inlink and
  // outlink to the src node.
  for (auto *node : src_node->inlinks) {
    redirect_to_src(node->outlinks);
  }
  for (auto *node : src_node->outlinks) {
    redirect_to_src(node->inlinks);
  }
}
// FlexibleDFS
// Iterative depth-first traversal that starts from every node in `source`.
// If reverse is true, the traversal follows inlinks; otherwise outlinks.
// If enter is not null, enter(node) is called the first time node is reached,
// before its neighbours are pushed. Returning false stops the whole traversal.
// If leave is not null, leave(node) is called once the stack entries pushed
// for node's neighbours have been processed. Returning false stops the whole
// traversal.
void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse,
                 const std::function<bool(const BriefNode *)> &enter,
                 const std::function<bool(const BriefNode *)> &leave) {
  // Stack entry: `leave == true` marks the deferred leave() call for `node`.
  struct FNode {
    const BriefNode *node;
    bool leave;
  };

  std::vector<FNode> stack;
  for (auto &node : source) {
    stack.push_back(FNode{node, false});
  }

  std::unordered_set<const BriefNode *> visited;
  while (!stack.empty()) {
    auto fnode = stack.back();
    stack.pop_back();

    if (fnode.leave) {
      if (leave && !leave(fnode.node)) return;
      // A leave marker is only pushed for an already-visited node, so there
      // is nothing else to do for this entry.
      continue;
    }
    if (visited.count(fnode.node)) continue;
    visited.insert(fnode.node);

    if (enter && !enter(fnode.node)) return;
    // Push the marker below the neighbours so it is popped after them.
    if (leave) stack.push_back(FNode{fnode.node, true});

    // Bind by reference: the original copied the adjacency vector on every
    // visited node.
    const std::vector<BriefNode *> &next_nodes =
        reverse ? fnode.node->inlinks : fnode.node->outlinks;
    for (const BriefNode *node : next_nodes) {
      if (!visited.count(node)) {
        stack.push_back(FNode{node, false});
      }
    }
  }
}
std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
// Run the Extract algorithm to find all subgraphs.
std::vector<Node *> marked_nodes;
// We use brief_node_map to represent the original graph in order to avoid
// changing the original graph.
std::unordered_map<int, BriefNode *> brief_node_map;
for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
brief_node_map[node.id()] = new BriefNode(&node);
if (node.attr(kMarkerAttrName).Bool()) {
marked_nodes.push_back(&node);
}
}
// extract sub-graphs in the marked node set, use Union Find algorithm.
node_map_t node_map; // id to ptr
for (auto *n : marked_nodes) {
......@@ -88,11 +216,73 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
n->attr(kUnionFindParent).Int32() = n->id();
node_map[n->id()] = n;
}
std::unordered_set<Node *> visited;
for (auto *n : marked_nodes) {
for (auto *out : n->outlinks) {
if (node_map.count(out->id())) {
UnionFindCombine(node_map, n->id(), out->id());
// create brief node map
for (auto &itr : brief_node_map) {
for (Node *node : itr.second->node->inlinks) {
itr.second->inlinks.push_back(brief_node_map[node->id()]);
}
for (Node *node : itr.second->node->outlinks) {
itr.second->outlinks.push_back(brief_node_map[node->id()]);
}
}
for (auto &itr : brief_node_map) {
BriefNode *brief_node = itr.second;
if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
continue;
}
// Our algorithm must guarantee that:
// 1. The graph is always directed acyclic graph(DAG).
// 2. If there is a path in the subgraph from X to Y (X and Y are both
// nodes in the subgraph), then all paths from X to Y are in the
// subgraph.
//
// In order to achieve the above guarantee.
// For adjacent nodes src -> dst.
// 1. Get all dst input nodes except src.
// 2. Reverse DFS from those input nodes
// 3. If there is a path from input nodes to src,
// then the src and dst nodes can not be fused into one node,
// otherwise it can be done.
while (true) {
std::unordered_set<BriefNode *> contract_nodes;
for (auto *out : brief_node->outlinks) {
// must be an trt candidate
if (!out->node->attr(kMarkerAttrName).Bool()) continue;
// get all dst input nodes except src.
std::vector<BriefNode *> source_nodes;
for (auto *n : out->inlinks) {
if (n != brief_node) {
source_nodes.push_back(n);
}
}
// Reverse DFS from the source_nodes.
bool have_excess_path = false;
FlexibleDFS(source_nodes, true, nullptr,
[&have_excess_path, brief_node](const BriefNode *n) {
if (n == brief_node) {
have_excess_path = true;
return false;
}
return true;
});
if (have_excess_path) continue;
contract_nodes.insert(out);
}
if (contract_nodes.empty()) break;
for (auto dst_node : contract_nodes) {
UnionFindCombine(node_map, brief_node->node->id(),
dst_node->node->id());
UnionContractedNodes(brief_node_map, brief_node->node->id(),
dst_node->node->id());
}
}
}
......@@ -128,6 +318,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto io = ExtractInputAndOutputOfSubGraph(subgraph);
block_node->inlinks = std::move(io.first);
block_node->outlinks = std::move(io.second);
for (auto *node : subgraph) {
// TODO(Superjomn) need a unified mechanism to treat deleted node in each
// pass.
......
......@@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) {
// At least one nodes should be deleted.
ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock
ASSERT_EQ(6, count1);
ASSERT_EQ(11, count1);
}
} // namespace analysis
......
......@@ -69,25 +69,4 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
endfunction()
anakin_target(inference_anakin_api)
anakin_target(inference_anakin_api_shared)
if (WITH_TESTING)
# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
if(WITH_GPU)
set(anakin_test_extra_deps dynload_cuda)
set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
endif()
cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
endif(WITH_TESTING)
endif()
......@@ -77,6 +77,9 @@ bool AnalysisPredictor::Init(
OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get());
......
......@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -64,13 +64,15 @@ PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
if (length_ >= length) return;
if (memory_owned_) {
Free();
}
data_ = new char[length];
data_ = malloc(length);
length_ = length;
memory_owned_ = true;
} else {
PADDLE_THROW("The memory is allocated externally, can not Resized");
}
}
void PaddleBuf::Reset(void* data, size_t length) {
......@@ -82,8 +84,8 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
delete[] static_cast<char*>(data_);
PADDLE_ENFORCE_GT(length_, 0);
free(static_cast<char*>(data_));
data_ = nullptr;
length_ = 0;
}
......
......@@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init(
}
ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0);
......
......@@ -153,11 +153,21 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
} // namespace paddle
USE_TRT_CONVERTER(elementwise_add_weight);
USE_TRT_CONVERTER(elementwise_add_tensor);
USE_TRT_CONVERTER(elementwise_sub_tensor);
USE_TRT_CONVERTER(elementwise_div_tensor);
USE_TRT_CONVERTER(elementwise_mul_tensor);
USE_TRT_CONVERTER(elementwise_max_tensor);
USE_TRT_CONVERTER(elementwise_min_tensor);
USE_TRT_CONVERTER(elementwise_pow_tensor);
USE_TRT_CONVERTER(mul);
USE_TRT_CONVERTER(conv2d);
USE_TRT_CONVERTER(relu);
USE_TRT_CONVERTER(sigmoid);
USE_TRT_CONVERTER(tanh);
USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER(pool2d);
USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER(concat);
USE_TRT_CONVERTER(dropout);
......@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
}
void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) {
double latency, int epoch = 1) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======";
if (epoch > 1) {
int samples = batch_size * epoch;
LOG(INFO) << "====== sample number: " << samples
<< ", average latency of each sample: " << latency / samples
<< "ms ======";
}
}
} // namespace inference
......
......@@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
......@@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config {
bool use_gpu{false};
int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
// NOTE: NOT use it, just for the internal test, will discard later
bool _use_mkldnn{false};
// Specify the variable's name of each input.
bool specify_input_name{false};
......
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......@@ -24,6 +24,8 @@ nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
......@@ -19,28 +19,64 @@ namespace paddle {
namespace inference {
namespace tensorrt {
class ReluOpConverter : public OpConverter {
class ActivationOpConverter : public OpConverter {
public:
ReluOpConverter() {}
ActivationOpConverter() {}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework::OpDesc op_desc(op, nullptr);
LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
"type is Relu";
LOG(INFO)
<< "convert a fluid Activation op to tensorrt activation layer whose "
"type is "
<< op_type_;
const nvinfer1::ITensor* input_tensor =
engine_->GetITensor(op_desc.Input("X")[0]);
auto op_pair = ops.find(op_type_);
if (op_pair == ops.end()) {
PADDLE_THROW("Wrong activation op type!");
}
nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
nvinfer1::ActivationType::kRELU);
op_pair->second);
auto output_name = op_desc.Output("Out")[0];
layer->setName((op_type_ + " (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
protected:
std::string op_type_;
static const std::unordered_map<std::string, nvinfer1::ActivationType> ops;
};
// Lookup table from a fluid activation op type name to the TensorRT
// activation type passed to the Activation layer. An op type missing from
// this table makes ActivationOpConverter::operator() throw
// "Wrong activation op type!".
const std::unordered_map<std::string, nvinfer1::ActivationType>
ActivationOpConverter::ops = {
{"relu", nvinfer1::ActivationType::kRELU},
{"sigmoid", nvinfer1::ActivationType::kSIGMOID},
{"tanh", nvinfer1::ActivationType::kTANH},
};
// Activation converter whose op_type_ is "relu"; op_type_ is the key the base
// class looks up in ActivationOpConverter::ops and uses in the layer name.
class ReluOpConverter : public ActivationOpConverter {
public:
ReluOpConverter() { op_type_ = "relu"; }
};
// Activation converter whose op_type_ is "sigmoid"; op_type_ is the key the
// base class looks up in ActivationOpConverter::ops and uses in the layer name.
class SigmoidOpConverter : public ActivationOpConverter {
public:
SigmoidOpConverter() { op_type_ = "sigmoid"; }
};
// Activation converter whose op_type_ is "tanh"; op_type_ is the key the base
// class looks up in ActivationOpConverter::ops and uses in the layer name.
class TanhOpConverter : public ActivationOpConverter {
public:
TanhOpConverter() { op_type_ = "tanh"; }
};
} // namespace tensorrt
......@@ -48,3 +84,5 @@ class ReluOpConverter : public OpConverter {
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter);
REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter);
......@@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter {
scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Y").front();
layer->setName(("batch_norm (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->weight_map[op_desc.Input("Bias").front()] =
std::move(combile_bias_tensor);
engine_->weight_map[op_desc.Input("Scale").front()] =
......
......@@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter {
axis = axis - 1; // Remove batch dim
layer->setAxis(axis);
auto output_name = op_desc.Output("Out")[0];
layer->setName(("concat (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
......
......@@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter {
layer->setNbGroups(groups);
auto output_name = op_desc.Output("Output").front();
layer->setName(("conv2d (Output: " + output_name + ")").c_str());
engine_->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
 * DropoutOp. Converted to a TensorRT Scale layer in uniform mode whose single
 * scale weight is (1 - dropout_prob); shift and power weights are left empty.
 */
class DropoutOpConverter : public OpConverter {
 public:
  /*
   * Adds the layer for a fluid dropout op to engine_.
   *
   *   op        - proto description of the op; this converter reads its input
   *               "X", its output "Out" and the float attribute
   *               "dropout_prob".
   *   scope     - not used by this converter.
   *   test_mode - when true, the output tensor is additionally declared as an
   *               output of the engine.
   */
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert a fluid dropout op to tensorrt dropout layer";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
    float dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));

    // Build a one-element CPU tensor that holds the scale factor.
    platform::CPUPlace cpu_place;
    std::unique_ptr<framework::LoDTensor> weight_tensor(
        new framework::LoDTensor());
    weight_tensor->Resize(framework::make_ddim({1}));
    auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);
    weight_data[0] = 1 - dropout_prob;

    TensorRTEngine::Weight scale_weights{
        nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data),
        weight_tensor->memory_size() / sizeof(float)};
    // Empty weights: no explicit shift or power coefficients are supplied.
    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                         0};
    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                         0};

    // Argument order after the mode is shift, scale, power.
    auto* layer = TRT_ENGINE_ADD_LAYER(
        engine_, Scale, *const_cast<nvinfer1::ITensor*>(input1),
        nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(),
        power_weights.get());

    const auto output_name = op_desc.Output("Out")[0];

    // Hand the tensor over to the engine's weight map; scale_weights only
    // holds a raw pointer into it (presumably this keeps the buffer alive
    // for as long as the engine needs it -- confirm against TensorRTEngine).
    engine_->weight_map[output_name + "_dropout"] = std::move(weight_tensor);

    layer->setName(("dropout (Output: " + output_name + ")").c_str());
    // Name the output tensor as well, as the other op converters do.
    layer->getOutput(0)->setName(output_name.c_str());
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {
      engine_->DeclareOutput(output_name);
    }
  }
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(dropout);
REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter);
......@@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
shift_weights.get(), scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Out")[0];
layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
......@@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter {
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
auto output_name = op_desc.Output("Out")[0];
layer->setName(("elementwise (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
......
......@@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter {
n_output, tmp_weight.get(), bias.get());
auto output_name = op_desc.Output("Out").front();
layer->setName(("fc (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
if (test_mode) {
......
......@@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter {
layer->setPadding(nv_paddings);
auto output_name = op_desc.Output("Out")[0];
layer->setName(("pool2d (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
......
......@@ -20,18 +20,18 @@ namespace paddle {
namespace inference {
namespace tensorrt {
TEST(ReluOpConverter, main) {
void test_activation(std::string act_type) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));
validator.DeclInputVar("act-X", nvinfer1::Dims2(10, 6));
validator.DeclOutputVar("act-Out", nvinfer1::Dims2(10, 6));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("relu");
desc.SetInput("X", {"relu-X"});
desc.SetOutput("Out", {"relu-Out"});
desc.SetType(act_type);
desc.SetInput("X", {"act-X"});
desc.SetOutput("Out", {"act-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
......@@ -40,8 +40,16 @@ TEST(ReluOpConverter, main) {
validator.Execute(5);
}
// One gtest case per activation op type; each passes its fluid op type name
// to the shared test_activation() driver.
TEST(ReluOpConverter, main) { test_activation("relu"); }
TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); }
TEST(TanhOpConverter, main) { test_activation("tanh"); }
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(relu);
USE_OP(sigmoid);
USE_OP(tanh);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Builds a fluid dropout op description (is_test = 1, dropout_prob = 0.4),
// hands it to the TensorRT conversion validator and executes it, passing
// "mask-Out" as a neglected output.
TEST(DropoutOpConverter, main) {
  const int kBatchSize = 8;
  const int kChannels = 10;

  framework::Scope scope;
  std::unordered_set<std::string> parameters;
  TRTConvertValidation validator(kBatchSize, parameters, scope, 1000);

  // Variables used by the op: one input and the two outputs.
  std::vector<int> input_shape = {kBatchSize, kChannels};
  const nvinfer1::DimsCHW var_dims(kChannels, 1, 1);
  validator.DeclInputVar("dropout-X", input_shape, var_dims);
  validator.DeclOutputVar("dropout-Out", var_dims);
  validator.DeclOutputVar("mask-Out", var_dims);

  // Prepare Op description
  framework::OpDesc desc;
  int test_flag = 1;
  float drop_ratio = 0.4;
  desc.SetType("dropout");
  desc.SetInput("X", {"dropout-X"});
  desc.SetOutput("Mask", {"mask-Out"});
  desc.SetOutput("Out", {"dropout-Out"});
  desc.SetAttr("is_test", test_flag);
  desc.SetAttr("dropout_prob", drop_ratio);

  LOG(INFO) << "set OP";
  validator.SetOp(*desc.Proto());

  LOG(INFO) << "execute";
  std::unordered_set<std::string> skipped_outputs = {"mask-Out"};
  validator.Execute(kBatchSize, skipped_outputs);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(dropout);
function (inference_download_and_uncompress install_dir url)
get_filename_component(filename ${url} NAME)
message(STATUS "Download inference test stuff ${filename} from ${url}")
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
message(STATUS "finish downloading ${filename}")
endfunction(inference_download_and_uncompress)
endfunction()
# Fetch `filename` from `url` into `install_dir` by delegating to
# inference_download(), then unpack it in place with `tar xzf`.
# Runs at configure time (execute_process).
function (inference_download_and_uncompress install_dir url filename)
inference_download(${install_dir} ${url} ${filename})
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
endfunction()
function(download_model_and_data install_dir model_url data_url)
if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
inference_download_and_uncompress(${install_dir} ${model_url})
inference_download_and_uncompress(${install_dir} ${data_url})
function(download_model_and_data install_dir model_name data_name)
if (NOT EXISTS ${install_dir})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_name})
endif()
endfunction()
# Declare an analysis API test via inference_analysis_test():
#   target      - name of the test target
#   install_dir - directory expected to contain `model` and `data.txt`; passed
#                 to the test as --infer_model and --infer_data
#   filename    - source file of the test (SRCS)
# ${INFERENCE_EXTRA_DEPS} is forwarded as EXTRA_DEPS.
function(inference_analysis_api_test target install_dir filename)
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction()
# RNN1
set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1")
download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL})
inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_model=${RNN1_INSTALL_DIR}/model
--infer_data=${RNN1_INSTALL_DIR}/data.txt)
if(NOT APPLE)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
else()
# TODO: fix this test on MACOS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS
message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
endif()
# RNN2
set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
# chinese_ner
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner")
download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL})
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc)
# lac
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac")
download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL})
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt)
set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
# text_classification
set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification")
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL})
inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
--topn=1 # Just run top 1 batch.
)
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
# ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif()
inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
# anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# anakin rnn1
set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared SERIAL)
# anakin mobilenet
if(WITH_GPU)
set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif()
endif()
......@@ -12,21 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC");
DEFINE_string(infer_data, "", "data file for LAC");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
......@@ -124,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data,
bool use_analysis = false) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size);
std::unique_ptr<PaddlePredictor> predictor;
if (use_analysis) {
const int repeat, bool use_analysis = false) {
AnalysisConfig cfg;
cfg.model_dir = model_path;
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size);
std::unique_ptr<PaddlePredictor> predictor;
if (use_analysis) {
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
}
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
}
Timer timer;
if (test_all_data) {
double sum = 0;
LOG(INFO) << "Total number of samples: " << data.datasets.size();
for (int i = 0; i < repeat; i++) {
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
std::vector<std::vector<PaddleTensor>> input_slots_all;
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
input_slots_all.emplace_back(input_slots);
}
PrintTime(batch_size, repeat, 1, 0, sum / repeat);
LOG(INFO) << "Average latency of each sample: "
<< sum / repeat / data.datasets.size() << " ms";
LOG(INFO) << "total number of samples: " << data.datasets.size();
TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
return;
}
timer.tic();
......@@ -190,19 +166,10 @@ void TestLACPrediction(const std::string &model_path,
if (use_analysis) {
// Run once with the native predictor to get reference outputs for comparison.
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], pdata[i]);
}
CompareResult(ref_outputs_slots, outputs_slots);
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
......@@ -231,13 +198,13 @@ void TestLACPrediction(const std::string &model_path,
TEST(Analyzer_LAC, native) {
LOG(INFO) << "LAC with native";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data);
FLAGS_repeat);
}
TEST(Analyzer_LAC, analysis) {
LOG(INFO) << "LAC with analysis";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data, true);
FLAGS_repeat, true);
}
} // namespace analysis
......
......@@ -12,20 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
......@@ -113,17 +100,6 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25};
void TestChineseNERPrediction(bool use_analysis) {
NativeConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs;
std::unique_ptr<PaddlePredictor> predictor;
Timer timer;
if (use_analysis) {
AnalysisConfig cfg;
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.param_file = FLAGS_infer_model + "/param";
......@@ -131,32 +107,28 @@ void TestChineseNERPrediction(bool use_analysis) {
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs;
std::unique_ptr<PaddlePredictor> predictor;
Timer timer;
if (use_analysis) {
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
}
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
double sum = 0;
size_t num_samples;
for (int i = 0; i < FLAGS_repeat; i++) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
// Just one batch, the num_samples remains the same.
num_samples = data.num_samples;
for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
timer.tic();
predictor->Run(input_slots, &outputs);
sum += timer.toc();
}
input_slots_all.emplace_back(input_slots);
}
LOG(INFO) << "total number of samples: " << num_samples;
PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
LOG(INFO) << "average latency of each sample: "
<< sum / FLAGS_repeat / num_samples;
LOG(INFO) << "total number of samples: " << data.num_samples;
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
return;
}
// Prepare inputs.
......@@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
if (use_analysis) {
// Run once with the native predictor to get reference outputs for comparison.
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], result[i]);
}
CompareResult(ref_outputs_slots, outputs);
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
......
......@@ -12,24 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
......@@ -164,26 +147,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
}
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicate model.
void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
AnalysisConfig config;
......@@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
......@@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
base_predictor->Run(input_slots, &base_outputs);
std::vector<std::vector<PaddleTensor>> input_slots_all;
input_slots_all.emplace_back(input_slots);
if (num_threads == 1) {
// Prepare inputs.
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
TestOneThreadPrediction(config, input_slots_all, &outputs);
CompareResult(outputs, base_outputs);
} else {
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
// because AttentionLSTM's hard code nodeid will be damanged.
for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back(
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config));
}
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local input_slots and outputs.
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictors[tid]->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
CompareResult(outputs, base_outputs);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
}
// only return the output of first thread
TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
}
if (use_analysis && activate_ir) {
......@@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) {
// Directly infer with the original model.
TestRNN1Prediction(false, false, i);
// Inference with the original model with the analysis turned on, the
// analysis
// module will transform the program to a data flow graph.
// analysis module will transform the program to a data flow graph.
TestRNN1Prediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large
// kernels.
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
using namespace framework; // NOLINT
struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all;
std::vector<size_t> lod;
std::vector<std::vector<float>> rnn_link_data;
std::vector<float> result_data;
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= link_step_data_all.size()) {
data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
link_step_data_all.begin() + batch_end);
// Prepare LoDs
data.lod.push_back(0);
CHECK(!data.link_step_data_all.empty()) << "empty";
for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
for (const auto &d : data.link_step_data_all[j]) {
data.rnn_link_data.push_back(d);
// calculate lod
data.lod.push_back(data.lod.back() + 11);
}
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ':', &data);
if (num_lines % 2) { // feature
std::vector<std::string> feature_data;
split(data[1], ' ', &feature_data);
std::vector<std::vector<float>> link_step_data;
int feature_count = 1;
std::vector<float> feature;
for (auto &step_data : feature_data) {
std::vector<float> tmp;
split_to_float(step_data, ',', &tmp);
feature.insert(feature.end(), tmp.begin(), tmp.end());
if (feature_count % 11 == 0) { // each sample has 11 features
link_step_data.push_back(feature);
feature.clear();
}
feature_count++;
}
link_step_data_all.push_back(std::move(link_step_data));
} else { // result
std::vector<float> tmp;
split_to_float(data[1], ',', &tmp);
result_data.insert(result_data.end(), tmp.begin(), tmp.end());
}
}
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor feed_tensor;
feed_tensor.name = "feed";
auto one_batch = data->NextBatch();
int token_size = one_batch.rnn_link_data.size();
// each token has 11 features, each feature's dim is 54.
std::vector<int> rnn_link_data_shape({token_size * 11, 54});
feed_tensor.shape = rnn_link_data_shape;
feed_tensor.lod.assign({one_batch.lod});
feed_tensor.dtype = PaddleDType::FLOAT32;
TensorAssignData<float>(&feed_tensor, one_batch.rnn_link_data);
// Set inputs.
input_slots->assign({feed_tensor});
}
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<float> &base_result) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_result[i], 1e-3);
}
}
}
// Test with a really complicated model.
void TestRNN2Prediction() {
AnalysisConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
config.enable_ir_optim = true;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
Timer timer1;
timer1.tic();
for (int i = 0; i < num_times; i++) {
base_predictor->Run(input_slots, &base_outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times);
Timer timer2;
timer2.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times);
CompareResult(base_outputs, data.result_data);
CompareResult(outputs, data.result_data);
}
// gtest case Analyzer.rnn2: runs the RNN2 native-vs-analysis comparison.
TEST(Analyzer, rnn2) { TestRNN2Prediction(); }
} // namespace inference
} // namespace paddle
......@@ -12,23 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
DEFINE_int32(topn, -1, "Run top n batches of data to save time");
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
......@@ -37,24 +21,25 @@ struct DataReader {
explicit DataReader(const std::string &path)
: file(new std::ifstream(path)) {}
bool NextBatch(PaddleTensor *tensor, int batch_size) {
bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
PADDLE_ENFORCE_EQ(batch_size, 1);
std::string line;
tensor->lod.clear();
tensor->lod.emplace_back(std::vector<size_t>({0}));
PaddleTensor tensor;
tensor.dtype = PaddleDType::INT64;
tensor.lod.emplace_back(std::vector<size_t>({0}));
std::vector<int64_t> data;
for (int i = 0; i < batch_size; i++) {
if (!std::getline(*file, line)) return false;
inference::split_to_int64(line, ' ', &data);
}
tensor->lod.front().push_back(data.size());
tensor.lod.front().push_back(data.size());
tensor->data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
tensor->shape.clear();
tensor->shape.push_back(data.size());
tensor->shape.push_back(1);
tensor.data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
tensor.shape.push_back(data.size());
tensor.shape.push_back(1);
input->assign({tensor});
return true;
}
......@@ -68,32 +53,28 @@ void Main(int batch_size) {
config.model_dir = FLAGS_infer_model;
config.use_gpu = false;
config.enable_ir_optim = true;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots(1);
// one batch starts
// data --
auto &input = input_slots[0];
input.dtype = PaddleDType::INT64;
inference::Timer timer;
double sum = 0;
std::vector<PaddleTensor> output_slots;
std::vector<PaddleTensor> input_slots, output_slots;
DataReader reader(FLAGS_infer_data);
std::vector<std::vector<PaddleTensor>> input_slots_all;
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
int num_batches = 0;
for (int t = 0; t < FLAGS_repeat; t++) {
DataReader reader(FLAGS_infer_data);
while (reader.NextBatch(&input, FLAGS_batch_size)) {
if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
input_slots_all.emplace_back(input_slots);
++num_batches;
}
LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
return;
}
PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
// one batch starts
// data --
reader.NextBatch(&input_slots, FLAGS_batch_size);
input_slots_all.emplace_back(input_slots);
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
// Get output
LOG(INFO) << "get outputs " << output_slots.size();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
// One parsed input sample, as produced by ProcessALine().
struct Record {
// Values parsed from the first tab-separated column of a line.
std::vector<float> data;
// Integers parsed from the second tab-separated column; used as the tensor
// shape.
std::vector<int32_t> shape;
};
// Parses one line of the form "<data>\t<shape>" into a Record: both columns
// are space-separated, the first is converted with std::stof and the second
// with std::stoi. Aborts (glog CHECK) unless exactly two columns are present.
Record ProcessALine(const std::string &line) {
  VLOG(3) << "process a line";

  std::vector<std::string> fields;
  split(line, '\t', &fields);
  CHECK_EQ(fields.size(), 2UL)
      << "data format error, should be <data>\t<shape>";

  std::vector<std::string> value_tokens;
  split(fields[0], ' ', &value_tokens);
  std::vector<std::string> dim_tokens;
  split(fields[1], ' ', &dim_tokens);

  Record parsed;
  for (const auto &token : value_tokens) {
    parsed.data.push_back(std::stof(token));
  }
  for (const auto &token : dim_tokens) {
    parsed.shape.push_back(std::stoi(token));
  }

  VLOG(3) << "data size " << parsed.data.size();
  VLOG(3) << "data shape size " << parsed.shape.size();
  return parsed;
}
/*
* Use the native and analysis fluid engine to inference the demo.
* ocr, mobilenet and se_resnext50
*/
void TestVisualPrediction(bool use_mkldnn) {
std::unique_ptr<PaddlePredictor> predictor;
AnalysisConfig cfg;
cfg.param_file = FLAGS_infer_model + "/__params__";
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.use_gpu = false;
cfg._use_mkldnn = use_mkldnn;
cfg.device = 0;
cfg.enable_ir_optim = true;
// TODO(TJ): fix fusion gru
cfg.ir_passes.push_back("fc_gru_fuse_pass");
#ifdef PADDLE_WITH_MKLDNN
// disable mkldnn fuse since it should have some bugs
cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
#endif
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
// Only have single batch of data.
std::string line;
std::ifstream file(FLAGS_infer_data);
std::getline(file, line);
auto record = ProcessALine(line);
file.close();
// Inference.
PaddleTensor input;
input.shape = record.shape;
input.data =
PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
input.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> outputs_slots;
Timer timer;
timer.tic();
for (int i = 0; i < FLAGS_repeat; i++) {
predictor->Run({input}, &outputs_slots);
}
PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
timer.toc() / FLAGS_repeat);
VLOG(3) << "output.size " << outputs_slots.size();
// run native as reference
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run({input}, &ref_outputs_slots);
CompareResult(outputs_slots, ref_outputs_slots);
// print what are fused
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
}
// gtest case Analyzer_vis.analysis: runs the comparison with use_mkldnn off.
TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
#ifdef PADDLE_WITH_MKLDNN
// Only compiled when PADDLE_WITH_MKLDNN is defined: same comparison with
// use_mkldnn on.
TEST(Analyzer_vis, analysis_mkldnn) {
TestVisualPrediction(/*use_mkldnn*/ true);
}
#endif
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data file");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) {
EXPECT_GT(outputs.size(), 0);
EXPECT_EQ(outputs.size(), ref_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &ref_out = ref_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_GT(size, 0);
EXPECT_EQ(size, ref_size);
EXPECT_EQ(out.dtype, ref_out.dtype);
switch (out.dtype) {
case PaddleDType::INT64: {
int64_t *pdata = static_cast<int64_t *>(out.data.data());
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_EQ(pdata_ref[j], pdata[j]);
}
break;
}
case PaddleDType::FLOAT32: {
float *pdata = static_cast<float *>(out.data.data());
float *pdata_ref = static_cast<float *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
}
break;
}
}
}
}
// Creates one analysis predictor from `config`, runs every input batch
// FLAGS_repeat times on the calling thread and prints the timing. `outputs`
// is passed to each Run() call.
void TestOneThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
    std::vector<PaddleTensor> *outputs) {
  const int batch_size = FLAGS_batch_size;
  const int repeat = FLAGS_repeat;
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  Timer stopwatch;
  stopwatch.tic();
  for (int round = 0; round < repeat; ++round) {
    for (const auto &batch : inputs) {
      predictor->Run(batch, outputs);
    }
  }
  // Thread count 1, thread id 0: this is the single-threaded path.
  PrintTime(batch_size, repeat, 1, 0, stopwatch.toc() / repeat, inputs.size());
}
// Runs the same inputs on `num_threads` threads, one analysis predictor per
// thread, and prints the timing of each thread.
//
// Every thread works on its own copy of `inputs` and its own output vector.
// After the run, the outputs of thread 0 are handed back through `outputs`
// (when it is non-null) so that callers see results just as they do on the
// single-threaded path; the other threads' outputs are discarded.
void TestMultiThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
    std::vector<PaddleTensor> *outputs, int num_threads) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
  std::vector<std::thread> threads;
  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
  // because AttentionLSTM's hard-coded node id will be damaged.
  // Hence all predictors are created up front on the calling thread.
  for (int tid = 0; tid < num_threads; ++tid) {
    predictors.emplace_back(
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
            config));
  }
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
      // Each thread should have local inputs and outputs.
      // The inputs of each thread are all the same.
      std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
      std::vector<PaddleTensor> outputs_tid;
      Timer timer;
      timer.tic();
      for (int i = 0; i < num_times; i++) {
        for (size_t j = 0; j < inputs_tid.size(); j++) {
          predictors[tid]->Run(inputs_tid[j], &outputs_tid);
        }
      }
      PrintTime(batch_size, num_times, num_threads, tid,
                timer.toc() / num_times, inputs_tid.size());
      // Only return the output of the first thread. No other thread touches
      // *outputs, and the caller reads it only after join().
      if (tid == 0 && outputs != nullptr) {
        outputs->swap(outputs_tid);
      }
    });
  }
  for (int i = 0; i < num_threads; ++i) {
    threads[i].join();
  }
}
// Dispatches to the single-threaded runner when num_threads is exactly 1 and
// to the multi-threaded runner for any other value.
void TestPrediction(AnalysisConfig config,
                    const std::vector<std::vector<PaddleTensor>> inputs,
                    std::vector<PaddleTensor> *outputs, int num_threads) {
  if (num_threads != 1) {
    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
    return;
  }
  TestOneThreadPrediction(config, inputs, outputs);
}
} // namespace inference
} // namespace paddle
......@@ -167,6 +167,8 @@ void BuddyAllocator::Free(void* p) {
}
// Returns the allocator's total_used_ counter.
size_t BuddyAllocator::Used() { return total_used_; }
// Returns the min_chunk_size_ this allocator was configured with.
size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
// Returns the max_chunk_size_ this allocator was configured with.
size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0;
......
......@@ -42,6 +42,8 @@ class BuddyAllocator {
void* Alloc(size_t unaligned_size);
void Free(void* ptr);
size_t Used();
size_t GetMinChunkSize();
size_t GetMaxChunkSize();
public:
// Disable copy and assignment
......
......@@ -119,8 +119,8 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
<< place.device << ", available " << avail << " bytes";
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev);
}
......
......@@ -296,6 +296,7 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory)
op_library(fusion_lstm_op DEPS cpu_lstm_compute)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
......
......@@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
bool fuse_relu = ctx.Attr<bool>("fuse_relu");
bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise");
int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
// TODO: add support for dilation
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
......@@ -314,6 +314,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
if (g > 1) {
int o = weights_tz[0];
int i = weights_tz[1];
int h = weights_tz[2];
int w = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = g;
weights_tz[1] = o / g;
weights_tz[2] = i;
weights_tz[3] = h;
weights_tz[4] = w;
}
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives
......@@ -327,7 +340,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
{weights_tz}, platform::MKLDNNGetDataType<T>(),
(g == 1) ? filter->format() : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
......@@ -340,7 +354,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
weights_tz, platform::MKLDNNGetDataType<T>(),
(g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc(
......@@ -352,12 +367,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides,
paddings, mkldnn_engine, fuse_relu);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine,
fuse_relu, fuse_eltwise);
} else {
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
paddings, mkldnn_engine, fuse_relu);
conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, fuse_relu, fuse_eltwise);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
......@@ -407,16 +423,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
private:
mkldnn::primitive_attr AddRelu() const {
mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
bool fuse_eltwise) const {
mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations;
// Fusion with Elementwise layer relies on adding a sum post-operation with
// the scale parameter. It is assumed that when fuse_eltwise is true, the
// Output tensor contains the data coming from residual connection. The
// result of this post_op is: Output = scale * Output + Conv_Out.
if (fuse_eltwise) {
post_operations.append_sum(1.0f);
}
// Fusion with ReLU layer is executed through the PostOps feature. Create a
// PostOps object and configure it to execute an eltwise relu operation.
mkldnn::primitive_attr conv_attr;
if (fuse_relu) {
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 0.0f;
mkldnn::post_ops post_operations;
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
}
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
......@@ -425,8 +451,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine,
const bool fuse_relu) const {
const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_eltwise) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
......@@ -435,10 +461,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr;
if (fuse_relu) {
conv_attr = AddRelu();
}
mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
......@@ -452,8 +475,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& bias, const memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine,
const bool fuse_relu) const {
const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_eltwise) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
......@@ -462,10 +485,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias, dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr;
if (fuse_relu) {
conv_attr = AddRelu();
}
mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
......
......@@ -164,6 +164,11 @@ void Conv2DOpMaker::Make() {
.SetDefault(false);
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_eltwise",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is connected via skip connection "
"to a previous layer.")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......
......@@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -21,7 +22,7 @@ namespace operators {
*/
template <typename T>
inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
const framework::Tensor& gt_boxes, const T* weights,
const framework::Tensor& gt_boxes, const float* weights,
const bool normalized, framework::Tensor* box_delta) {
auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
......@@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num,
}
}
// Fills overlaps(i, j) with the intersection-over-union of row i of r_boxes
// and row j of c_boxes.
//
// Columns 0..3 of each box row are read as (x_min, y_min, x_max, y_max);
// widths and heights are computed with a "+ 1", and a negative intersection
// extent is clamped to zero.
template <typename T>
void BboxOverlaps(const framework::Tensor& r_boxes,
                  const framework::Tensor& c_boxes,
                  framework::Tensor* overlaps) {
  auto rows = framework::EigenTensor<T, 2>::From(r_boxes);
  auto cols = framework::EigenTensor<T, 2>::From(c_boxes);
  auto iou = framework::EigenTensor<T, 2>::From(*overlaps);
  const int num_rows = r_boxes.dims()[0];
  const int num_cols = c_boxes.dims()[0];
  const T zero = static_cast<T>(0.0);

  for (int r = 0; r < num_rows; ++r) {
    const T row_area =
        (rows(r, 2) - rows(r, 0) + 1) * (rows(r, 3) - rows(r, 1) + 1);
    for (int c = 0; c < num_cols; ++c) {
      const T col_area =
          (cols(c, 2) - cols(c, 0) + 1) * (cols(c, 3) - cols(c, 1) + 1);
      // Corners of the intersection rectangle.
      const T left = std::max(rows(r, 0), cols(c, 0));
      const T top = std::max(rows(r, 1), cols(c, 1));
      const T right = std::min(rows(r, 2), cols(c, 2));
      const T bottom = std::min(rows(r, 3), cols(c, 3));
      const T width = std::max(right - left + 1, zero);
      const T height = std::max(bottom - top + 1, zero);
      const T intersection = width * height;
      iou(r, c) = intersection / (row_area + col_area - intersection);
    }
  }
}
} // namespace operators
} // namespace paddle
......@@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
"Input(RpnRois) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
"Input(GtClasses) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
"Input(IsCrowd) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
"Input(GtBoxes) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("ImScales"),
"Input(ImScales) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasOutput("Rois"),
"Output(Rois) of RpnTargetAssignOp should not be null");
......@@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
auto gt_classes_dims = ctx->GetInputDim("GtClasses");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto im_scales_dims = ctx->GetInputDim("ImScales");
auto im_info_dims = ctx->GetInputDim("ImInfo");
PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2,
"The rank of Input(RpnRois) must be 2.");
PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1,
"The rank of Input(GtClasses) must be 1.");
PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
"The rank of Input(GtBoxes) must be 2.");
PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1,
"The rank of Input(ImScales) must be 1.");
PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
"The rank of Input(ImInfo) must be 2.");
int class_nums = ctx->Attrs().Get<int>("class_nums");
ctx->SetOutputDim("Rois", {-1, 4});
ctx->SetOutputDim("LabelsInt32", {-1});
ctx->SetOutputDim("LabelsInt32", {-1, 1});
ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums});
ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums});
ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums});
......@@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context,
concat_functor(context, inputs, axis, out_tensor);
}
// Fills overlaps(i, j) with the intersection-over-union of row i of r_boxes
// and row j of c_boxes. Columns 0..3 of each box row are read as
// (x_min, y_min, x_max, y_max).
template <typename T>
void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
Tensor* overlaps) {
auto r_boxes_et = framework::EigenTensor<T, 2>::From(r_boxes);
auto c_boxes_et = framework::EigenTensor<T, 2>::From(c_boxes);
auto overlaps_et = framework::EigenTensor<T, 2>::From(*overlaps);
int r_num = r_boxes.dims()[0];
int c_num = c_boxes.dims()[0];
auto zero = static_cast<T>(0.0);
T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h,
inter_area;
for (int i = 0; i < r_num; ++i) {
// Box extents are computed with a "+ 1" on both width and height.
r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) *
(r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1);
for (int j = 0; j < c_num; ++j) {
c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) *
(c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1);
// Corners of the intersection rectangle.
x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0));
y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1));
x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2));
y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3));
// A negative extent means the boxes do not overlap; clamp it to zero.
inter_w = std::max(x_max - x_min + 1, zero);
inter_h = std::max(y_max - y_min + 1, zero);
inter_area = inter_w * inter_h;
// IoU = intersection / union.
overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area);
}
}
}
template <typename T>
std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& context, Tensor* iou,
const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
const float bg_thresh_hi, const float bg_thresh_lo,
std::minstd_rand engine) {
const Tensor& is_crowd, const int batch_size_per_im,
const float fg_fraction, const float fg_thresh, const float bg_thresh_hi,
const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) {
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::vector<int> gt_inds;
T* proposal_to_gt_overlaps = iou->mutable_data<T>(context.GetPlace());
int64_t gt_num = is_crowd.numel();
const int* crowd_data = is_crowd.data<int>();
T* proposal_to_gt_overlaps = iou->data<T>();
int64_t row = iou->dims()[0];
int64_t col = iou->dims()[1];
float epsilon = 0.00001;
......@@ -152,6 +125,9 @@ std::vector<std::vector<int>> SampleFgBgGt(
for (int64_t i = 0; i < row; ++i) {
const T* v = proposal_to_gt_overlaps + i * col;
T max_overlap = *std::max_element(v, v + col);
if ((i < gt_num) && (crowd_data[i])) {
max_overlap = -1.0;
}
if (max_overlap > fg_thresh) {
for (int64_t j = 0; j < col; ++j) {
T val = proposal_to_gt_overlaps[i * col + j];
......@@ -170,10 +146,11 @@ std::vector<std::vector<int>> SampleFgBgGt(
}
// Reservoir Sampling
std::uniform_real_distribution<float> uniform(0, 1);
int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction);
int fg_rois_this_image = fg_inds.size();
int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image);
std::uniform_real_distribution<float> uniform(0, 1);
if (use_random) {
const int64_t fg_size = static_cast<int64_t>(fg_inds.size());
if (fg_size > fg_rois_per_this_image) {
for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) {
......@@ -184,6 +161,7 @@ std::vector<std::vector<int>> SampleFgBgGt(
}
}
}
}
std::vector<int> new_fg_inds(fg_inds.begin(),
fg_inds.begin() + fg_rois_per_this_image);
std::vector<int> new_gt_inds(gt_inds.begin(),
......@@ -192,6 +170,7 @@ std::vector<std::vector<int>> SampleFgBgGt(
int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image;
int bg_rois_this_image = bg_inds.size();
int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image);
if (use_random) {
const int64_t bg_size = static_cast<int64_t>(bg_inds.size());
if (bg_size > bg_rois_per_this_image) {
for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) {
......@@ -200,6 +179,7 @@ std::vector<std::vector<int>> SampleFgBgGt(
std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i);
}
}
}
std::vector<int> new_bg_inds(bg_inds.begin(),
bg_inds.begin() + bg_rois_per_this_image);
std::vector<std::vector<int>> res;
......@@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
template <typename T>
std::vector<Tensor> SampleRoisForOneImage(
const platform::CPUDeviceContext& context, Tensor* rpn_rois,
Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale,
Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info,
const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
const float bg_thresh_hi, const float bg_thresh_lo,
const std::vector<float>& bbox_reg_weights, const int class_nums,
std::minstd_rand engine) {
std::minstd_rand engine, bool use_random) {
auto rpn_rois_et = framework::EigenTensor<T, 2>::From(*rpn_rois);
auto im_scale_data = im_scale->data<T>()[0];
rpn_rois_et = rpn_rois_et / im_scale_data;
auto im_scale = im_info->data<T>()[2];
rpn_rois_et = rpn_rois_et / im_scale;
Tensor boxes;
int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0];
......@@ -270,8 +250,8 @@ std::vector<Tensor> SampleRoisForOneImage(
// Generate proposal index
std::vector<std::vector<int>> fg_bg_gt = SampleFgBgGt<T>(
context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction,
fg_thresh, bg_thresh_hi, bg_thresh_lo, engine);
context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im,
fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random);
std::vector<int> fg_inds = fg_bg_gt[0];
std::vector<int> bg_inds = fg_bg_gt[1];
std::vector<int> gt_inds = fg_bg_gt[2];
......@@ -291,15 +271,15 @@ std::vector<Tensor> SampleRoisForOneImage(
// Compute targets
Tensor bbox_targets_single;
bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
&bbox_targets_single);
BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(),
false, &bbox_targets_single);
// Scale rois
Tensor sampled_rois;
sampled_rois.mutable_data<T>(sampled_boxes.dims(), context.GetPlace());
auto sampled_rois_et = framework::EigenTensor<T, 2>::From(sampled_rois);
auto sampled_boxes_et = framework::EigenTensor<T, 2>::From(sampled_boxes);
sampled_rois_et = sampled_boxes_et * im_scale_data;
sampled_rois_et = sampled_boxes_et * im_scale;
// Expand box targets
Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights;
......@@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override {
auto* rpn_rois = context.Input<LoDTensor>("RpnRois");
auto* gt_classes = context.Input<LoDTensor>("GtClasses");
auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
auto* im_scales = context.Input<LoDTensor>("ImScales");
auto* im_info = context.Input<LoDTensor>("ImInfo");
auto* rois = context.Output<LoDTensor>("Rois");
auto* labels_int32 = context.Output<LoDTensor>("LabelsInt32");
......@@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
std::vector<float> bbox_reg_weights =
context.Attr<std::vector<float>>("bbox_reg_weights");
int class_nums = context.Attr<int>("class_nums");
bool use_random = context.Attr<bool>("use_random");
PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL,
"GenerateProposalLabelsOp rpn_rois needs 1 level of LoD");
PADDLE_ENFORCE_EQ(
gt_classes->lod().size(), 1UL,
"GenerateProposalLabelsOp gt_classes needs 1 level of LoD");
PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
"GenerateProposalLabelsOp is_crowd needs 1 level of LoD");
PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
"GenerateProposalLabelsOp gt_boxes needs 1 level of LoD");
int64_t n = static_cast<int64_t>(rpn_rois->lod().back().size() - 1);
rois->mutable_data<T>({n * batch_size_per_im, kBoxDim}, context.GetPlace());
labels_int32->mutable_data<int>({n * batch_size_per_im},
labels_int32->mutable_data<int>({n * batch_size_per_im, 1},
context.GetPlace());
bbox_targets->mutable_data<T>({n * batch_size_per_im, kBoxDim * class_nums},
context.GetPlace());
......@@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
int seed = rnd();
engine.seed(seed);
framework::LoD lod;
......@@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
auto rpn_rois_lod = rpn_rois->lod().back();
auto gt_classes_lod = gt_classes->lod().back();
auto is_crowd_lod = is_crowd->lod().back();
auto gt_boxes_lod = gt_boxes->lod().back();
for (int i = 0; i < n; ++i) {
Tensor rpn_rois_slice =
rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
Tensor gt_classes_slice =
gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
Tensor is_crowd_slice =
is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
Tensor gt_boxes_slice =
gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
Tensor im_scales_slice = im_scales->Slice(i, i + 1);
Tensor im_info_slice = im_info->Slice(i, i + 1);
std::vector<Tensor> tensor_output = SampleRoisForOneImage<T>(
dev_ctx, &rpn_rois_slice, &gt_classes_slice, &gt_boxes_slice,
&im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh,
bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine);
dev_ctx, &rpn_rois_slice, &gt_classes_slice, &is_crowd_slice,
&gt_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction,
fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
engine, use_random);
Tensor sampled_rois = tensor_output[0];
Tensor sampled_labels_int32 = tensor_output[1];
Tensor sampled_bbox_targets = tensor_output[2];
......@@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
bbox_inside_weights->set_lod(lod);
bbox_outside_weights->set_lod(lod);
rois->Resize({num_rois, kBoxDim});
labels_int32->Resize({num_rois});
labels_int32->Resize({num_rois, 1});
bbox_targets->Resize({num_rois, kBoxDim * class_nums});
bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums});
bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums});
......@@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
// TODO(buxingyuan): Add Document
AddInput("RpnRois", "RpnRois.");
AddInput("GtClasses", "GtClasses.");
AddInput("IsCrowd", "IsCrowd.");
AddInput("GtBoxes", "GtBoxes.");
AddInput("ImScales", "ImScales.");
AddInput("ImInfo", "ImInfo.");
AddOutput("Rois", "Rois.");
AddOutput("LabelsInt32", "LabelsInt32.");
......@@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("bg_thresh_lo", "bg_thresh_lo");
AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights");
AddAttr<int>("class_nums", "class_nums");
AddAttr<bool>("fix_seed", "fix_seed").SetDefault(false);
AddAttr<int>("seed", "seed").SetDefault(0);
AddAttr<bool>("use_random", "use_random").SetDefault(true);
AddComment(R"DOC(
Generate Proposals Labels Operator.
......
......@@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
}
for (int64_t i = 0; i < row; ++i) {
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len];
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1];
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2;
T anchor_center_y =
(anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2;
T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
......@@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2]) *
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2],
std::log(1000.0 / 16.0))) *
anchor_width;
bbox_height = std::exp(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3]) *
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3],
std::log(1000.0 / 16.0))) *
anchor_height;
} else {
bbox_center_x =
bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
std::log(1000.0 / 16.0))) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
std::log(1000.0 / 16.0))) *
anchor_height;
}
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
}
// return proposals;
}
......@@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
float min_size, const Tensor &im_info, Tensor *keep) {
const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
min_size *= im_info_data[2];
T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0], 1});
min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
int keep_len = 0;
for (int i = 0; i < boxes->dims()[0]; ++i) {
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
T ws_origin_scale =
(boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
T hs_origin_scale =
(boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
T x_ctr = boxes_data[4 * i] + ws / 2;
T y_ctr = boxes_data[4 * i + 1] + hs / 2;
if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
y_ctr <= im_info_data[0]) {
if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
keep_data[keep_len++] = i;
}
}
......@@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = inter_xmax - inter_xmin;
const T inter_h = inter_ymax - inter_ymin;
const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
......
......@@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("DistMat"),
"Input(DistMat) of RpnTargetAssignOp should not be null");
// Each of the four inputs must be present before any shape is queried.
PADDLE_ENFORCE(ctx->HasInput("Anchor"),
               "Input(Anchor) of RpnTargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
               "Input(GtBoxes) of RpnTargetAssignOp should not be null");
// The message names IsCrowd (it previously said Anchor) so that a missing
// IsCrowd input is reported under the right name.
PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
               "Input(IsCrowd) of RpnTargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
               "Input(ImInfo) of RpnTargetAssignOp should not be null");
PADDLE_ENFORCE(
ctx->HasOutput("LocationIndex"),
......@@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(
ctx->HasOutput("TargetLabel"),
"Output(TargetLabel) of RpnTargetAssignOp should not be null");
auto in_dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(in_dims.size(), 2,
"The rank of Input(DistMat) must be 2.");
PADDLE_ENFORCE(
ctx->HasOutput("TargetBBox"),
"Output(TargetBBox) of RpnTargetAssignOp should not be null");
auto anchor_dims = ctx->GetInputDim("Anchor");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto im_info_dims = ctx->GetInputDim("ImInfo");
PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
"The rank of Input(Anchor) must be 2.");
PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
"The rank of Input(GtBoxes) must be 2.");
PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
"The rank of Input(ImInfo) must be 2.");
ctx->SetOutputDim("LocationIndex", {-1});
ctx->SetOutputDim("ScoreIndex", {-1});
......@@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("DistMat")->type()),
ctx.Input<framework::LoDTensor>("Anchor")->type()),
platform::CPUPlace());
}
};
template <typename T>
class RpnTargetAssignKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* anchor_t = context.Input<Tensor>("Anchor"); // (H*W*A) * 4
auto* gt_bbox_t = context.Input<Tensor>("GtBox");
auto* dist_t = context.Input<LoDTensor>("DistMat");
auto* loc_index_t = context.Output<Tensor>("LocationIndex");
auto* score_index_t = context.Output<Tensor>("ScoreIndex");
auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox");
auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel");
auto lod = dist_t->lod().back();
int64_t batch_num = static_cast<int64_t>(lod.size() - 1);
int64_t anchor_num = dist_t->dims()[1];
PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]);
int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
float pos_threshold = context.Attr<float>("rpn_positive_overlap");
float neg_threshold = context.Attr<float>("rpn_negative_overlap");
float fg_fraction = context.Attr<float>("fg_fraction");
int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction);
int64_t max_num = batch_num * anchor_num;
auto place = context.GetPlace();
// Copies all `to_add->numel()` elements of `to_add` into the buffer of `out`,
// starting at element index `offset`. `T` is the element type; it comes from
// the template declaration that precedes this function. No bounds check is
// made here: the caller must make sure `out` has room for the copied range.
void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) {
  T* dst = out->data<T>() + offset;
  const T* src = to_add->data<T>();
  const size_t byte_count = to_add->numel() * sizeof(T);
  memcpy(dst, src, byte_count);
}
tgt_bbox_t->mutable_data<T>({max_num, 4}, place);
auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place);
auto* score_index = score_index_t->mutable_data<int>({max_num}, place);
Tensor tmp_tgt_lbl;
auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed);
int fg_num = 0;
int bg_num = 0;
for (int i = 0; i < batch_num; ++i) {
Tensor dist = dist_t->Slice(lod[i], lod[i + 1]);
Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]);
auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold,
rpn_batch_size, fg_num_per_batch, engine,
tmp_lbl_data + i * anchor_num);
int cur_fg_num = fg_bg_gt[0].size();
int cur_bg_num = fg_bg_gt[1].size();
std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index,
[i, anchor_num](int d) { return d + i * anchor_num; });
memcpy(score_index, loc_index, cur_fg_num * sizeof(int));
std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(),
score_index + cur_fg_num,
[i, anchor_num](int d) { return d + i * anchor_num; });
// get target bbox deltas
if (cur_fg_num) {
Tensor fg_gt;
T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place);
Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num);
T* tgt_data = tgt_bbox.data<T>();
Gather<T>(anchor_t->data<T>(), 4,
reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num,
tgt_data);
Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]),
cur_fg_num, gt_data);
BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox);
}
loc_index += cur_fg_num;
score_index += cur_fg_num + cur_bg_num;
fg_num += cur_fg_num;
bg_num += cur_bg_num;
}
int lbl_num = fg_num + bg_num;
PADDLE_ENFORCE_LE(fg_num, max_num);
PADDLE_ENFORCE_LE(lbl_num, max_num);
tgt_bbox_t->Resize({fg_num, 4});
loc_index_t->Resize({fg_num});
score_index_t->Resize({lbl_num});
auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num,
lbl_data);
// Selects the anchors that lie inside the image, allowing a tolerance of
// `rpn_straddle_thresh` on each side.
//
// `anchor` is read as rows of 4 values. When `rpn_straddle_thresh >= 0`, row i
// is kept if values 0 and 1 are >= -rpn_straddle_thresh, value 2 is
// < im_width + rpn_straddle_thresh and value 3 is
// < im_height + rpn_straddle_thresh. When the threshold is negative, every
// anchor is kept.
//
// Returns two tensors: [0] an int tensor of shape {inside_num} with the kept
// row indices, and [1] a T tensor of shape {inside_num, 4} filled by Gather
// from `anchor` using those indices.
template <typename T>
std::vector<Tensor> FilterStraddleAnchor(
const platform::CPUDeviceContext& context, const Tensor* anchor,
const float rpn_straddle_thresh, T im_height, T im_width) {
std::vector<int> inds_inside;
int anchor_num = anchor->dims()[0];
auto* anchor_data = anchor->data<T>();
if (rpn_straddle_thresh >= 0) {
int index;
for (int i = 0; i < anchor_num; ++i) {
// Offset of the first of the 4 values belonging to anchor i.
index = i * 4;
if ((anchor_data[index + 0] >= -rpn_straddle_thresh) &&
(anchor_data[index + 1] >= -rpn_straddle_thresh) &&
(anchor_data[index + 2] < im_width + rpn_straddle_thresh) &&
(anchor_data[index + 3] < im_height + rpn_straddle_thresh)) {
inds_inside.emplace_back(i);
}
// NOTE(review): the lines from `private:` down to `target_label[j] = 1;`
// look like removed-side residue of an older ScoreAssign member left in by
// the diff rendering; they do not appear to belong to FilterStraddleAnchor.
// Confirm against the real file.
private:
void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
const int row, const int col, const float pos_threshold,
const float neg_threshold, int64_t* target_label,
std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
float epsilon = 0.0001;
for (int64_t i = 0; i < row; ++i) {
const T* v = dist_data + i * col;
T max = *std::max_element(v, v + col);
for (int64_t j = 0; j < col; ++j) {
if (std::abs(max - v[j]) < epsilon) {
target_label[j] = 1;
}
} else {
// Negative threshold: filtering is disabled, keep every anchor index.
for (int i = 0; i < anchor_num; ++i) {
inds_inside.emplace_back(i);
}
}
int inside_num = inds_inside.size();
// Copy the kept indices into a tensor so they can be returned to the caller.
Tensor inds_inside_t;
int* inds_inside_data =
inds_inside_t.mutable_data<int>({inside_num}, context.GetPlace());
std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data);
Tensor inside_anchor_t;
T* inside_anchor_data =
inside_anchor_t.mutable_data<T>({inside_num, 4}, context.GetPlace());
// Presumably copies the 4-wide anchor rows named by inds_inside_data into
// inside_anchor_data; Gather is defined outside this view.
Gather<T>(anchor->data<T>(), 4, inds_inside_data, inside_num,
inside_anchor_data);
std::vector<Tensor> res;
res.emplace_back(inds_inside_t);
res.emplace_back(inside_anchor_t);
return res;
}
// Pick the fg/bg
const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
for (int64_t j = 0; j < col; ++j) {
if (anchor_to_gt_max_data[j] >= pos_threshold) {
target_label[j] = 1;
} else if (anchor_to_gt_max_data[j] < neg_threshold) {
target_label[j] = 0;
}
if (target_label[j] == 1) {
fg_inds->push_back(j);
} else if (target_label[j] == 0) {
bg_inds->push_back(j);
}
// Builds a tensor containing only the ground-truth boxes whose `is_crowd`
// entry equals 0.
//
// `is_crowd` is read as int data, one entry per row of `gt_boxes`. The result
// is a T tensor of shape {ncrowd_num, 4}, where ncrowd_num is the number of
// entries equal to 0; it is filled by Gather from `gt_boxes`.
template <typename T>
Tensor FilterCrowdGt(const platform::CPUDeviceContext& context,
Tensor* gt_boxes, Tensor* is_crowd) {
int gt_num = gt_boxes->dims()[0];
std::vector<int> not_crowd_inds;
auto* is_crowd_data = is_crowd->data<int>();
// Collect the row indices whose crowd flag is 0.
for (int i = 0; i < gt_num; ++i) {
if (is_crowd_data[i] == 0) {
not_crowd_inds.emplace_back(i);
}
}
// NOTE(review): the next two lines look like the removed-side signature of an
// older ReservoirSampling member left in by the diff rendering; they do not
// appear to belong to FilterCrowdGt. Confirm against the real file.
void ReservoirSampling(const int num, std::minstd_rand engine,
std::vector<int>* inds) const {
int ncrowd_num = not_crowd_inds.size();
Tensor ncrowd_gt_boxes;
T* ncrowd_gt_boxes_data =
ncrowd_gt_boxes.mutable_data<T>({ncrowd_num, 4}, context.GetPlace());
// Presumably copies the 4-wide rows named by not_crowd_inds into the new
// tensor; Gather is defined outside this view.
Gather<T>(gt_boxes->data<T>(), 4, not_crowd_inds.data(), ncrowd_num,
ncrowd_gt_boxes_data);
return ncrowd_gt_boxes;
}
// Shrinks `*inds` to at most `num` entries.
//
// If `*inds` already holds `num` or fewer entries it is left untouched.
// Otherwise, when `use_random` is true, each entry at position >= num may be
// swapped into one of the first `num` slots using draws from `engine`, and
// then the vector is truncated to `num`. When `use_random` is false the first
// `num` entries are kept as they are.
//
// `engine` is taken by value, so the caller's generator state is not advanced.
void ReservoirSampling(const int num, std::vector<int>* inds,
                       std::minstd_rand engine, bool use_random) {
  std::uniform_real_distribution<float> uniform(0, 1);
  const size_t total = inds->size();
  const size_t keep = static_cast<size_t>(num);

  // Nothing to drop: leave the vector exactly as it is.
  if (total <= keep) {
    return;
  }

  if (use_random) {
    for (size_t pos = keep; pos < total; ++pos) {
      // Draw a slot in [0, pos); it replaces a kept entry only when it falls
      // inside the first `num` positions.
      const int slot = std::floor(uniform(engine) * pos);
      if (slot < num) {
        std::iter_swap(inds->begin() + slot, inds->begin() + pos);
      }
    }
  }

  inds->resize(num);
}
// Chooses foreground and background anchors for one image and writes their
// labels.
//
// `anchor_by_gt_overlap_data` is indexed as [i * gt_num + j] (anchor i, gt j).
// `anchor_to_gt_max` holds one value per anchor and `gt_to_anchor_max` one
// value per gt box.
//
// Steps:
//   1. An anchor becomes a foreground candidate if one of its overlaps is
//      within `epsilon` of that gt box's maximum, or if its own maximum is
//      >= rpn_positive_overlap.
//   2. Candidates are cut down by ReservoirSampling to at most
//      int(rpn_fg_fraction * rpn_batch_size_per_im) and labelled 1.
//   3. Anchors whose maximum is < rpn_negative_overlap are background
//      candidates; they are cut down to at most
//      rpn_batch_size_per_im - fg_num and labelled 0.
//   4. `fg_inds` / `bg_inds` receive the anchor indices whose final label is
//      1 / 0. `tgt_lbl` is resized to fg + bg entries: ones first, then zeros.
template <typename T>
void ScoreAssign(const T* anchor_by_gt_overlap_data,
const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max,
const int rpn_batch_size_per_im, const float rpn_fg_fraction,
const float rpn_positive_overlap,
const float rpn_negative_overlap, std::vector<int>* fg_inds,
std::vector<int>* bg_inds, std::vector<int>* tgt_lbl,
std::minstd_rand engine, bool use_random) {
float epsilon = 0.00001;
int anchor_num = anchor_to_gt_max.dims()[0];
int gt_num = gt_to_anchor_max.dims()[0];
// -1 marks an anchor that is neither foreground nor background.
std::vector<int> target_label(anchor_num, -1);
std::vector<int> fg_inds_fake;
std::vector<int> bg_inds_fake;
const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
const T* gt_to_anchor_max_data = gt_to_anchor_max.data<T>();
// TODO(buxingyuan): This matches Detectron for now, but the two-direction
// assignment looks buggy: the later assignment may overwrite the earlier
// one.
for (int64_t i = 0; i < anchor_num; ++i) {
bool is_anchors_with_max_overlap = false;
// Does anchor i reach (within epsilon) the best overlap of any gt box?
for (int64_t j = 0; j < gt_num; ++j) {
T value = anchor_by_gt_overlap_data[i * gt_num + j];
T diff = std::abs(value - gt_to_anchor_max_data[j]);
if (diff < epsilon) {
is_anchors_with_max_overlap = true;
break;
}
}
bool is_anchor_great_than_thresh =
(anchor_to_gt_max_data[i] >= rpn_positive_overlap);
if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) {
fg_inds_fake.push_back(i);
}
}
// NOTE(review): the lines from here down to `int col = dist.dims()[1];` look
// like removed-side residue of an older SampleFgBgGt member left in by the
// diff rendering; they do not appear to belong to ScoreAssign. Confirm
// against the real file.
// std::vector<std::vector<int>> RpnTargetAssign(
std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& ctx, const Tensor& dist,
const float pos_threshold, const float neg_threshold,
const int rpn_batch_size, const int fg_num, std::minstd_rand engine,
int64_t* target_label) const {
auto* dist_data = dist.data<T>();
int row = dist.dims()[0];
int col = dist.dims()[1];
// Reservoir Sampling
int fg_num = static_cast<int>(rpn_fg_fraction * rpn_batch_size_per_im);
ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random);
// The sampler may return fewer than requested; use the actual count.
fg_num = static_cast<int>(fg_inds_fake.size());
for (int64_t i = 0; i < fg_num; ++i) {
target_label[fg_inds_fake[i]] = 1;
}
// Background fills whatever part of the per-image batch foreground left.
int bg_num = rpn_batch_size_per_im - fg_num;
for (int64_t i = 0; i < anchor_num; ++i) {
if (anchor_to_gt_max_data[i] < rpn_negative_overlap) {
bg_inds_fake.push_back(i);
}
}
ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random);
bg_num = static_cast<int>(bg_inds_fake.size());
// An anchor already labelled 1 above is overwritten with 0 here if it was
// also sampled as background.
for (int64_t i = 0; i < bg_num; ++i) {
target_label[bg_inds_fake[i]] = 0;
}
// Read the final labels back in ascending anchor order.
for (int64_t i = 0; i < anchor_num; ++i) {
if (target_label[i] == 1) fg_inds->emplace_back(i);
if (target_label[i] == 0) bg_inds->emplace_back(i);
}
fg_num = fg_inds->size();
bg_num = bg_inds->size();
tgt_lbl->resize(fg_num + bg_num, 0);
std::vector<int> fg_lbl(fg_num, 1);
std::vector<int> bg_lbl(bg_num, 0);
std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data());
std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num);
}
// Samples foreground and background anchors for one image from an
// anchor-by-gt overlap matrix and packs the result into tensors.
//
// `anchor_by_gt_overlap` is read as {anchor_num, gt_num}. The function reduces
// it to a per-anchor maximum and argmax (over dimension 1) and a per-gt
// maximum (over dimension 0), then calls ScoreAssign to pick the indices.
//
// Returns four tensors, in this order:
//   [0] int {fg_num}          foreground anchor indices
//   [1] int {fg_num + bg_num} foreground indices followed by background ones
//   [2] int {fg_num + bg_num} the labels written by ScoreAssign
//   [3] int {fg_num}          argmax gt index of each foreground anchor
//
// NOTE(review): this span is interleaved with removed-side lines from the
// diff rendering. They are the lines using `col`, `dist`, `x`, `x_col_max`,
// `x_col_argmax`, `fg_num2`, `rpn_batch_size`, `fg_bg_gt`, and the old
// ScoreAssign / ReservoirSampling calls. They do not appear to belong to the
// new function. Confirm against the real file.
template <typename T>
std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
const Tensor& anchor_by_gt_overlap,
const int rpn_batch_size_per_im,
const float rpn_positive_overlap,
const float rpn_negative_overlap,
const float rpn_fg_fraction,
std::minstd_rand engine, bool use_random) {
auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data<T>();
int anchor_num = anchor_by_gt_overlap.dims()[0];
int gt_num = anchor_by_gt_overlap.dims()[1];
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::vector<int> gt_inds;
std::vector<int> tgt_lbl;
// Calculate the max IoU between anchors and gt boxes
// Map from anchor to gt box that has highest overlap
auto place = ctx.GetPlace();
Tensor anchor_to_gt_max, anchor_to_gt_argmax;
anchor_to_gt_max.mutable_data<T>({col}, place);
int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
auto x = framework::EigenMatrix<T>::From(dist);
auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
auto x_col_argmax =
Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
anchor_to_gt_max.mutable_data<T>({anchor_num}, place);
int* argmax = anchor_to_gt_argmax.mutable_data<int>({anchor_num}, place);
gt_to_anchor_max.mutable_data<T>({gt_num}, place);
auto anchor_by_gt_overlap_et =
framework::EigenMatrix<T>::From(anchor_by_gt_overlap);
auto anchor_to_gt_max_et =
framework::EigenVector<T>::Flatten(anchor_to_gt_max);
auto gt_to_anchor_max_et =
framework::EigenVector<T>::Flatten(gt_to_anchor_max);
auto anchor_to_gt_argmax_et =
framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
x_col_argmax = x.argmax(0).template cast<int>();
// Reduce over dimension 1 (gt boxes): one max and one argmax per anchor.
anchor_to_gt_max_et =
anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(1));
anchor_to_gt_argmax_et =
anchor_by_gt_overlap_et.argmax(1).template cast<int>();
// Reduce over dimension 0 (anchors): one max per gt box.
gt_to_anchor_max_et =
anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(0));
// Follow the Faster RCNN's implementation
ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
neg_threshold, target_label, &fg_inds, &bg_inds);
// Reservoir Sampling
ReservoirSampling(fg_num, engine, &fg_inds);
int fg_num2 = static_cast<int>(fg_inds.size());
int bg_num = rpn_batch_size - fg_num2;
ReservoirSampling(bg_num, engine, &bg_inds);
gt_inds.reserve(fg_num2);
for (int i = 0; i < fg_num2; ++i) {
ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max,
rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap,
rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine,
use_random);
int fg_num = fg_inds.size();
int bg_num = bg_inds.size();
// For every foreground anchor, record the gt box it overlaps most.
gt_inds.reserve(fg_num);
for (int i = 0; i < fg_num; ++i) {
gt_inds.emplace_back(argmax[fg_inds[i]]);
}
std::vector<std::vector<int>> fg_bg_gt;
fg_bg_gt.emplace_back(fg_inds);
fg_bg_gt.emplace_back(bg_inds);
fg_bg_gt.emplace_back(gt_inds);
return fg_bg_gt;
// Pack the index vectors into tensors for the caller.
Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t;
int* loc_index_data = loc_index_t.mutable_data<int>({fg_num}, place);
int* score_index_data =
score_index_t.mutable_data<int>({fg_num + bg_num}, place);
int* tgt_lbl_data = tgt_lbl_t.mutable_data<int>({fg_num + bg_num}, place);
int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_num}, place);
std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data);
// The score index is the foreground indices followed by the background ones.
std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);
std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num);
std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data);
std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data);
std::vector<Tensor> loc_score_tgtlbl_gt;
loc_score_tgtlbl_gt.emplace_back(loc_index_t);
loc_score_tgtlbl_gt.emplace_back(score_index_t);
loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
loc_score_tgtlbl_gt.emplace_back(gt_inds_t);
return loc_score_tgtlbl_gt;
}
template <typename T>
class RpnTargetAssignKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* anchor = context.Input<Tensor>("Anchor"); // (H*W*A) * 4
auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
auto* im_info = context.Input<LoDTensor>("ImInfo");
auto* loc_index = context.Output<LoDTensor>("LocationIndex");
auto* score_index = context.Output<LoDTensor>("ScoreIndex");
auto* tgt_bbox = context.Output<LoDTensor>("TargetBBox");
auto* tgt_lbl = context.Output<LoDTensor>("TargetLabel");
PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
"RpnTargetAssignOp gt_boxes needs 1 level of LoD");
PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
"RpnTargetAssignOp is_crowd needs 1 level of LoD");
int64_t anchor_num = static_cast<int64_t>(anchor->dims()[0]);
int64_t batch_num = static_cast<int64_t>(gt_boxes->lod().back().size() - 1);
int rpn_batch_size_per_im = context.Attr<int>("rpn_batch_size_per_im");
float rpn_straddle_thresh = context.Attr<float>("rpn_straddle_thresh");
float rpn_positive_overlap = context.Attr<float>("rpn_positive_overlap");
float rpn_negative_overlap = context.Attr<float>("rpn_negative_overlap");
float rpn_fg_fraction = context.Attr<float>("rpn_fg_fraction");
bool use_random = context.Attr<bool>("use_random");
int64_t max_num = batch_num * rpn_batch_size_per_im;
auto place = context.GetPlace();
loc_index->mutable_data<int>({max_num}, place);
score_index->mutable_data<int>({max_num}, place);
tgt_bbox->mutable_data<T>({max_num, 4}, place);
tgt_lbl->mutable_data<int>({max_num, 1}, place);
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
std::random_device rnd;
std::minstd_rand engine;
int seed = rnd();
engine.seed(seed);
framework::LoD lod_loc, loc_score;
std::vector<size_t> lod0_loc(1, 0);
std::vector<size_t> lod0_score(1, 0);
int total_loc_num = 0;
int total_score_num = 0;
auto gt_boxes_lod = gt_boxes->lod().back();
auto is_crowd_lod = is_crowd->lod().back();
for (int i = 0; i < batch_num; ++i) {
Tensor gt_boxes_slice =
gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
Tensor is_crowd_slice =
is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
Tensor im_info_slice = im_info->Slice(i, i + 1);
auto* im_info_data = im_info_slice.data<T>();
auto im_height = im_info_data[0];
auto im_width = im_info_data[1];
auto im_scale = im_info_data[2];
// Filter straddle anchor
std::vector<Tensor> filter_output = FilterStraddleAnchor<T>(
dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width);
Tensor inds_inside = filter_output[0];
Tensor inside_anchor = filter_output[1];
// Filter crowd gt
Tensor ncrowd_gt_boxes =
FilterCrowdGt<T>(dev_ctx, &gt_boxes_slice, &is_crowd_slice);
auto ncrowd_gt_boxes_et =
framework::EigenTensor<T, 2>::From(ncrowd_gt_boxes);
ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale;
Tensor anchor_by_gt_overlap;
anchor_by_gt_overlap.mutable_data<T>(
{inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place);
BboxOverlaps<T>(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap);
auto loc_score_tgtlbl_gt = SampleRpnFgBgGt<T>(
dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im,
rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine,
use_random);
Tensor sampled_loc_index = loc_score_tgtlbl_gt[0];
Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
int loc_num = sampled_loc_index.dims()[0];
int score_num = sampled_score_index.dims()[0];
// unmap to all anchor
Tensor sampled_loc_index_unmap, sampled_score_index_unmap;
sampled_loc_index_unmap.mutable_data<int>({loc_num}, place);
sampled_score_index_unmap.mutable_data<int>({score_num}, place);
Gather<int>(inds_inside.data<int>(), 1, sampled_loc_index.data<int>(),
loc_num, sampled_loc_index_unmap.data<int>());
Gather<int>(inds_inside.data<int>(), 1, sampled_score_index.data<int>(),
score_num, sampled_score_index_unmap.data<int>());
// get target bbox deltas
Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
auto* sampled_anchor_data =
sampled_anchor.mutable_data<T>({loc_num, 4}, place);
auto* sampled_gt_data = sampled_gt.mutable_data<T>({loc_num, 4}, place);
Gather<T>(anchor->data<T>(), 4, sampled_loc_index_unmap.data<int>(),
loc_num, sampled_anchor_data);
Gather<T>(ncrowd_gt_boxes.data<T>(), 4, sampled_gt_index.data<int>(),
loc_num, sampled_gt_data);
sampled_tgt_bbox.mutable_data<T>({loc_num, 4}, place);
BoxToDelta<T>(loc_num, sampled_anchor, sampled_gt, nullptr, false,
&sampled_tgt_bbox);
// Add anchor offset
int anchor_offset = i * anchor_num;
auto sampled_loc_index_unmap_et =
framework::EigenTensor<int, 1>::From(sampled_loc_index_unmap);
sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset;
auto sampled_score_index_unmap_et =
framework::EigenTensor<int, 1>::From(sampled_score_index_unmap);
sampled_score_index_unmap_et =
sampled_score_index_unmap_et + anchor_offset;
AppendRpns<int>(loc_index, total_loc_num, &sampled_loc_index_unmap);
AppendRpns<int>(score_index, total_score_num, &sampled_score_index_unmap);
AppendRpns<T>(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox);
AppendRpns<int>(tgt_lbl, total_score_num, &sampled_tgtlbl);
total_loc_num += loc_num;
total_score_num += score_num;
lod0_loc.emplace_back(total_loc_num);
lod0_score.emplace_back(total_score_num);
}
PADDLE_ENFORCE_LE(total_loc_num, max_num);
PADDLE_ENFORCE_LE(total_score_num, max_num);
lod_loc.emplace_back(lod0_loc);
loc_score.emplace_back(lod0_score);
loc_index->set_lod(lod_loc);
score_index->set_lod(loc_score);
tgt_bbox->set_lod(lod_loc);
tgt_lbl->set_lod(loc_score);
loc_index->Resize({total_loc_num});
score_index->Resize({total_score_num});
tgt_bbox->Resize({total_loc_num, 4});
tgt_lbl->Resize({total_score_num, 1});
}
};
......@@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddInput("Anchor",
"(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
AddInput(
"DistMat",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
"[K, M]. It is pair-wise distance matrix between the entities "
"represented by each row and each column. For example, assumed one "
"entity is A with shape [K], another entity is B with shape [M]. The "
"DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
"the distance is, the better macthing the pairs are. Please note, "
"This tensor can contain LoD information to represent a batch of "
"inputs. One instance of this batch can contain different numbers of "
"entities.");
AddInput("GtBoxes",
"(LoDTensor) input groud-truth bbox with shape [K, 4].");
AddInput("IsCrowd",
"(LoDTensor) input which indicates groud-truth is crowd.");
AddInput("ImInfo",
"(LoDTensor) input image information with shape [N, 3]. "
"N is the batch size, each image information includes height, "
"width and scale.");
AddAttr<int>("rpn_batch_size_per_im",
"Total number of RPN examples per image.")
.SetDefault(256);
AddAttr<float>(
"rpn_straddle_thresh",
"Remove RPN anchors that go outside the image by straddle_thresh "
"pixels, "
"Set to -1 or a large value, e.g. 100000, to disable pruning anchors.");
AddAttr<float>(
"rpn_positive_overlap",
"Minimum overlap required between an anchor and ground-truth "
......@@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"box for the (anchor, gt box) pair to be a negative examples.")
.SetDefault(0.3);
AddAttr<float>(
"fg_fraction",
"rpn_fg_fraction",
"Target fraction of RoI minibatch that "
"is labeled foreground (i.e. class > 0), 0-th class is background.")
.SetDefault(0.25);
AddAttr<int>("rpn_batch_size_per_im",
"Total number of RPN examples per image.")
.SetDefault(256);
AddAttr<bool>("fix_seed",
"A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in "
"training. Setting this flag to true is only useful in "
"unittest.")
.SetDefault(false);
AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
AddAttr<bool>("use_random",
"A flag indicating whether to use a ReservoirSampling. "
"NOTE: DO NOT set this flag to false in training. "
"Setting this flag to false is only useful in unittest.")
.SetDefault(true);
AddOutput(
"LocationIndex",
"(Tensor), The indexes of foreground anchors in all RPN anchors, the "
......@@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
" number.");
AddOutput("TargetBBox",
"(Tensor<int64_t>), The target bbox deltas with shape "
"(Tensor), The target bbox deltas with shape "
"[F, 4], F is the sampled foreground number.");
AddOutput(
"TargetLabel",
"(Tensor<int64_t>), The target labels of each anchor with shape "
"(Tensor<int>), The target labels of each anchor with shape "
"[F + B, 1], F and B are sampled foreground and backgroud number.");
AddComment(R"DOC(
This operator can be, for given the IoU between the ground truth bboxes and the
This operator can be, for a given set of ground truth bboxes and the
anchors, to assign classification and regression targets to each prediction.
The Score index and LocationIndex will be generated according to the DistMat.
The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU.
The rest anchors would not contibute to the RPN training loss
ScoreIndex is composed of foreground anchor indexes(positive labels) and
......
......@@ -125,7 +125,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
framework::AsyncIO([var_name_val, s, this] {
// prepare input
sendrecv::VariableMessage req;
req.set_varname(var_name_val);
......@@ -166,7 +166,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
s->Prepare(h, time_out);
framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
time_out, s, this] {
s, this] {
auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req;
......@@ -290,13 +290,19 @@ void GRPCClient::Proceed() {
c->Finish(false);
}
delete c;
bool notify = false;
{
std::lock_guard<std::mutex> lk(sync_mutex_);
req_count_--;
notify = (req_count_ <= 0 || !c->status_.ok());
}
delete c;
if (notify) {
sync_cond_.notify_all();
}
}
VLOG(3) << "GRPCClient Proceed end";
}
......
......@@ -82,8 +82,10 @@ class ProtoEncodeHelper {
: base_(buf), p_(buf), limit_(base_ + max_size) {}
~ProtoEncodeHelper() {
#define REPLACE_ENFORCE_GLOG 1
// Make sure callers didn't do operations that went over max_size promised
PADDLE_ENFORCE_LE(p_, limit_);
paddle::platform::throw_on_error(p_ <= limit_);
#undef REPLACE_ENFORCE_GLOG
}
const char* data() const { return base_; }
......
......@@ -56,7 +56,7 @@ class VarHandle {
const std::string& name,
const platform::DeviceContext* p_ctx = nullptr,
const framework::Scope* p_scope = nullptr)
: ok_(kVarHandleDefaultState) {
: status_(kDefaultState) {
ep_ = ep;
ctx_ = p_ctx;
scope_ = p_scope;
......@@ -68,18 +68,20 @@ class VarHandle {
public:
bool Wait() {
int ret = kDefaultState;
{
std::unique_lock<std::mutex> lk(sync_mutex_);
wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; });
wait_cond_.wait(lk, [this] { return status_ != kDefaultState; });
ret = status_;
}
VLOG(7) << "VarHandle wait:" << ok_;
return ok_ != 0;
VLOG(7) << "VarHandle wait:" << ret;
return ret != kErrorState;
}
void Finish(bool ok) {
{
std::unique_lock<std::mutex> lk(sync_mutex_);
ok_ = ok;
status_ = ok ? kFinishState : kErrorState;
}
VLOG(7) << "VarHandle finish:" << ok;
wait_cond_.notify_all();
......@@ -87,8 +89,8 @@ class VarHandle {
std::string String() const {
std::ostringstream s;
s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_
<< "]";
s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:["
<< status_ << "]";
return s.str();
}
......@@ -111,9 +113,13 @@ class VarHandle {
protected:
std::mutex sync_mutex_;
std::condition_variable wait_cond_;
int ok_;
static const int kVarHandleDefaultState = -1;
enum VarHandleStatus {
kDefaultState = -1,
kErrorState = 0,
kFinishState = 1,
};
VarHandleStatus status_;
private:
DISABLE_COPY_AND_ASSIGN(VarHandle);
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
......@@ -269,7 +270,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
wh_data, D4, static_cast<T>(1), out, D4)
// gates: W_ch, W_ih, W_fh, W_oh
#define GET_Ct(ct_1, gates, ct) \
/* C_t = C_t-1 * fgated + cand_gated * igated*/ \
act_cand(D, gates, gates); \
......@@ -395,11 +395,22 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
}
}
} else {
// TODO(TJ): ugly workaround, clean me
std::function<void(T*, const T*, T*, T*)> compute_ctht;
if (platform::jit::MayIUse(platform::jit::avx) &&
act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
act_cell_str == "tanh" && D == 8) {
compute_ctht = math::lstm_compute_ctht<T>;
} else {
compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
COMPUTE_CtHt(gates, ct_1, ct, ht);
};
}
for (int i = 0; i < N; ++i) {
PROCESS_H0C0
for (int step = tstart; step < seq_len; ++step) {
GEMM_WH_ADDON(1, prev_h_data, xx_data);
COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data);
MOVE_ONE_STEP;
}
}
......@@ -532,12 +543,23 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
MOVE_ONE_STEP;
}
} else {
// TODO(TJ): ugly workaround, clean me
std::function<void(T*, const T*, T*, T*)> compute_ctht;
if (platform::jit::MayIUse(platform::jit::avx) &&
act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
act_cell_str == "tanh" && D == 8) {
compute_ctht = math::lstm_compute_ctht<T>;
} else {
compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
COMPUTE_CtHt(gates, ct_1, ct, ht);
};
}
for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step];
GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
DEFINE_CUR;
for (int i = 0; i < cur_bs; ++i) {
COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data,
cur_h_out_data);
MOVE_ONE_BATCH;
}
......
......@@ -59,8 +59,7 @@ static void ParallelExecuteBlocks(
framework::ProgramDesc *program, framework::Scope *scope) {
std::vector<std::future<void>> fs;
for (size_t idx : parallel_blkids) {
fs.push_back(
framework::Async([&executor, &prepared, &program, &scope, idx]() {
fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
int run_block = idx; // thread local
try {
VLOG(3) << "running server block: " << run_block
......
......@@ -45,6 +45,8 @@ math_library(im2col)
if (NOT WIN32) # windows do not support avx functions yet.
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
# TODO(TJ): ugly workaround, clean me
cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info)
endif (NOT WIN32)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
namespace paddle {
namespace operators {
namespace math {} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
// TODO(TJ): ugly workaround, clean me
// Fused LSTM cell update for a fixed hidden width of 8.
//
// `gates` holds four consecutive groups of 8 values in the order
// W_ch (candidate), W_ih (input), W_fh (forget), W_oh (output) and is
// activated in place. `ct_1` is the previous cell state; the new cell state
// is written to `ct` and the new hidden state to `ht` (8 values each).
template <typename T>
void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
  const int kWidth = 8;
  T* const cand = gates;
  T* const sig_gates = gates + kWidth;

  // The input/forget/output gates are contiguous, so one sigmoid call covers
  // all three groups; the candidate group gets tanh.
  vec_sigmoid<T, platform::jit::avx>(3 * kWidth, sig_gates, sig_gates);
  vec_tanh<T, platform::jit::avx>(kWidth, cand, cand);

  const T* in_gate = sig_gates;
  const T* forget_gate = sig_gates + kWidth;
  const T* out_gate = sig_gates + 2 * kWidth;
  const T lower = SIGMOID_THRESHOLD_MIN;
  const T upper = SIGMOID_THRESHOLD_MAX;

  for (int k = 0; k < kWidth; ++k) {
    // C_t = C_t-1 * fgated + cand_gated * igated
    ct[k] = ct_1[k] * forget_gate[k] + cand[k] * in_gate[k];

    // H_t = act_cell(C_t) * ogated, with the cell activation evaluated as
    // 2 / (1 + exp(-2x)) - 1 after clamping 2x to the sigmoid thresholds.
    T scaled = ct[k] * 2;
    if (scaled < lower) {
      scaled = lower;
    } else if (scaled > upper) {
      scaled = upper;
    }
    T act = static_cast<T>(0) - scaled;
    vec_exp<T>(1, &act, &act);
    act = static_cast<T>(2) / (static_cast<T>(1) + act) - static_cast<T>(1);
    ht[k] = act * out_gate[k];
  }
}
#ifdef __AVX__
// Forward declarations of the 8-lane AVX activation helpers used by the
// float specialization of lstm_compute_ctht below. Only the declarations are
// visible in this header.
// NOTE(review): the definitions presumably come from the activation_functions
// library this target links against -- confirm.
namespace detail {
namespace forward {
namespace avx {
// Element-wise sigmoid over one __m256 register.
__m256 Sigmoid(const __m256 a);
// Element-wise tanh over one __m256 register.
__m256 Tanh(const __m256 a);
} // namespace avx
} // namespace forward
} // namespace detail
// AVX specialization of the fused LSTM cell update for float and a hidden
// width of 8 (exactly one __m256 register per gate group).
//
// `gates` holds four consecutive groups of 8 floats in the order
// W_ch (candidate), W_ih (input), W_fh (forget), W_oh (output). `ct_1` is the
// previous cell state; the new cell state is stored to `ct` and the new
// hidden state to `ht`. Unlike the generic template, `gates` is only read.
//
// Declared `inline` because this is a full specialization defined in a
// header: without it every translation unit that includes the header emits
// its own external definition, and linking them together fails with a
// multiple-definition error.
template <>
inline void lstm_compute_ctht<float>(float* gates, const float* ct_1,
                                     float* ct, float* ht) {
  namespace act = detail::forward::avx;
  // gates: W_ch, W_ih, W_fh, W_oh
  __m256 c, i, f, o;
  c = _mm256_loadu_ps(gates);
  i = _mm256_loadu_ps(gates + 8);
  f = _mm256_loadu_ps(gates + 16);
  o = _mm256_loadu_ps(gates + 24);

  /* C_t = C_t-1 * fgated + cand_gated * igated*/
  c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
  // `i` is no longer needed as the input gate; reuse it for C_t-1.
  i = _mm256_loadu_ps(ct_1);
  f = _mm256_mul_ps(i, act::Sigmoid(f));
  f = _mm256_add_ps(c, f);
  _mm256_storeu_ps(ct, f);

  /* H_t = act_cell(C_t) * ogated */
  o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
  _mm256_storeu_ps(ht, o);
}
#endif
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <functional>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
......@@ -476,7 +477,7 @@ class VecActivations {
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
LOG(FATAL) << "Not support type: " << type;
PADDLE_THROW("Not support type: %s", type);
}
};
......
......@@ -103,6 +103,58 @@ class MaxSeqPoolGradFunctor {
}
};
// CPU functor for "LAST" sequence pooling: for every level-0 sequence in
// `input`, copies that sequence's final row into the next row of `output`.
template <typename T>
class LastSeqPoolFunctor {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor& input,
                  framework::Tensor* output) {
    const T* src = input.data<T>();
    T* dst = output->data<T>();

    // Number of T elements in one row (item) of the input.
    const int64_t row_width = input.numel() / input.dims()[0];
    auto offsets = input.lod()[0];
    const int num_seqs = static_cast<int>(offsets.size()) - 1;

    for (int s = 0; s < num_seqs; ++s) {
      const int64_t rows = static_cast<int64_t>(offsets[s + 1] - offsets[s]);
      // Advance to the start of the next sequence; the row immediately
      // before that position is the last row of sequence `s`.
      src += rows * row_width;
      std::memcpy(dst, src - row_width, row_width * sizeof(T));
      dst += row_width;
    }
  }
};
// CPU functor for "FIRST" sequence pooling: for every level-0 sequence in
// `input`, copies that sequence's first row into the next row of `output`.
template <typename T>
class FirstSeqPoolFunctor {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor& input,
                  framework::Tensor* output) {
    const T* src = input.data<T>();
    T* dst = output->data<T>();

    // Number of T elements in one row (item) of the input.
    const int64_t row_width = input.numel() / input.dims()[0];
    auto offsets = input.lod()[0];
    const int num_seqs = static_cast<int>(offsets.size()) - 1;

    for (int s = 0; s < num_seqs; ++s) {
      // `src` currently points at the first row of sequence `s`.
      std::memcpy(dst, src, row_width * sizeof(T));
      const int64_t rows = static_cast<int64_t>(offsets[s + 1] - offsets[s]);
      // Skip the whole sequence to reach the start of the next one.
      src += rows * row_width;
      dst += row_width;
    }
  }
};
template <typename T>
class SequencePoolFunctor<platform::CPUDeviceContext, T> {
public:
......@@ -116,6 +168,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
max_pool(context, input, output, index);
return;
}
if (pooltype == "LAST") {
math::LastSeqPoolFunctor<T> last_pool;
last_pool(context, input, output);
return;
}
if (pooltype == "FIRST") {
math::FirstSeqPoolFunctor<T> first_pool;
first_pool(context, input, output);
return;
}
auto lod = input.lod()[0];
auto& place = *context.eigen_device();
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
......@@ -133,10 +195,6 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
} else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h));
} else if (pooltype == "LAST") {
out_e.device(place) = in_e.chip(h - 1, 0);
} else if (pooltype == "FIRST") {
out_e.device(place) = in_e.chip(0, 0);
} else {
PADDLE_THROW("unsupported pooling pooltype");
}
......
......@@ -71,8 +71,7 @@ class MaxOutOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of MaxoutOp"
"should not be null.");
"Input(X) of MaxoutOpshould not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of MaxoutOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
......@@ -90,9 +89,10 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of MaxOutOpGrad must not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Input(X@GRAD) should not be null.");
"Output(Grad@X) of MaxOutOpGrad should not be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
......
......@@ -26,10 +26,13 @@ class PReluOp : public framework::OperatorWithKernel {
std::string mode = ctx->Attrs().Get<std::string>("mode");
auto x_dim = ctx->GetInputDim("X");
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of PreluOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("Alpha"),
"Input(Alpha) of PreluOp should not be null");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of PreluOp should not be null");
if (mode == "all") {
PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
"For mode 'all', size of weight Alpha must be one.");
......
......@@ -42,7 +42,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
auto &mem_tensor = mem_var->Get<framework::LoDTensor>();
out_tensor->ShareDataWith(mem_tensor);
framework::TensorCopySync(mem_tensor, dev_place, out_tensor);
out_tensor->set_lod(mem_tensor.lod());
}
};
......@@ -50,8 +50,10 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "");
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of rnn_memory_helper op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output of rnn_memory_helper op should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
......@@ -107,7 +109,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
} else {
auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
in_grad_tensor->ShareDataWith(out_grad_tensor);
framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor);
in_grad_tensor->set_lod(out_grad_tensor.lod());
}
}
......@@ -133,8 +135,11 @@ class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
auto x_grad_name = framework::GradVarName("X");
PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), "");
PADDLE_ENFORCE(ctx->HasInput("X"), "");
PADDLE_ENFORCE(ctx->HasOutput(x_grad_name),
"Gradient of Input(X) in rnn_memory_helper_grad of should "
"not be null.");
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of rnn_memory_helper_grad of should not be null.");
ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ x_grad_name);
}
......
......@@ -25,7 +25,7 @@ class SliceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input (Input) of slice op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
......@@ -58,7 +58,7 @@ class SliceOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
ctx.GetPlace());
......@@ -119,15 +119,54 @@ Following examples will explain how slice works:
}
};
// Shape inference for the gradient of the slice op: when Input@GRAD is
// requested, it gets the same dimensions as the forward Input.
class SliceOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    // Both the forward input and the upstream gradient are required.
    PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");

    const auto input_dims = ctx->GetInputDim("Input");
    const auto input_grad_name = framework::GradVarName("Input");
    // The gradient output is optional; nothing to infer when it is absent.
    if (!ctx->HasOutput(input_grad_name)) {
      return;
    }
    ctx->SetOutputDim(input_grad_name, input_dims);
  }
};
// Builds the description of the "slice_grad" op from the forward slice op:
// it consumes Input and Out@GRAD, produces Input@GRAD, and inherits all
// attributes of the forward op.
class SliceOpGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    // Own the OpDesc from the moment it is allocated so it is released if
    // any of the setters below throws.
    std::unique_ptr<framework::OpDesc> bind(new framework::OpDesc());
    bind->SetInput("Input", Input("Input"));
    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    bind->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
    bind->SetAttrMap(Attrs());
    bind->SetType("slice_grad");
    return bind;
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker,
paddle::framework::EmptyGradOpMaker);
ops::SliceOpGradMaker);
REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad);
REGISTER_OP_CPU_KERNEL(
slice, ops::SliceKernel<paddle::platform::CPUDeviceContext, int>,
ops::SliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::SliceKernel<paddle::platform::CPUDeviceContext, float>,
ops::SliceKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
slice_grad, ops::SliceGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::SliceGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::SliceGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SliceGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -20,3 +20,10 @@ REGISTER_OP_CUDA_KERNEL(
ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
slice_grad,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
......@@ -84,5 +85,79 @@ class SliceKernel : public framework::OpKernel<T> {
out_t.device(place) = in_t.slice(offsets, extents);
}
};
// Gradient kernel of the slice op. Input@GRAD is Out@GRAD padded with zeros
// back to the dimensions of Input@GRAD, with the leading padding on each
// sliced axis equal to that axis' (normalized) start offset.
template <typename DeviceContext, typename T>
class SliceGradKernel : public framework::OpKernel<T> {
 public:
  // Dispatches on the rank of Out@GRAD, since the Eigen tensor rank must be a
  // compile-time constant. Ranks outside 1..6 match no case, so nothing is
  // computed for them.
  void Compute(const framework::ExecutionContext& ctx) const override {
    size_t rank = ctx.Input<framework::Tensor>(framework::GradVarName("Out"))
                      ->dims()
                      .size();
    switch (rank) {
      case 1:
        SliceCompute<1>(ctx);
        break;
      case 2:
        SliceCompute<2>(ctx);
        break;
      case 3:
        SliceCompute<3>(ctx);
        break;
      case 4:
        SliceCompute<4>(ctx);
        break;
      case 5:
        SliceCompute<5>(ctx);
        break;
      case 6:
        SliceCompute<6>(ctx);
        break;
    }
  }

 private:
  // Rank-D implementation: reads the "axes" and "starts" attributes, derives
  // per-dimension paddings and writes pad(Out@GRAD, 0) into Input@GRAD.
  template <size_t D>
  void SliceCompute(const framework::ExecutionContext& context) const {
    auto& place =
        *context.template device_context<DeviceContext>().eigen_device();
    auto* d_out =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* d_input =
        context.Output<framework::Tensor>(framework::GradVarName("Input"));
    d_input->mutable_data<T>(context.GetPlace());
    auto out_dims = d_out->dims();
    auto in_dims = d_input->dims();
    auto axes = context.Attr<std::vector<int>>("axes");
    auto starts = context.Attr<std::vector<int>>("starts");

    // Axes that are not sliced keep a zero offset.
    auto offsets = Eigen::array<int, D>();
    for (size_t i = 0; i < D; ++i) {
      offsets[i] = 0;
    }
    for (size_t i = 0; i < axes.size(); ++i) {
      int start = starts[i];
      // A negative start counts from the end of the axis.
      if (start < 0) {
        start = (start + in_dims[axes[i]]);
      }
      start = std::max(start, 0);
      offsets[axes[i]] = start;
    }

    // Leading padding is the slice offset; trailing padding is whatever is
    // left of the size difference between Input@GRAD and Out@GRAD.
    Eigen::array<std::pair<int, int>, D> paddings;
    for (size_t i = 0; i < paddings.size(); ++i) {
      paddings[i].first = offsets[i];
      paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i];
    }
    auto d_in_t =
        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
            *d_input);
    auto d_out_t =
        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
            *d_out);
    d_in_t.device(place) = d_out_t.pad(paddings, 0);
  }
};
} // namespace operators
} // namespace paddle
......@@ -160,11 +160,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
fluid_t->mutable_data<float>(platform::CUDAPlace(
boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
size * sizeof(float));
//} else {
// engine->GetOutputInGPU(
// y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
// size * sizeof(float));
//}
output_index += 1;
}
......
......@@ -63,7 +63,7 @@ class WhileOp : public framework::OperatorBase {
while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope);
executor.RunPreparedContext(ctx.get(), &current_scope, false);
executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
if (is_test) {
scope.DeleteScope(&current_scope);
}
......@@ -169,7 +169,8 @@ class WhileGradOp : public framework::OperatorBase {
}
}
}
executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false);
executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true,
true);
auto &pg_names = Outputs(kXGRAD);
auto &p_names = Inputs(kX);
......
......@@ -51,7 +51,7 @@ ENDIF()
# memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
......
......@@ -210,11 +210,14 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
if (dynload::HasCUDNN()) {
cudnn_holder_.reset(new CudnnHolder(&stream_, place));
}
callback_manager_.reset(new StreamCallbackManager(stream_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
Wait();
WaitStreamCallback();
PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
eigen_stream_.reset();
eigen_device_.reset();
......
......@@ -31,6 +31,9 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/stream_callback_manager.h"
#endif
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
......@@ -112,6 +115,17 @@ class CUDADeviceContext : public DeviceContext {
PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
}
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
std::lock_guard<std::mutex> guard(callback_mtx_);
callback_manager_->AddCallback(callback);
}
void WaitStreamCallback() const {
std::lock_guard<std::mutex> guard(callback_mtx_);
callback_manager_->Wait();
}
private:
CUDAPlace place_;
......@@ -125,7 +139,12 @@ class CUDADeviceContext : public DeviceContext {
int multi_process;
int max_threads_per_mp;
std::mutex mtx_;
mutable std::mutex mtx_;
// This lock is only used by callback
// If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
mutable std::mutex callback_mtx_;
std::unique_ptr<StreamCallbackManager> callback_manager_;
};
template <>
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <functional>
#include <memory>
#include "ThreadPool.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
using StreamCallback = std::function<void(cudaStream_t, cudaError_t)>;
class StreamCallbackManager;
// Payload passed through cudaStreamAddCallback's user-data pointer: the
// callback to run together with the manager that registered it.
struct StreamCallbackContext {
  const StreamCallbackManager *manager_;  // do not own
  StreamCallback callback_;               // wrapped as std::function

  // Stores `manager` as-is and converts `callback` into a StreamCallback.
  template <typename Callback>
  StreamCallbackContext(const StreamCallbackManager *manager,
                        Callback &&callback)
      : manager_(manager), callback_(callback) {}
};
class StreamCallbackManager {
public:
explicit inline StreamCallbackManager(cudaStream_t stream = nullptr)
: stream_(stream), thread_pool_(new ThreadPool(1)) {}
template <typename Callback>
inline void AddCallback(Callback &&callback) const {
AddCallbackWithStreamAndErrorInfo(
[=](cudaStream_t, cudaError_t) { callback(); });
}
template <typename Callback>
inline void AddCallbackWithStreamAndErrorInfo(Callback &&callback) const {
auto *stream_callback_context = new StreamCallbackContext(this, callback);
PADDLE_ENFORCE(cudaStreamAddCallback(
stream_, StreamCallbackManager::StreamCallbackFunc,
stream_callback_context, 0));
}
void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
private:
const cudaStream_t stream_;
mutable std::unique_ptr<ThreadPool> thread_pool_;
// cudaStreamCallback cannot call CUDA API inside, so we have to use
// thread_pool here
static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
cudaError_t status,
void *user_data) {
auto *callback_context_ptr =
reinterpret_cast<StreamCallbackContext *>(user_data);
callback_context_ptr->manager_->thread_pool_->enqueue([=]() {
std::unique_ptr<StreamCallbackContext> callback_context(
callback_context_ptr);
callback_context->callback_(stream, status);
});
}
};
} // namespace platform
} // namespace paddle
......@@ -683,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t,
size_t>())
.def("_bcast_params", &ParallelExecutor::BCastParamsToDevices)
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
......
cc_library(stringpiece SRCS piece.cc)
cc_library(pretty_log SRCS pretty_log.cc)
cc_test(test_pretty_log SRCS pretty_log.cc)
cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
cc_test(to_string_test SRCS to_string_test.cc)
......@@ -33,6 +33,7 @@ function print_usage() {
${BLUE}single_test${NONE}: run a single unit test
${BLUE}bind_test${NONE}: parallel tests bind to different GPU
${BLUE}doc${NONE}: generate paddle documents
${BLUE}gen_doc_lib${NONE}: generate paddle documents library
${BLUE}html${NONE}: convert C++ source code into HTML
${BLUE}dockerfile${NONE}: generate paddle release dockerfile
${BLUE}capi${NONE}: generate paddle CAPI package
......@@ -67,6 +68,23 @@ function cmake_gen() {
# Support build for all python versions, currently
# including cp27-cp27m and cp27-cp27mu.
PYTHON_FLAGS=""
SYSTEM=`uname -s`
if [ "$SYSTEM" == "Darwin" ]; then
if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
echo "using python abi: $1"
if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
export PATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/:${PATH}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib"
else
exit 1
fi
# TODO: qiyang add python3 part here
fi
else
if [ "$1" != "" ]; then
echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then
......@@ -89,6 +107,7 @@ function cmake_gen() {
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
fi
fi
fi
cat <<EOF
========================================
......@@ -116,6 +135,8 @@ function cmake_gen() {
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE=${WITH_INFERENCE:-ON}
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DPY_VERSION=${PY_VERSION:-2.7}
========================================
......@@ -146,6 +167,8 @@ EOF
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DPY_VERSION=${PY_VERSION:-2.7}
}
......@@ -200,6 +223,19 @@ EOF
make install -j `nproc`
}
# Clean, build and install from ${PADDLE_ROOT}/build (used by the macOS check).
function build_mac() {
    # Quoted so a PADDLE_ROOT containing spaces is not word-split.
    mkdir -p "${PADDLE_ROOT}/build"
    # Abort if the build directory cannot be entered; otherwise `make clean`
    # and the sudo builds below would run in whatever directory we are in.
    cd "${PADDLE_ROOT}/build" || exit 1
    cat <<EOF
============================================
Building in /paddle/build ...
============================================
EOF
    make clean
    sudo make -j 8
    sudo make install -j 8
}
function build_android() {
if [ $ANDROID_ABI == "arm64-v8a" ]; then
ANDROID_ARCH=arm64
......@@ -323,6 +359,27 @@ EOF
fi
}
# Run the unit tests from ${PADDLE_ROOT}/build, then install the build and the
# generated wheel. Does nothing beyond entering the directory when
# WITH_TESTING is set to something other than ON.
function run_mac_test() {
    # Quoted so a PADDLE_ROOT containing spaces is not word-split.
    mkdir -p "${PADDLE_ROOT}/build"
    # Abort rather than run ctest / make install from the wrong directory.
    cd "${PADDLE_ROOT}/build" || exit 1
    # Quoted so an empty or multi-word value cannot break the `[` test.
    if [ "${WITH_TESTING:-ON}" == "ON" ] ; then
    cat <<EOF
========================================
Running unit tests ...
========================================
EOF
        # TODO: jiabin need to refine this part when these tests fixed on mac
        ctest --output-on-failure -j8
        # `make install` is exercised as part of the unit-test run as well.
        make install -j 8
        pip install /usr/local/opt/paddle/share/wheels/*.whl
        if [[ "${WITH_FLUID_ONLY:-OFF}" == "OFF" ]] ; then
            paddle version
        fi
    fi
}
function assert_api_not_changed() {
mkdir -p ${PADDLE_ROOT}/build/.check_api_workspace
cd ${PADDLE_ROOT}/build/.check_api_workspace
......@@ -431,24 +488,42 @@ EOF
linkchecker doc/v2/cn/html/index.html
linkchecker doc/v2/api/en/html/index.html
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
}
# Deploy to the content server if it's a "develop" or "release/version" branch
# The "develop_doc" branch is reserved to test full deploy process without impacting the real content.
if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then
PPO_SCRIPT_BRANCH=develop
elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then
PPO_SCRIPT_BRANCH=master
else
# Early exit, this branch doesn't require documentation build
return 0;
fi
# Fetch the paddlepaddle.org deploy_docs.sh from the appropriate branch
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh
export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH}
cd -
# Configure a CPU-only, fluid-only Release build with docs enabled and build
# the subset of targets selected by $1:
#   full   - gen_proto_py, framework_py_proto, copy_paddle_pybind, paddle_python
#   pybind - copy_paddle_pybind
#   proto  - framework_py_proto
# Any other value (including none) exits with status 0 after configuring.
function gen_doc_lib() {
    # Quoted so a PADDLE_ROOT containing spaces is not word-split.
    mkdir -p "${PADDLE_ROOT}/build"
    # Abort if the build directory cannot be entered; `cmake ..` below must
    # not run relative to some other directory.
    cd "${PADDLE_ROOT}/build" || exit 1
    cat <<EOF
========================================
Building documentation library ...
In /paddle/build
========================================
EOF
    cmake .. \
        -DCMAKE_BUILD_TYPE=Release \
        -DWITH_DOC=ON \
        -DWITH_GPU=OFF \
        -DWITH_MKL=OFF \
        -DWITH_FLUID_ONLY=ON

    local LIB_TYPE=$1
    case "$LIB_TYPE" in
        full)
            # Build full Paddle Python module. Will timeout without caching 'copy_paddle_pybind' first
            make -j "$(nproc)" gen_proto_py framework_py_proto copy_paddle_pybind paddle_python
            ;;
        pybind)
            # Build paddle pybind library. Takes 49 minutes to build. Might timeout
            make -j "$(nproc)" copy_paddle_pybind
            ;;
        proto)
            # Even smaller library.
            make -j "$(nproc)" framework_py_proto
            ;;
        *)
            exit 0
            ;;
    esac
}
function gen_html() {
......@@ -608,6 +683,9 @@ function main() {
doc)
gen_docs
;;
gen_doc_lib)
gen_doc_lib $2
;;
html)
gen_html
;;
......@@ -637,6 +715,17 @@ function main() {
test_fluid_inference_lib
assert_api_spec_approvals
;;
maccheck)
cmake_gen ${PYTHON_ABI:-""}
build_mac
run_mac_test
;;
cicheck_py35)
cmake_gen ${PYTHON_ABI:-""}
build
run_test
assert_api_not_changed
;;
*)
print_usage
exit 0
......
......@@ -67,7 +67,7 @@ def get_word_dict():
for field in movie_reviews.fileids(category):
for words in movie_reviews.words(field):
word_freq_dict[words] += 1
words_sort_list = six.iteritems(word_freq_dict)
words_sort_list = list(six.iteritems(word_freq_dict))
words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
for index, word in enumerate(words_sort_list):
words_freq_sorted.append((word[0], index))
......
......@@ -122,7 +122,7 @@ def __bootstrap__():
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
"dist_threadpool_size", 'cpu_deterministic'
"dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb'
]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
......
......@@ -55,15 +55,19 @@ for _OP in set(__auto__):
globals()[_OP] = generate_layer_fn(_OP)
def rpn_target_assign(loc,
scores,
def rpn_target_assign(bbox_pred,
cls_logits,
anchor_box,
anchor_var,
gt_box,
gt_boxes,
is_crowd,
im_info,
rpn_batch_size_per_im=256,
fg_fraction=0.25,
rpn_straddle_thresh=0.0,
rpn_fg_fraction=0.5,
rpn_positive_overlap=0.7,
rpn_negative_overlap=0.3):
rpn_negative_overlap=0.3,
use_random=True):
"""
** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
......@@ -83,14 +87,13 @@ def rpn_target_assign(loc,
the positive anchors.
Args:
loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the
predicted locations of M bounding bboxes. N is the batch size,
and each bounding box has four coordinate values and the layout
is [xmin, ymin, xmax, ymax].
scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
predicted confidence predictions. N is the batch size, C is the
class number, M is number of bounding boxes. For each category
there are total M scores which corresponding M bounding boxes.
cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the
predicted confidence predictions. N is the batch size, 1 is the
frontground and background sigmoid, M is number of bounding boxes.
anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
each box is represented as [xmin, ymin, xmax, ymax],
[xmin, ymin] is the left top coordinate of the anchor box,
......@@ -99,11 +102,16 @@ def rpn_target_assign(loc,
coordinate of the anchor box.
anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded
variances of anchors.
gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D
gt_boxes (Variable): The ground-truth boudding boxes (bboxes) are a 2D
LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
bboxes of mini-batch input.
is_crowd (Variable): A 1-D LoDTensor which indicates groud-truth is crowd.
im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size,
3 is the height, width and scale.
rpn_batch_size_per_im(int): Total number of RPN examples per image.
fg_fraction(float): Target fraction of RoI minibatch that is labeled
rpn_straddle_thresh(float): Remove RPN anchors that go outside the image
by straddle_thresh pixels.
rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled
foreground (i.e. class > 0), 0-th class is background.
rpn_positive_overlap(float): Minimum overlap required between an anchor
and ground-truth box for the (anchor, gt box) pair to be a positive
......@@ -129,45 +137,48 @@ def rpn_target_assign(loc,
Examples:
.. code-block:: python
loc = layers.data(name='location', shape=[2, 80],
bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
append_batch_size=False, dtype='float32')
scores = layers.data(name='scores', shape=[2, 40],
cls_logits = layers.data(name='cls_logits', shape=[100, 1],
append_batch_size=False, dtype='float32')
anchor_box = layers.data(name='anchor_box', shape=[20, 4],
append_batch_size=False, dtype='float32')
gt_box = layers.data(name='gt_box', shape=[10, 4],
gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
append_batch_size=False, dtype='float32')
loc_pred, score_pred, loc_target, score_target =
fluid.layers.detection_output(loc=location,
scores=scores,
fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
cls_logits=cls_logits,
anchor_box=anchor_box,
gt_box=gt_box)
gt_boxes=gt_boxes)
"""
helper = LayerHelper('rpn_target_assign', **locals())
# Compute overlaps between the prior boxes and the gt boxes overlaps
iou = iou_similarity(x=gt_box, y=anchor_box)
# Assign target label to anchors
loc_index = helper.create_tmp_variable(dtype='int32')
score_index = helper.create_tmp_variable(dtype='int32')
target_label = helper.create_tmp_variable(dtype='int64')
target_label = helper.create_tmp_variable(dtype='int32')
target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
helper.append_op(
type="rpn_target_assign",
inputs={'Anchor': anchor_box,
'GtBox': gt_box,
'DistMat': iou},
inputs={
'Anchor': anchor_box,
'GtBoxes': gt_boxes,
'IsCrowd': is_crowd,
'ImInfo': im_info
},
outputs={
'LocationIndex': loc_index,
'ScoreIndex': score_index,
'TargetLabel': target_label,
'TargetBBox': target_bbox,
'TargetBBox': target_bbox
},
attrs={
'rpn_batch_size_per_im': rpn_batch_size_per_im,
'rpn_straddle_thresh': rpn_straddle_thresh,
'rpn_positive_overlap': rpn_positive_overlap,
'rpn_negative_overlap': rpn_negative_overlap,
'fg_fraction': fg_fraction
'rpn_fg_fraction': rpn_fg_fraction,
'use_random': use_random
})
loc_index.stop_gradient = True
......@@ -175,12 +186,12 @@ def rpn_target_assign(loc,
target_label.stop_gradient = True
target_bbox.stop_gradient = True
scores = nn.reshape(x=scores, shape=(-1, 1))
loc = nn.reshape(x=loc, shape=(-1, 4))
predicted_scores = nn.gather(scores, score_index)
predicted_location = nn.gather(loc, loc_index)
cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1))
bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4))
predicted_cls_logits = nn.gather(cls_logits, score_index)
predicted_bbox_pred = nn.gather(bbox_pred, loc_index)
return predicted_scores, predicted_location, target_label, target_bbox
return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox
def detection_output(loc,
......@@ -1258,15 +1269,17 @@ def anchor_generator(input,
def generate_proposal_labels(rpn_rois,
gt_classes,
is_crowd,
gt_boxes,
im_scales,
im_info,
batch_size_per_im=256,
fg_fraction=0.25,
fg_thresh=0.25,
bg_thresh_hi=0.5,
bg_thresh_lo=0.0,
bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
class_nums=None):
class_nums=None,
use_random=True):
"""
** Generate proposal labels Faster-RCNN **
TODO(buxingyuan): Add Document
......@@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois,
inputs={
'RpnRois': rpn_rois,
'GtClasses': gt_classes,
'IsCrowd': is_crowd,
'GtBoxes': gt_boxes,
'ImScales': im_scales
'ImInfo': im_info
},
outputs={
'Rois': rois,
......@@ -1302,7 +1316,8 @@ def generate_proposal_labels(rpn_rois,
'bg_thresh_hi': bg_thresh_hi,
'bg_thresh_lo': bg_thresh_lo,
'bbox_reg_weights': bbox_reg_weights,
'class_nums': class_nums
'class_nums': class_nums,
'use_random': use_random
})
rois.stop_gradient = True
......
......@@ -142,11 +142,6 @@ class ParallelExecutor(object):
main = main if main else framework.default_main_program()
if scope == None:
scope = executor.global_scope()
# FIXME(Yancey1989): it's a temporary approach to determinate the distribute
# train program, call self.bcast_param() at the end of each mini-batch.
self.is_dist = True if "recv" in [
op.type for op in main.global_block().ops
] else False
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
......@@ -286,21 +281,11 @@ class ParallelExecutor(object):
self.executor.run(fetch_list, fetch_var_name)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if self.is_dist:
self._bcast_params()
if return_numpy:
return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))]
def _bcast_params(self):
"""
Broadcast the parameters to other devices. It is used during
distributed training.
"""
self.executor._bcast_params(set(self.persistable_vars))
@property
def device_count(self):
return len(self._act_places)
......@@ -148,6 +148,8 @@ class TestAnchorGenerator(unittest.TestCase):
class TestGenerateProposalLabels(unittest.TestCase):
def test_generate_proposal_labels(self):
program = Program()
with program_guard(program):
rpn_rois = layers.data(
name='rpn_rois',
shape=[4, 4],
......@@ -160,15 +162,21 @@ class TestGenerateProposalLabels(unittest.TestCase):
dtype='int32',
lod_level=1,
append_batch_size=False)
is_crowd = layers.data(
name='is_crowd',
shape=[6],
dtype='int32',
lod_level=1,
append_batch_size=False)
gt_boxes = layers.data(
name='gt_boxes',
shape=[6, 4],
dtype='float32',
lod_level=1,
append_batch_size=False)
im_scales = layers.data(
name='im_scales',
shape=[1],
im_info = layers.data(
name='im_info',
shape=[1, 3],
dtype='float32',
lod_level=1,
append_batch_size=False)
......@@ -176,8 +184,9 @@ class TestGenerateProposalLabels(unittest.TestCase):
rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels(
rpn_rois=rpn_rois,
gt_classes=gt_classes,
is_crowd=is_crowd,
gt_boxes=gt_boxes,
im_scales=im_scales,
im_info=im_info,
batch_size_per_im=2,
fg_fraction=0.5,
fg_thresh=0.5,
......@@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase):
def test_rpn_target_assign(self):
program = Program()
with program_guard(program):
loc_shape = [10, 50, 4]
score_shape = [10, 50, 2]
bbox_pred_shape = [10, 50, 4]
cls_logits_shape = [10, 50, 2]
anchor_shape = [50, 4]
loc = layers.data(
name='loc',
shape=loc_shape,
bbox_pred = layers.data(
name='bbox_pred',
shape=bbox_pred_shape,
append_batch_size=False,
dtype='float32')
scores = layers.data(
name='scores',
shape=score_shape,
cls_logits = layers.data(
name='cls_logits',
shape=cls_logits_shape,
append_batch_size=False,
dtype='float32')
anchor_box = layers.data(
......@@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase):
shape=anchor_shape,
append_batch_size=False,
dtype='float32')
gt_box = layers.data(
name='gt_box', shape=[4], lod_level=1, dtype='float32')
gt_boxes = layers.data(
name='gt_boxes', shape=[4], lod_level=1, dtype='float32')
is_crowd = layers.data(
name='is_crowd',
shape=[10],
dtype='int32',
lod_level=1,
append_batch_size=False)
im_info = layers.data(
name='im_info',
shape=[1, 3],
dtype='float32',
lod_level=1,
append_batch_size=False)
pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
loc=loc,
scores=scores,
bbox_pred=bbox_pred,
cls_logits=cls_logits,
anchor_box=anchor_box,
anchor_var=anchor_var,
gt_box=gt_box,
gt_boxes=gt_boxes,
is_crowd=is_crowd,
im_info=im_info,
rpn_batch_size_per_im=256,
fg_fraction=0.25,
rpn_straddle_thresh=0.0,
rpn_fg_fraction=0.5,
rpn_positive_overlap=0.7,
rpn_negative_overlap=0.3)
......
......@@ -29,6 +29,19 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
if(APPLE)
    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext")
    # Drop every macOS-excluded test from TEST_OPS in a single call.
    #  - test_fusion_seqexpand_concat_fc_op: the op is not supported on mac.
    #  - all other entries: TODO add the unit test back once it is fixed.
    list(REMOVE_ITEM TEST_OPS
        test_fusion_seqexpand_concat_fc_op
        test_detection_map_op
        test_desc_clone
        test_debugger
        test_program_code
        test_dist_transformer
        test_dist_se_resnext)
endif()
function(py_test_modules TARGET_NAME)
if(WITH_TESTING)
set(options SERIAL)
......@@ -46,6 +59,7 @@ function(py_test_modules TARGET_NAME)
endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dist_train)
list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
......@@ -61,11 +75,12 @@ if(WITH_DISTRIBUTE)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
......@@ -92,7 +92,7 @@ class TrainTaskConfig(object):
src_vocab_fpath = data_path + "vocab.bpe.32000"
trg_vocab_fpath = data_path + "vocab.bpe.32000"
train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de"
val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de"
val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut"
pool_size = 2000
sort_type = None
local = True
......@@ -624,6 +624,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
init = True
# Validate and save the model for inference.
if batch_id == 0 or batch_id == 4:
if TrainTaskConfig.val_file_pattern is not None:
val_avg_cost, val_ppl = test()
print("[%f]" % val_avg_cost)
......@@ -1701,8 +1702,9 @@ class DistTransformer2x2(TestDistRunnerBase):
exe.run(startup_prog)
exe.run(pserver_prog)
def run_trainer(self, place, args):
def run_trainer(self, use_cuda, args):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
TrainTaskConfig.use_gpu = use_cuda
sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
args.is_dist, not args.sync_mode)
......
......@@ -109,15 +109,20 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
return t
from paddle.fluid.transpiler.details import op_to_code
def operator_equal(a, b):
if op_to_code(a) != op_to_code(b):
raise ValueError("In operator_equal not equal\n")
for k, v in six.iteritems(a.__dict__):
if isinstance(v, fluid.framework.Program) or \
isinstance(v, fluid.framework.Block):
continue
elif isinstance(v, core.OpDesc):
if v.serialize_to_string() != b.__dict__[k].serialize_to_string():
raise ValueError("In operator_equal not equal:{0}\n".format(k))
continue
elif isinstance(v, collections.OrderedDict):
v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0])
......
......@@ -61,9 +61,10 @@ class TestDistRunnerBase(object):
exe.run(startup_prog)
exe.run(pserver_prog)
def run_trainer(self, place, args):
def run_trainer(self, use_cuda, args):
import paddle
import paddle.fluid as fluid
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
self.get_model(batch_size=2)
if args.mem_opt:
......@@ -91,7 +92,7 @@ class TestDistRunnerBase(object):
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
exe = fluid.ParallelExecutor(
True,
use_cuda,
loss_name=avg_cost.name,
exec_strategy=strategy,
build_strategy=build_stra)
......@@ -142,9 +143,8 @@ def runtime_main(test_class):
if args.role == "pserver" and args.is_dist:
model.run_pserver(args)
else:
p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
model.run_trainer(p, args)
use_cuda = True if core.is_compiled_with_cuda() else False
model.run_trainer(use_cuda, args)
import paddle.compat as cpt
......@@ -225,11 +225,12 @@ class TestDistBase(unittest.TestCase):
def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH"),
"PYTHONPATH": os.getenv("PYTHONPATH"),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1"
"FLAGS_cudnn_deterministic": "1",
"CPU_NUM": "1"
}
if check_error_log:
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
import os
import unittest
import paddle
from test_dist_base import TestDistBase
......@@ -44,6 +45,14 @@ def download_files():
test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de'
test_md5 = '9dd74a266dbdb25314183899f269b4a2'
paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5)
# cut test data for faster CI
orig_path = os.path.join(paddle.dataset.common.DATA_HOME,
"test_dist_transformer",
"newstest2013.tok.bpe.32000.en-de")
head_path = os.path.join(paddle.dataset.common.DATA_HOME,
"test_dist_transformer",
"newstest2013.tok.bpe.32000.en-de.cut")
os.system("head -n10 %s > %s" % (orig_path, head_path))
class TestDistTransformer2x2Sync(TestDistBase):
......
......@@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase):
t = self._transpiler_instance(config)
trainer_main = t.get_trainer_program()
trainer_main = t.get_trainer_program(wait_port=False)
trainer_startup = fluid.default_startup_program()
assert (src.num_blocks == 1)
......
......@@ -20,10 +20,10 @@ import paddle.fluid as fluid
from op_test import OpTest
def generate_proposal_labels_in_python(
rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im,
fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
class_nums):
def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes,
im_info, batch_size_per_im, fg_fraction,
fg_thresh, bg_thresh_hi, bg_thresh_lo,
bbox_reg_weights, class_nums):
rois = []
labels_int32 = []
bbox_targets = []
......@@ -31,13 +31,13 @@ def generate_proposal_labels_in_python(
bbox_outside_weights = []
lod = []
assert len(rpn_rois) == len(
im_scales), 'batch size of rpn_rois and ground_truth is not matched'
im_info), 'batch size of rpn_rois and ground_truth is not matched'
for im_i in range(len(im_scales)):
for im_i in range(len(im_info)):
frcn_blobs = _sample_rois(
rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i],
batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
bg_thresh_lo, bbox_reg_weights, class_nums)
rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i],
im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh,
bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums)
lod.append(frcn_blobs['rois'].shape[0])
......@@ -50,13 +50,14 @@ def generate_proposal_labels_in_python(
return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod
def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo,
bbox_reg_weights, class_nums):
def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
bg_thresh_lo, bbox_reg_weights, class_nums):
rois_per_image = int(batch_size_per_im)
fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))
# Roidb
im_scale = im_info[2]
inv_im_scale = 1. / im_scale
rpn_rois = rpn_rois * inv_im_scale
......@@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[
overlapped_boxes_ind]
crowd_ind = np.where(is_crowd)[0]
gt_overlaps[crowd_ind] = -1
max_overlaps = gt_overlaps.max(axis=1)
max_classes = gt_overlaps.argmax(axis=1)
......@@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
fg_inds = np.where(max_overlaps >= fg_thresh)[0]
fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
# Sample foreground if there are too many
if fg_inds.shape[0] > fg_rois_per_this_image:
fg_inds = np.random.choice(
fg_inds, size=fg_rois_per_this_image, replace=False)
# if fg_inds.shape[0] > fg_rois_per_this_image:
# fg_inds = np.random.choice(
# fg_inds, size=fg_rois_per_this_image, replace=False)
fg_inds = fg_inds[:fg_rois_per_this_image]
# Background
bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
......@@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
bg_inds.shape[0])
# Sample background if there are too many
if bg_inds.shape[0] > bg_rois_per_this_image:
bg_inds = np.random.choice(
bg_inds, size=bg_rois_per_this_image, replace=False)
# if bg_inds.shape[0] > bg_rois_per_this_image:
# bg_inds = np.random.choice(
# bg_inds, size=bg_rois_per_this_image, replace=False)
bg_inds = bg_inds[:bg_rois_per_this_image]
keep_inds = np.append(fg_inds, bg_inds)
sampled_labels = max_classes[keep_inds]
......@@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest):
self.inputs = {
'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod),
'GtClasses': (self.gt_classes[0], self.gts_lod),
'IsCrowd': (self.is_crowd[0], self.gts_lod),
'GtBoxes': (self.gt_boxes[0], self.gts_lod),
'ImScales': self.im_scales[0]
'ImInfo': self.im_info
}
self.attrs = {
'batch_size_per_im': self.batch_size_per_im,
......@@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest):
'bg_thresh_hi': self.bg_thresh_hi,
'bg_thresh_lo': self.bg_thresh_lo,
'bbox_reg_weights': self.bbox_reg_weights,
'class_nums': self.class_nums
'class_nums': self.class_nums,
'use_random': False
}
self.outputs = {
'Rois': (self.rois[0], [self.lod]),
'LabelsInt32': (self.labels_int32[0], [self.lod]),
'BboxTargets': (self.bbox_targets[0], [self.lod]),
'BboxInsideWeights': (self.bbox_inside_weights[0], [self.lod]),
'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]),
'Rois': (self.rois, [self.lod]),
'LabelsInt32': (self.labels_int32, [self.lod]),
'BboxTargets': (self.bbox_targets, [self.lod]),
'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]),
'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]),
}
def test_check_output(self):
......@@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest):
self.set_data()
def init_test_params(self):
self.batch_size_per_im = 10
self.fg_fraction = 1.0
self.batch_size_per_im = 512
self.fg_fraction = 0.25
self.fg_thresh = 0.5
self.bg_thresh_hi = 0.5
self.bg_thresh_lo = 0.0
......@@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest):
def init_test_input(self):
np.random.seed(0)
image_nums = 1
gt_nums = 6 # Keep same with batch_size_per_im for unittest
proposal_nums = self.batch_size_per_im - gt_nums
images_shape = []
self.im_scales = []
for i in range(image_nums):
images_shape.append(np.random.randint(200, size=2))
self.im_scales.append(np.ones((1)).astype(np.float32))
proposal_nums = 2000 #self.batch_size_per_im - gt_nums
images_shape = [[64, 64]]
self.im_info = np.ones((len(images_shape), 3)).astype(np.float32)
for i in range(len(images_shape)):
self.im_info[i, 0] = images_shape[i][0]
self.im_info[i, 1] = images_shape[i][1]
self.im_info[i, 2] = 0.8 #scale
self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape,
proposal_nums)
......@@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest):
images_shape, self.class_nums, gt_nums)
self.gt_classes = [gt['gt_classes'] for gt in ground_truth]
self.gt_boxes = [gt['boxes'] for gt in ground_truth]
self.is_crowd = [gt['is_crowd'] for gt in ground_truth]
def init_test_output(self):
self.rois, self.labels_int32, self.bbox_targets, \
self.bbox_inside_weights, self.bbox_outside_weights, \
self.lod = generate_proposal_labels_in_python(
self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales,
self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info,
self.batch_size_per_im, self.fg_fraction,
self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo,
self.bbox_reg_weights, self.class_nums
)
self.rois = np.vstack(self.rois)
self.labels_int32 = np.hstack(self.labels_int32)
self.labels_int32 = self.labels_int32[:, np.newaxis]
self.bbox_targets = np.vstack(self.bbox_targets)
self.bbox_inside_weights = np.vstack(self.bbox_inside_weights)
self.bbox_outside_weights = np.vstack(self.bbox_outside_weights)
def _generate_proposals(images_shape, proposal_nums):
......@@ -280,7 +295,7 @@ def _generate_proposals(images_shape, proposal_nums):
for i, image_shape in enumerate(images_shape):
proposals = _generate_boxes(image_shape, proposal_nums)
rpn_rois.append(proposals)
num_proposals += len(proposals)
num_proposals = len(proposals)
rpn_rois_lod.append(num_proposals)
return rpn_rois, [rpn_rois_lod]
......@@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums):
gt_classes = np.random.randint(
low=1, high=class_nums, size=gt_nums).astype(np.int32)
gt_boxes = _generate_boxes(image_shape, gt_nums)
ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes))
is_crowd = np.zeros((gt_nums), dtype=np.int32)
is_crowd[0] = 1
ground_truth.append(
dict(
gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd))
num_gts += len(gt_classes)
gts_lod.append(num_gts)
return ground_truth, [gts_lod]
......
......@@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances):
#anchor_loc: width, height, center_x, center_y
anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0]
anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1]
anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2
anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2
anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1
anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1
anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]
#predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height
pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
......@@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances):
i, 0] + anchor_loc[i, 2]
pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[
i, 1] + anchor_loc[i, 3]
pred_bbox[i, 2] = math.exp(variances[i, 2] *
bbox_deltas[i, 2]) * anchor_loc[i, 0]
pred_bbox[i, 3] = math.exp(variances[i, 3] *
bbox_deltas[i, 3]) * anchor_loc[i, 1]
pred_bbox[i, 2] = math.exp(
min(variances[i, 2] * bbox_deltas[i, 2], math.log(
1000 / 16.0))) * anchor_loc[i, 0]
pred_bbox[i, 3] = math.exp(
min(variances[i, 3] * bbox_deltas[i, 3], math.log(
1000 / 16.0))) * anchor_loc[i, 1]
else:
for i in range(bbox_deltas.shape[0]):
pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
i, 2]
pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
i, 3]
pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0]
pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1]
pred_bbox[i, 2] = math.exp(
min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i,
0]
pred_bbox[i, 3] = math.exp(
min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i,
1]
proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2
proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2
proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1
proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1
return proposals
......@@ -170,13 +176,16 @@ def filter_boxes(boxes, min_size, im_info):
"""Only keep boxes with both sides >= min_size and center within the image.
"""
# Scale min_size to match image scale
min_size *= im_info[2]
im_scale = im_info[2]
min_size = max(min_size, 1.0)
ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1
ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
x_ctr = boxes[:, 0] + ws / 2.
y_ctr = boxes[:, 1] + hs / 2.
keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) &
(y_ctr < im_info[0]))[0]
keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) &
(x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0]
return keep
......@@ -204,7 +213,7 @@ def iou(box_a, box_b):
xb = min(xmax_a, xmax_b)
yb = min(ymax_a, ymax_b)
inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)
inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0)
iou_ratio = inter_area / (area_a + area_b - inter_area)
......
......@@ -13,7 +13,7 @@
# limitations under the License.
from __future__ import print_function
import os
import paddle.fluid as fluid
import paddle
import numpy as np
......@@ -41,6 +41,8 @@ class TestReaderReset(unittest.TestCase):
self.data_file_name, reader, feeder)
def setUp(self):
# set parallel threads to fit 20 batches in line 49
os.environ['CPU_NUM'] = str(20)
self.use_cuda = fluid.core.is_compiled_with_cuda()
self.data_file_name = './reader_reset_test.recordio'
self.ins_shape = [3]
......
......@@ -19,48 +19,58 @@ import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from test_anchor_generator_op import anchor_generator_in_python
from test_generate_proposal_labels import _generate_groundtruth
from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
iou = np.transpose(gt_anchor_iou)
anchor_to_gt_max = iou.max(axis=1)
anchor_to_gt_argmax = iou.argmax(axis=1)
gt_to_anchor_argmax = iou.argmax(axis=0)
gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1
tgt_lbl[anchors_with_max_overlap] = 1
tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1
num_fg = int(fg_fraction * rpn_batch_size_per_im)
fg_inds = np.where(tgt_lbl == 1)[0]
if len(fg_inds) > num_fg:
from test_generate_proposal_labels_op import _generate_groundtruth
from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta
def rpn_target_assign(anchor_by_gt_overlap,
rpn_batch_size_per_im,
rpn_positive_overlap,
rpn_negative_overlap,
rpn_fg_fraction,
use_random=True):
anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
anchor_by_gt_overlap.shape[1])]
anchors_with_max_overlap = np.where(
anchor_by_gt_overlap == gt_to_anchor_max)[0]
labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
labels[anchors_with_max_overlap] = 1
labels[anchor_to_gt_max >= rpn_positive_overlap] = 1
num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
fg_inds = np.where(labels == 1)[0]
if len(fg_inds) > num_fg and use_random:
disable_inds = np.random.choice(
fg_inds, size=(len(fg_inds) - num_fg), replace=False)
tgt_lbl[disable_inds] = -1
fg_inds = np.where(tgt_lbl == 1)[0]
else:
disable_inds = fg_inds[num_fg:]
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
tgt_lbl[bg_inds] = 0
if len(bg_inds) > num_bg:
if len(bg_inds) > num_bg and use_random:
enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
tgt_lbl[enable_inds] = 0
bg_inds = np.where(tgt_lbl == 0)[0]
tgt_lbl[bg_inds] = 0
else:
enable_inds = bg_inds[:num_bg]
labels[enable_inds] = 0
fg_inds = np.where(labels == 1)[0]
bg_inds = np.where(labels == 0)[0]
loc_index = fg_inds
score_index = np.hstack((fg_inds, bg_inds))
tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
labels = labels[score_index]
assert not np.any(labels == -1), "Wrong labels with -1"
gt_inds = anchor_to_gt_argmax[fg_inds]
return loc_index, score_index, tgt_lbl, gt_inds
return loc_index, score_index, labels, gt_inds
def get_anchor(n, c, h, w):
......@@ -75,85 +85,129 @@ def get_anchor(n, c, h, w):
return anchors
def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
loc_indexes = []
score_indexes = []
tmp_tgt_labels = []
tgt_bboxes = []
anchor_num = anchor.shape[0]
def rpn_target_assign_in_python(all_anchors,
gt_boxes,
is_crowd,
im_info,
lod,
rpn_straddle_thresh,
rpn_batch_size_per_im,
rpn_positive_overlap,
rpn_negative_overlap,
rpn_fg_fraction,
use_random=True):
anchor_num = all_anchors.shape[0]
batch_size = len(lod) - 1
for i in range(batch_size):
im_height = im_info[i][0]
im_width = im_info[i][1]
im_scale = im_info[i][2]
if rpn_straddle_thresh >= 0:
# Only keep anchors inside the image by a margin of straddle_thresh
inds_inside = np.where(
(all_anchors[:, 0] >= -rpn_straddle_thresh) &
(all_anchors[:, 1] >= -rpn_straddle_thresh) & (
all_anchors[:, 2] < im_width + rpn_straddle_thresh) & (
all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
# keep only inside anchors
inside_anchors = all_anchors[inds_inside, :]
else:
inds_inside = np.arange(all_anchors.shape[0])
inside_anchors = all_anchors
b, e = lod[i], lod[i + 1]
iou_slice = iou[b:e, :]
bboxes_slice = gt_boxes[b:e, :]
gt_boxes_slice = gt_boxes[b:e, :] * im_scale
is_crowd_slice = is_crowd[b:e]
not_crowd_inds = np.where(is_crowd_slice == 0)[0]
gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
rpn_negative_overlap, fg_fraction)
loc_inds, score_inds, labels, gt_inds = rpn_target_assign(
iou, rpn_batch_size_per_im, rpn_positive_overlap,
rpn_negative_overlap, rpn_fg_fraction, use_random)
# unmap to all anchor
loc_inds = inds_inside[loc_inds]
score_inds = inds_inside[score_inds]
fg_bboxes = bboxes_slice[gt_inds]
fg_anchors = anchor[loc_idx]
box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
sampled_gt = gt_boxes_slice[gt_inds]
sampled_anchor = all_anchors[loc_inds]
box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])
if i == 0:
loc_indexes = loc_idx
score_indexes = score_idx
tmp_tgt_labels = tgt_lbl
loc_indexes = loc_inds
score_indexes = score_inds
tgt_labels = labels
tgt_bboxes = box_deltas
else:
loc_indexes = np.concatenate(
[loc_indexes, loc_idx + i * anchor_num])
[loc_indexes, loc_inds + i * anchor_num])
score_indexes = np.concatenate(
[score_indexes, score_idx + i * anchor_num])
tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
[score_indexes, score_inds + i * anchor_num])
tgt_labels = np.concatenate([tgt_labels, labels])
tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
tgt_labels = tmp_tgt_labels[score_indexes]
return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
class TestRpnTargetAssignOp(OpTest):
def setUp(self):
n, c, h, w = 2, 4, 14, 14
anchor = get_anchor(n, c, h, w)
all_anchors = get_anchor(n, c, h, w)
gt_num = 10
anchor = anchor.reshape(-1, 4)
anchor_num = anchor.shape[0]
im_shapes = [[64, 64], [64, 64]]
gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
bbox = np.vstack([v['boxes'] for v in gt_box])
iou = _bbox_overlaps(bbox, anchor)
anchor = anchor.astype('float32')
bbox = bbox.astype('float32')
iou = iou.astype('float32')
loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
all_anchors = all_anchors.reshape(-1, 4)
anchor_num = all_anchors.shape[0]
images_shape = [[64, 64], [64, 64]]
#images_shape = [[64, 64]]
groundtruth, lod = _generate_groundtruth(images_shape, 3, 4)
lod = [0, 4, 8]
#lod = [0, 4]
im_info = np.ones((len(images_shape), 3)).astype(np.float32)
for i in range(len(images_shape)):
im_info[i, 0] = images_shape[i][0]
im_info[i, 1] = images_shape[i][1]
im_info[i, 2] = 0.8 #scale
gt_boxes = np.vstack([v['boxes'] for v in groundtruth])
is_crowd = np.hstack([v['is_crowd'] for v in groundtruth])
all_anchors = all_anchors.astype('float32')
gt_boxes = gt_boxes.astype('float32')
rpn_straddle_thresh = 0.0
rpn_batch_size_per_im = 256
rpn_positive_overlap = 0.7
rpn_negative_overlap = 0.3
rpn_fg_fraction = 0.5
use_random = False
loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python(
all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh,
rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap,
rpn_fg_fraction, use_random)
labels = labels[:, np.newaxis]
self.op_type = "rpn_target_assign"
self.inputs = {
'Anchor': anchor,
'GtBox': (bbox, [[4, 4]]),
'DistMat': (iou, [[4, 4]]),
'Anchor': all_anchors,
'GtBoxes': (gt_boxes, [[4, 4]]),
'IsCrowd': (is_crowd, [[4, 4]]),
'ImInfo': (im_info, [[1, 1]])
}
self.attrs = {
'rpn_batch_size_per_im': 25600,
'rpn_positive_overlap': 0.95,
'rpn_negative_overlap': 0.03,
'fg_fraction': 0.25,
'fix_seed': True
'rpn_batch_size_per_im': rpn_batch_size_per_im,
'rpn_straddle_thresh': rpn_straddle_thresh,
'rpn_positive_overlap': rpn_positive_overlap,
'rpn_negative_overlap': rpn_negative_overlap,
'rpn_fg_fraction': rpn_fg_fraction,
'use_random': use_random
}
self.outputs = {
'LocationIndex': loc_index.astype('int32'),
'ScoreIndex': score_index.astype('int32'),
'TargetBBox': tgt_bbox.astype('float32'),
'TargetLabel': tgt_lbl.astype('int64'),
'TargetLabel': labels.astype('int32')
}
def test_check_output(self):
......
......@@ -41,6 +41,9 @@ class TestSliceOp(OpTest):
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['Input'], 'Out', max_relative_error=0.006)
class TestCase1(TestSliceOp):
def config(self):
......
......@@ -30,8 +30,10 @@ class TestWhileOp(unittest.TestCase):
"d1", shape=[10], append_batch_size=False, dtype='float32')
d2 = layers.data(
"d2", shape=[10], append_batch_size=False, dtype='float32')
i = layers.zeros(shape=[1], dtype='int64')
i.stop_gradient = True
init = layers.zeros(shape=[10], dtype='float32')
mem_array = layers.array_write(x=init, i=i)
data_array = layers.array_write(x=d0, i=i)
......@@ -45,11 +47,19 @@ class TestWhileOp(unittest.TestCase):
i = layers.zeros(shape=[1], dtype='int64')
i.stop_gradient = True
array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
array_len.stop_gradient = True
cond = layers.less_than(x=i, y=array_len)
j = layers.fill_constant(shape=[1], dtype='int64', value=1)
j.stop_gradient = True
array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
array_len2.stop_gradient = True
cond2 = layers.less_than(x=j, y=array_len2)
while_op = layers.While(cond=cond)
while_op2 = layers.While(cond=cond2)
with while_op.block():
d = layers.array_read(array=data_array, i=i)
prev = layers.array_read(array=mem_array, i=i)
......@@ -59,7 +69,16 @@ class TestWhileOp(unittest.TestCase):
layers.array_write(result, i=i, array=mem_array)
layers.less_than(x=i, y=array_len, cond=cond)
sum_result = layers.array_read(array=mem_array, i=i)
with while_op2.block():
d2 = layers.array_read(array=data_array, i=j)
prev2 = layers.array_read(array=mem_array, i=j)
result2 = layers.sums(input=[d2, prev2])
j = layers.increment(x=j, in_place=True)
layers.array_write(result2, i=j, array=mem_array)
layers.less_than(x=j, y=array_len2, cond=cond2)
sum_result = layers.array_read(array=mem_array, i=j)
loss = layers.mean(sum_result)
append_backward(loss)
......
......@@ -16,3 +16,4 @@ from __future__ import print_function
from .program_utils import *
from .ufind import *
from .checkport import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import socket
from contextlib import closing
def wait_server_ready(endpoints):
    """
    Wait until parameter servers are ready, use connect_ex to detect
    port readiness.

    Every endpoint is probed with a TCP connect (2 second timeout). If any
    endpoint refuses the connection, a message is written to stderr and the
    whole list is probed again after 3 seconds. The call only returns once
    all endpoints accepted a connection in the same round, so it blocks
    indefinitely while a server stays down.

    Args:
        endpoints (list): endpoints string list, like:
                         ["127.0.0.1:8080", "127.0.0.1:8081"]

    Examples:
        .. code-block:: python

             wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
    """
    while True:
        all_ok = True
        for ep in endpoints:
            # Split on the last colon only, so the port is always the
            # final field of the endpoint string.
            ip_port = ep.rsplit(":", 1)
            with closing(socket.socket(socket.AF_INET,
                                       socket.SOCK_STREAM)) as sock:
                sock.settimeout(2)
                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
            if result != 0:
                # One unreachable server already forces a retry; probing
                # the remaining endpoints would only add connect timeouts.
                all_ok = False
                break
        if all_ok:
            return
        sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
        sys.stderr.flush()
        time.sleep(3)
......@@ -113,27 +113,32 @@ def op_to_code(op):
inputs_str += ", "
inputs_str += "}"
attr_names = sorted(op.attr_names)
attrs_str = ""
for i in range(0, len(op.attr_names)):
name = op.attr_names[i]
for i in range(0, len(attr_names)):
name = attr_names[i]
attr_type = op.desc.attr_type(name)
if attr_type == core.AttrType.BLOCK:
a = "{name} = block[{value}]".format(
name=name, type=attr_type, value=op.block_attr_id(name))
attrs_str += a
if i != len(attr_names) - 1:
attrs_str += ", "
continue
if attr_type == core.AttrType.BLOCKS:
a = "{name} = blocks{value}".format(
name=name, type=attr_type, value=op.blocks_attr_ids(name))
attrs_str += a
if i != len(attr_names) - 1:
attrs_str += ", "
continue
a = "{name} = {value}".format(
name=name, type=attr_type, value=op.desc.attr(name))
attrs_str += a
if i != len(op.attr_names) - 1:
if i != len(attr_names) - 1:
attrs_str += ", "
if outputs_str != "{}":
......
......@@ -381,7 +381,7 @@ class DistributeTranspiler(object):
pserver_endpoints)
self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
def get_trainer_program(self):
def get_trainer_program(self, wait_port=True):
"""
Get transpiled trainer side program.
......@@ -393,6 +393,9 @@ class DistributeTranspiler(object):
delete_ops(self.origin_program.global_block(), self.optimize_ops)
self.origin_program.__str__()
if wait_port:
wait_server_ready(self.pserver_endpoints)
return self.origin_program
def _get_trainer_startup_program(self, recv_vars, eplist):
......
......@@ -65,8 +65,43 @@ class InferenceTranspiler(object):
if use_mkldnn:
self._fuse_conv_bias_mkldnn(program)
self._fuse_conv_relu_mkldnn(program)
self._fuse_conv_eltwise_mkldnn(program)
self._fuse_conv_relu_mkldnn(
program) # ResNet residual block merging
self._fuse_bn_relu_mkldnn(program)
def _fuse_conv_eltwise_mkldnn(self, program):
'''
Transpile the program fusing elementwise_add into conv for MKLDNN
program. Elementwise add following convolution OP can be fused by adding
'fuse_eltwise' attribute to convolution OP and replacing its output
Tensor with second parameter of elementwise_add.
The result of fuse is:
- before:
- conv->elementwise_add->any_other_op
- after:
- conv->any_other_op
:param program: program to transpile
:type program: Program
'''
self.block = program.block(0)
i = 0
while i < len(self.block.ops):
current_op = self.block.ops[i]
if current_op.type in ['conv2d']:
next_op = self.block.ops[i + 1]
if next_op.type == 'elementwise_add':
self._fuse_conv_eltwise(current_op, next_op)
self.block._remove_op(i + 1) # Remove elementwise_add
i = i + 1
self._adjust_input()
self._remove_unused_var()
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
def _fuse_conv_relu_mkldnn(self, program):
'''
Transpile the program by fused relu activation for MKLDNN program.
......@@ -88,9 +123,9 @@ class InferenceTranspiler(object):
if current_op.type in ['conv2d']:
next_op = self.block.ops[i + 1]
if next_op.type == 'relu':
# modify conv OP to include relu
# modify bnorm OP to include relu
current_op.set_attr("fuse_relu", True)
# remove conv OP
# remove relu OP
self.block._remove_op(i + 1)
i = i + 1
......@@ -409,6 +444,20 @@ class InferenceTranspiler(object):
outputs={"Output": out_var},
attrs=attrs)
def _fuse_conv_eltwise(self, conv_op, eltwise_op):
'''
fuse the conv op with elementwise_add
:param conv_op: convolution operator
:type conv_op: Operator
:param eltwise_op: operator adding data from skip connection
:type eltwise_op: Operator
'''
conv_op.set_attr("fuse_eltwise", True)
self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0]
self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0]
def _adjust_input(self):
for i in range(len(self.block.ops)):
current_op = self.block.ops[i]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册