diff --git a/CMakeLists.txt b/CMakeLists.txt index c2fa5420e916fd5958f6198d6e97c9b1092b5aa1..d43df124bdee2d568a0c09d5acd35d5ff96f4654 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,6 +69,7 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_INFERENCE "Compile fluid inference library" ON) +option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index ed696e82f8723eba573e8affd3f25e2aa6426e63..0d5c9652de6b814627e54018366137e214726619 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -140,5 +140,11 @@ def parse_args(): '--use_lars', action='store_true', help='If set, use lars for optimizers, ONLY support resnet module.') + parser.add_argument( + '--reduce_strategy', + type=str, + choices=['reduce', 'all_reduce'], + default='all_reduce', + help='Specify the reduce strategy, can be reduce, all_reduce') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 25622ee06c69e13181f34dfffadd5e299d31c8a8..ddd9fe809853a830ca676cc98f1819f683866def 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -170,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, strategy = fluid.ExecutionStrategy() strategy.num_threads = args.cpus strategy.allow_op_delay = False + build_strategy = fluid.BuildStrategy() + if args.reduce_strategy == "reduce": + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.AllReduce + avg_loss = train_args[0] if 
args.update_method == "pserver": @@ -184,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, avg_loss.name, main_program=train_prog, exec_strategy=strategy, + build_strategy=build_strategy, num_trainers=num_trainers, trainer_id=trainer_id) diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index cef8657ee629dcbc19221fd3440844a56627e920..f123e07fb711bd8ff67c1ecf5ec9a02c1e79eb1d 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -67,11 +67,14 @@ def cnn_model(data): def get_model(args, is_train, main_prog, startup_prog): # NOTE: mnist is small, we don't implement data sharding yet. - filelist = [ - os.path.join(args.data_path, f) for f in os.listdir(args.data_path) - ] + opt = None + data_file_handle = None with fluid.program_guard(main_prog, startup_prog): if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) + for f in os.listdir(args.data_path) + ] data_file_handle = fluid.layers.open_files( filenames=filelist, shapes=[[-1, 1, 28, 28], (-1, 1)], @@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog): if is_train: opt = fluid.optimizer.AdamOptimizer( learning_rate=0.001, beta1=0.9, beta2=0.999) - opt.minimize() + opt.minimize(avg_cost) if args.memory_optimize: fluid.memory_optimize(main_prog) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index d71b855612ae32083b2b2e3448db3749c340633b..1b3bfe659c7d97b58dc4121387d4db22266381c5 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -207,7 +207,7 @@ def get_model(args, is_train, main_prog, startup_prog): total_images = 1281167 / trainer_count - step = int(total_images / args.batch_size + 1) + step = int(total_images / (args.batch_size * args.gpus) + 1) epochs = [30, 60, 90] bd = [step * e for e in epochs] base_lr = args.learning_rate diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 
ac19b1651893f18b14c62a0986df75bed25d7e80..8f65a737c43a124c05574d6eb9c3050fdab5299a 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a DOC "Path to TensorRT library.") if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) + if(WITH_DSO) set(TENSORRT_FOUND ON) + endif(WITH_DSO) else() set(TENSORRT_FOUND OFF) endif() diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md index b810dc941d27fdb5004812ab58e105502e83280f..00650946ff2e658cfad0e63a8f1e008902a2d36e 100644 --- a/doc/fluid/dev/releasing_process_en.md +++ b/doc/fluid/dev/releasing_process_en.md @@ -1,6 +1,6 @@ # PaddlePaddle Releasing Process -PaddlePaddle manages its branches using "git-flow branching model", and [Semantic Versioning](http://semver.org/) as it's version number semantics. +PaddlePaddle manages its branches using Trunk Based Development, and [Semantic Versioning](http://semver.org/) as its version number semantics.
Each time we release a new PaddlePaddle version, we should follow the below steps: diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 842fde1ec5f16aeec28a790f1869eaed64e3516c..e362d3486487dd0b55e3e40d1c1358f2e5604ac5 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.InferenceTranspiler.__init__ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) @@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 
'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) -paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) +paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) -paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None)) +paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, 
keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 
abd5459f6d47da6d1341284916b419325dc5977c..a8e0c4a3fedfd56e38de7568be6b3f2e76a4b25f 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -28,10 +28,20 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) +if(WITH_GPU) + cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle + all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) +endif() + cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) -cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto) +if(WITH_GPU) + cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass) +else() + cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto) +endif() + cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index bf493a3fa44e48deec734250d04b2a413c3ed9da..7c5f5bd80a937bf1a1c891155764833d7b21c5c2 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent r("all_reduce", nullptr); + platform::RecordEvent 
record_event(Name(), dev_ctxes_.begin()->second); + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf..4fdab5cd94358d08eac7f8b041bf16d09042f0bd 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -15,12 +15,15 @@ #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (places_.size() == 1) return; // The input and output may have dummy vars. diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index d9fcd92427ef38b131b4ce782c0ada37765682db..e98f1ab148db083ac63a1afd43e334fbfae62539 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -32,6 +32,10 @@ struct ComputationOpHandle : public OpHandleBase { std::string Name() const override; + const Scope *GetScope() const { return scope_; } + + const platform::Place &GetPlace() const { return place_; } + protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index d44ebbae4d4be9c79e629303805c94030b8879db..8f319116ab80b75c624f35b0e1315e7362e88d9a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -127,6 +127,9 @@ static const char kLocalScopes[] = 
"local_scopes"; static const char kStrategy[] = "strategy"; void MultiDevSSAGraphBuilder::Init() const { + all_vars_.clear(); + balance_vars_.clear(); + loss_var_name_ = Get(kLossVarName); places_ = Get>(kPlaces); local_scopes_ = Get>(kLocalScopes); @@ -348,14 +351,31 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( size_t cur_device_id = 0; bool is_forwarding = true; + bool is_dist_train = false; for (ir::Node *node : sorted_ops) { if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - CreateRPCOp(&result, node); + int op_dev_id = CreateRPCOp(&result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); + } + } + is_dist_train = true; } else if (IsDistTrainOp(node, send_vars, recv_vars)) { - CreateDistTrainOp(&result, node); + int op_dev_id = CreateDistTrainOp(&result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set[op_dev_id].emplace(origin_param_name); + } } else if (IsScaleLossOp(node)) { // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != @@ -414,7 +434,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateReduceOp(&result, g_name, cur_device_id); graph->Get(kShardedVarDevice) .emplace(g_name, cur_device_id); - bcast_var_name_set[cur_device_id].emplace(p_name); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } break; case BuildStrategy::ReduceStrategy::kAllReduce: if (IsSparseGradient(g_name)) { @@ -436,14 +458,19 @@ std::unique_ptr 
MultiDevSSAGraphBuilder::ApplyImpl( } } } - bool use_gpu = false; #ifdef PADDLE_WITH_CUDA use_gpu = nccl_ctxs_ != nullptr; #endif - if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - // Insert BCast Ops + // Insert broadcast operators principle: + // 1. Broadcast optimized parameters in Reduce strategy; + // 2. No need broadcast optimized parameters in AllReduce strategy because of + // the optimization sub-graph would be run on every GPU; + // 3. Always broadcast received parameters in Distribute Training. + if ((use_gpu && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || + is_dist_train) { for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; for (auto &bcast_name : to_bcast_set) { @@ -675,8 +702,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; std::vector input_var_names; std::vector output_var_names; @@ -719,6 +746,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, node->Op()->Type()); CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { @@ -737,8 +765,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe.
@@ -824,6 +852,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); } } + return op_dev_id; } bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index ac6d9c5a64cfde60f75c76dae0a30cc7d735e996..47aaa80f4d66a48b729d0638badcab885a50585c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -40,12 +40,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t device_id) const; void Init() const; - private: - mutable std::string loss_var_name_; - mutable std::vector places_; - mutable std::vector local_scopes_; - mutable std::unordered_set grad_names_; - #ifdef PADDLE_WITH_CUDA mutable platform::NCCLContextMap *nccl_ctxs_; #endif @@ -54,8 +48,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { bool IsScaleLossOp(ir::Node *node) const; - void CreateRPCOp(ir::Graph *result, ir::Node *node) const; - void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; /** * Is this operator as the end-point operator before/after send operator. 
@@ -95,13 +89,17 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t GetAppropriateDeviceID( const std::vector &var_names) const; - private: + void SetCommunicationContext(OpHandleBase *op_handle, + const platform::Place &p) const; + + mutable std::string loss_var_name_; + mutable std::vector places_; + mutable std::vector local_scopes_; + mutable std::unordered_set grad_names_; + mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; mutable std::vector balance_vars_; - - void SetCommunicationContext(OpHandleBase *op_handle, - const platform::Place &p) const; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 6c7e5c1fb06620b1c071b00fcfcc1b4a29bf8d62..7fc06f234d42a992328c0b6164f17945d8075c28 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -27,7 +27,8 @@ namespace framework { namespace details { void ReduceOpHandle::RunImpl() { - platform::RecordEvent r("reduce", nullptr); + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (places_.size() == 1) return; // the input and output may have dummy var. auto in_var_handles = DynamicCast(inputs_); diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..71db8d952f4c205b875ad254dc19c0c1f74e61b3 --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_op_handle.h @@ -0,0 +1,123 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace details { + +using ReferenceCountMap = std::unordered_map; +using AtomicReferenceCountMap = + std::unordered_map>; +using DeviceReferenceCountMap = + std::unordered_map>; +using AtomicDeviceReferenceCountMap = + std::unordered_map>; +using DeviceGarbageCollectorMap = + std::unordered_map>>; + +class ReferenceCountOpHandle : public OpHandleBase { + public: + ReferenceCountOpHandle(ir::Node *node, const Scope *scope, + const platform::CUDAPlace &place, + const std::vector &var_names, + GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), + scope_(scope), + var_names_(var_names), + gc_(gc), + ref_cnts_(ref_cnts) { + dev_ctx_ = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + if (IsStreamGarabageCollector()) { + PADDLE_ENFORCE(cudaSetDevice(place.device)); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + } + + ~ReferenceCountOpHandle() { + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } + } + + std::string Name() const override { return "reference_count"; } + + protected: + void RunImpl() override { + auto *exec_scope = 
scope_->FindVar(kLocalExecScopeName)->Get(); + std::vector tensors; + for (auto &name : var_names_) { + auto it = ref_cnts_->find(name); + if (it == ref_cnts_->end()) continue; + + auto *var = exec_scope->FindVar(name); + if (var == nullptr || !var->IsType()) continue; + + if (it->second.fetch_sub(1) <= 1) { + tensors.emplace_back(var->GetMutable()); + } + } + + if (!tensors.empty()) { + ClearTensors(tensors); + } + } + + private: + void ClearTensors(const std::vector &tensors) { + auto *gc = dynamic_cast *>(gc_); + if (gc != nullptr) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = gc->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { + gc_->Add(tensors); + } + } + + bool IsStreamGarabageCollector() const { + return dynamic_cast *>(gc_) != nullptr; + } + + const Scope *scope_; + platform::CUDADeviceContext *dev_ctx_; + std::vector var_names_; + GarbageCollector *gc_; // not own + AtomicReferenceCountMap *ref_cnts_; // not own + cudaEvent_t event_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..344754d5a1e119c04cae08ad50126924b5824315 --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass.h" + +namespace paddle { +namespace framework { +namespace details { + +std::unique_ptr ReferenceCountPass::ApplyImpl( + std::unique_ptr graph) const { + auto &ref_cnts = Get(kGlobalReferenceCount); + auto &cur_ref_cnts = Get(kCurReferenceCount); + auto &gcs = Get(kGarbageCollector); + + // It is not easy to find the right reference counts of variables in graph + // Step 1: Find all variables in computation ops + // Step 2: Find all variables in non-computation ops which refers to variables + // in computation ops + std::unordered_set names; + auto get_ref_cnts_from_compute_op = [&]( + const std::unique_ptr &op, + const std::vector &vars) { + std::vector var_names_in_op; + auto *compute_op = dynamic_cast(op.get()); + if (compute_op == nullptr || + !platform::is_gpu_place(compute_op->GetPlace())) + return var_names_in_op; + auto place = boost::get(compute_op->GetPlace()); + for (VarHandleBase *var_handle_base : vars) { + auto *var_handle = dynamic_cast(var_handle_base); + if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; + + if (!platform::is_gpu_place(var_handle->place_) || + boost::get(var_handle->place_) != place) + continue; + + VarDesc *var_desc = var_handle->Node()->Var(); + auto var_name = var_handle->Node()->Name(); + + // This is weird but there really are some variables without var_desc + // in
computation_op + if (var_desc == nullptr) { + if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr) + continue; + } else { + if (var_desc->Persistable() || + var_desc->Proto()->type().type() != proto::VarType::LOD_TENSOR) + continue; + } + + // compute op only runs in one device + if (ref_cnts[place.device]->count(var_name)) + ++(*ref_cnts[place.device])[var_name]; + else + (*ref_cnts[place.device])[var_name] = 1; + + names.insert(var_name); + var_names_in_op.push_back(var_name); + } + return var_names_in_op; + }; + + auto update_ref_cnts_from_non_compute_op = [&]( + const std::unique_ptr &op, + const std::vector &vars) { + if (dynamic_cast(op.get()) != nullptr) return; + for (VarHandleBase *var_handle_base : vars) { + auto *var_handle = dynamic_cast(var_handle_base); + if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; + + auto var_name = var_handle->Node()->Name(); + auto var_place = var_handle->place_; + if (!platform::is_gpu_place(var_place)) continue; + auto place = boost::get(var_place); + if (names.count(var_name) == 0) continue; + if (ref_cnts.count(place.device) && + ref_cnts[place.device]->count(var_name)) { + ++(*ref_cnts[place.device])[var_name]; + } + } + }; + + std::unordered_map + compute_ref_cnt_map; + auto &all_ops = graph->Get(kGraphOps); + for (auto &op : all_ops) { + auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); + auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs()); + if (in_var_names.empty() && out_var_names.empty()) continue; + in_var_names.insert(in_var_names.end(), out_var_names.begin(), + out_var_names.end()); + auto *compute_op = dynamic_cast(op.get()); + auto place = boost::get(compute_op->GetPlace()); + ir::Node *ref_cnt_node = + graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation); + auto *ref_cnt_handle = new ReferenceCountOpHandle( + ref_cnt_node, compute_op->GetScope(), place, in_var_names, + gcs[place.device].get(), 
cur_ref_cnts[place.device].get()); + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + compute_op->AddOutput(dep_var); + ref_cnt_handle->AddInput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + compute_ref_cnt_map[compute_op] = ref_cnt_handle; + } + + for (auto &op : all_ops) { + update_ref_cnts_from_non_compute_op(op, op->Inputs()); + update_ref_cnts_from_non_compute_op(op, op->Outputs()); + } + + std::vector> new_all_ops; + new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size()); + for (auto &op : all_ops) { + new_all_ops.emplace_back(std::move(op)); + auto it = compute_ref_cnt_map.find(new_all_ops.back().get()); + if (it != compute_ref_cnt_map.end()) { + new_all_ops.emplace_back(it->second); + } + } + + all_ops.swap(new_all_ops); + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(reference_count_pass, + paddle::framework::details::ReferenceCountPass) + .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount) + .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..7081280b0600b9c1985987d02d679c298ad4b8bd --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/reference_count_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +constexpr char kGlobalReferenceCount[] = "reference_count"; +constexpr char kCurReferenceCount[] = "current_reference_count"; +constexpr char kGarbageCollector[] = "garbage_collector"; + +class ReferenceCountPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 609e18581957f62b040e04e937873b7a8fa5785a..ba243979b34aa1f683de707525403becaf0a1c00 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() { ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(1) << place_ << "RUN Scale loss grad op"; + VLOG(10) << place_ << "RUN Scale loss grad op"; }); #endif } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 5bd974d6b789a2f085c0a69de5e133187342f587..e5b1eaa7318aecde1dbf89de8fe242a3008db97c 100644 --- 
a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -18,6 +18,9 @@ #include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/framework/details/reference_count_op_handle.h" +#endif namespace paddle { namespace framework { @@ -65,12 +68,28 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); drop_scope_counter_ += 1; + +#ifdef PADDLE_WITH_CUDA + const std::string gc_name = "garbage_collector"; + DeviceGarbageCollectorMap *gc = + Graph().Has(gc_name) ? &(Graph().Get(gc_name)) + : nullptr; +#endif + if (!fetch_tensors.empty() || drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); +#ifdef PADDLE_WITH_CUDA + if (gc != nullptr && platform::is_gpu_place(p)) { + auto gpu_place = boost::get(p); + auto &gc_at_place = gc->at(gpu_place.device); + gc_at_place->Wait(); + gc_at_place->Reset(); + } +#endif } for (auto &scope : local_scopes_) { auto &local_scope = diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index dad170ed78c64202b5c812bd8682887fe3b736d6..8d8042a0563a21dad216ffd53a474322c378ace6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -37,7 +37,11 @@ int kProgramId = -1; ExecutorPrepareContext::ExecutorPrepareContext( const framework::ProgramDesc& prog, size_t block_id) - : prog_(prog), block_id_(block_id) {} + : prog_(prog), block_id_(block_id) { + if (GetEagerDeletionThreshold() >= 0) { + ref_cnts_ = GetNonPersistableReferenceCount(prog_, block_id_); + } +} ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; @@ -329,15 +333,81 @@ void 
Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, CreateVariables(ctx->prog_, local_scope, ctx->block_id_); } + int64_t max_memory_size = GetEagerDeletionThreshold(); + + std::unique_ptr> gc; + if (max_memory_size >= 0) { + ctx->ResetReferenceCount(); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + gc.reset(new DefaultStreamGarbageCollector( + boost::get(place_), max_memory_size)); + } else { +#endif + gc.reset(new CPUGarbageCollector( + boost::get(place_), max_memory_size)); +#ifdef PADDLE_WITH_CUDA + } +#endif + } + for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); + if (gc != nullptr) { + std::vector erase_vars; + for (auto& input : op->Inputs()) { + for (auto& input_name : input.second) { + auto it = ctx->cur_ref_cnts_.find(input_name); + if (it == ctx->cur_ref_cnts_.end()) continue; + if (it->second == 1) { // should delete it + erase_vars.emplace_back(input_name); + ctx->cur_ref_cnts_.erase(input_name); + } else { + --(it->second); + } + } + } + + for (auto& output : op->Outputs()) { + for (auto& output_name : output.second) { + auto it = ctx->cur_ref_cnts_.find(output_name); + if (it == ctx->cur_ref_cnts_.end()) continue; + if (it->second == 1) { + erase_vars.emplace_back(output_name); + ctx->cur_ref_cnts_.erase(output_name); + } else { + --(it->second); + } + } + } + + if (!erase_vars.empty()) { + std::vector erase_tensors; + for (auto& name : erase_vars) { + auto* var = local_scope->FindVar(name); + if (var == nullptr) continue; + if (var->IsType()) { + auto* tensor = var->GetMutable(); + erase_tensors.push_back(tensor); + } + } + if (!erase_tensors.empty()) gc->Add(erase_tensors); + } + } + if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " << memory::memory_usage(place_); } } - platform::DeviceContextPool::Instance().Get(place_)->Wait(); + + if (gc != nullptr) { + gc->Wait(); + } else { + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + } + if 
(local_scope != scope) { scope->DeleteScope(local_scope); } else { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index f95808c199b9de693ec653c29374c9130be7fd59..f0cc1338a8af50030a70a9797cbcd1b0567272b5 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -27,13 +28,58 @@ namespace paddle { namespace framework { extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); +template +std::unordered_map GetNonPersistableReferenceCount( + const ProgramDesc& prog, size_t block_id) { + auto& block = prog.Block(block_id); + std::unordered_set ignored_vars; + std::unordered_map ref_cnts; + + for (auto var_desc : block.AllVars()) { + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) { + ignored_vars.insert(var_desc->Name()); // ignore persistable vars + } + } + + for (auto op_desc : block.AllOps()) { + for (auto& input : op_desc->Inputs()) { + for (auto& input_name : input.second) { + if (!ignored_vars.count(input_name)) { + if (ref_cnts.count(input_name)) + ++ref_cnts[input_name]; + else + ref_cnts[input_name] = 1; + } + } + } + + for (auto& output : op_desc->Outputs()) { + for (auto output_name : output.second) { + if (!ignored_vars.count(output_name)) { + if (ref_cnts.count(output_name)) + ++ref_cnts[output_name]; + else + ref_cnts[output_name] = 1; + } + } + } + } + return ref_cnts; +} + struct ExecutorPrepareContext { ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); ~ExecutorPrepareContext(); + void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } + const framework::ProgramDesc& prog_; size_t block_id_; std::vector> 
ops_; + + std::unordered_map ref_cnts_; + std::unordered_map cur_ref_cnts_; }; class Executor { diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h new file mode 100644 index 0000000000000000000000000000000000000000..b403252c972d26da6deeca54ce88a9547ffe7afa --- /dev/null +++ b/paddle/fluid/framework/garbage_collector.h @@ -0,0 +1,163 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include // NOLINT +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +// T should have memory_size() and clear() method +template +class GarbageCollector { + public: + GarbageCollector(const platform::Place &place, size_t max_memory_size) + : max_memory_size_(std::max(max_memory_size, static_cast(1))) { + garbages_.reset(new std::deque()); + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); + } + + virtual ~GarbageCollector() {} + + void Reset() { + std::lock_guard guard(mutex_); + garbages_.reset(new std::deque()); + cur_memory_size_ = 0; + } + + template + void Add(const Container &objs) { + Add(objs, []() {}); + } + + template + void Add(const Container &objs, Callback &&callback) { + std::shared_ptr> clear_deque; + { + std::lock_guard guard(mutex_); + for (auto *obj : objs) { + garbages_->push_back(obj); + cur_memory_size_ += obj->memory_size(); + } + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + clear_deque = garbages_; + garbages_.reset(new std::deque()); + } + } + + if (clear_deque != nullptr) { + callback(); + ClearCallback([=]() { + for (auto *obj : *clear_deque) obj->clear(); + }); + } + } + + virtual void Wait() const {} + + protected: + virtual void ClearCallback(const std::function &callback) = 0; + + platform::DeviceContext *dev_ctx_; + std::shared_ptr> garbages_; + mutable std::mutex mutex_; + const size_t max_memory_size_; + size_t cur_memory_size_ = 0; +}; + +template +class CPUGarbageCollector : public GarbageCollector { + public: + CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + + protected: + void ClearCallback(const std::function &callback) override { + callback(); + } +}; + +#ifdef PADDLE_WITH_CUDA +template +class DefaultStreamGarbageCollector : public GarbageCollector { + public: + DefaultStreamGarbageCollector(const 
platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + + cudaStream_t stream() const { + return static_cast(this->dev_ctx_) + ->stream(); + } + + void Wait() const override { + this->dev_ctx_->Wait(); + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); + } + + protected: + void ClearCallback(const std::function &callback) override { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); + } +}; + +template +class StreamGarbageCollector : public GarbageCollector { + public: + StreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + PADDLE_ENFORCE(cudaSetDevice(place.device)); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + callback_manager_.reset(new platform::StreamCallbackManager(stream_)); + } + + ~StreamGarbageCollector() { + auto place = boost::get(this->dev_ctx_->GetPlace()); + PADDLE_ENFORCE(cudaSetDevice(place.device)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); + } + + void Wait() const override { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + std::lock_guard guard(this->mutex_); + callback_manager_->Wait(); + } + + cudaStream_t stream() const { return stream_; } + + protected: + void ClearCallback(const std::function &callback) override { + std::lock_guard guard(this->mutex_); + callback_manager_->AddCallback(callback); + } + + private: + cudaStream_t stream_; + std::unique_ptr callback_manager_; +}; +#endif + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index 4408cb45acb3d46e1addf5c25c238af50e5f5e5f..09c5ec59d66445bdbd5349447b125be89cb2efdf 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -58,7 +58,7 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( 
desc.SetInput("Input", std::vector({conv_relu_i_in})); desc.SetInput("Filter", std::vector({conv_relu_w_in})); desc.SetInput("Bias", std::vector({conv_relu_b_in})); - desc.SetOutput("Out", std::vector({conv_relu_out})); + desc.SetOutput("Output", std::vector({conv_relu_out})); desc.SetType("conv2d"); for (auto& attr : conv->Op()->GetAttrMap()) { desc.SetAttr(attr.first, attr.second); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index ae8496204d4aeb88c04154d571325d440274e821..ab687e760a761d4e445726bd5149966adc2403d0 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -94,6 +94,14 @@ class Graph { }; } + template + void SetNotOwned(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph", + attr_name); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = []() {}; + } + const std::unordered_set &Nodes() const { return node_set_; } // Create a normal variable with non-null VarDesc. 
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 371384dc56eec91db1f621c0ebb65113e7a5a5cc..1a8d9cefbfa570d2ac3f4fc32d50d705ddc67a75 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -429,7 +429,7 @@ struct LSTM : public PatternBase { struct GRU : public PatternBase { GRU(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "lstm") {} + : PatternBase(pattern, name_scope, "gru") {} PDNode* operator()(PDNode* x); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 5b8c75a93de2ddd8f7260d2191c22a5945b3d2d9..dbc3ff8657a1f2238951a791fb5ac3356c885770 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -188,6 +188,30 @@ ParallelExecutor::ParallelExecutor( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, build_strategy, member_->nccl_ctxs_.get()); + + auto max_memory_size = GetEagerDeletionThreshold(); + if (max_memory_size >= 0) { + for (auto &place : member_->places_) { + if (!platform::is_gpu_place(place)) continue; + auto gpu_place = boost::get(place); + if (gcs_[gpu_place.device] == nullptr) { + ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); + cur_ref_cnts_[gpu_place.device].reset( + new details::AtomicReferenceCountMap()); + gcs_[gpu_place.device].reset( + new StreamGarbageCollector(gpu_place, max_memory_size)); + } + } + if (!gcs_.empty()) { + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + graph = ref_cnt_pass->Apply(std::move(graph)); + 
graph->SetNotOwned("garbage_collector", &gcs_); + } + } #else std::unique_ptr graph = ApplyParallelExecutorPass( main_program, member_->places_, loss_var_name, params, @@ -209,30 +233,9 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToDevices( const std::unordered_set &vars) const { - // the initializing bcast, all vars would be bcast from device(0), - // otherwise - // bcast from the specified device. - bool initializing = member_->executor_ ? false : true; + // the initializing bcast, all vars would be bcast from device(0). for (auto &var : vars) { - int var_dev_id = -1; - if (member_->executor_) { - auto &sharded_var_device = - member_->executor_->Graph().Get( - details::kShardedVarDevice); - if (sharded_var_device.find(var) != sharded_var_device.end()) { - var_dev_id = sharded_var_device.at(var); - } - } - - if (!initializing && var_dev_id == -1) continue; - - framework::Variable *main_var = nullptr; - if (initializing) { - main_var = member_->local_scopes_[0]->FindVar(var); - } else { - main_var = member_->local_scopes_[var_dev_id]->FindVar(var); - } - + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); if (main_var == nullptr || !main_var->IsType()) { continue; } @@ -248,8 +251,7 @@ void ParallelExecutor::BCastParamsToDevices( auto place = member_->places_[i]; void *buffer; - if ((initializing && i == 0) || - (!initializing && static_cast(i) == var_dev_id)) { + if (i == 0) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; @@ -266,29 +268,18 @@ void ParallelExecutor::BCastParamsToDevices( platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); - if (initializing) { - platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); - } else { - if (var_dev_id >= 0) { - platform::dynload::ncclBcast(buffers[i], numel, data_type, - 
var_dev_id, nccl_ctx.comm_, - nccl_ctx.stream()); - } - } + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); } member_->nccl_ctxs_->WaitAll(); } - #else PADDLE_THROW("Not compiled with CUDA"); #endif } else { platform::CPUPlace cpu; for (size_t i = 0; i < member_->places_.size(); ++i) { - if ((initializing && i == 0) || - (!initializing && static_cast(i) == var_dev_id)) - continue; + if (i == 0) continue; auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); @@ -310,6 +301,11 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); +#ifdef PADDLE_WITH_CUDA + if (!gcs_.empty()) { + ResetReferenceCount(); + } +#endif auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; @@ -367,3 +363,6 @@ USE_PASS(graph_viz_pass); USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); +#ifdef PADDLE_WITH_CUDA +USE_PASS(reference_count_pass); +#endif diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5fb748fa205d5e9dbd2943b615c69aedd0e7a26f..c64906ff230df5f2b7cc9f5c6b29d68956ab8f33 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include #include "paddle/fluid/framework/details/execution_strategy.h" @@ -27,6 +29,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/framework/details/reference_count_pass.h" +#endif + namespace paddle { namespace framework { @@ -66,10 +72,27 @@ class ParallelExecutor { void Run(const std::vector &fetch_tensors, const std::string &fetched_var_name); + private: void BCastParamsToDevices(const std::unordered_set &vars) const; - private: ParallelExecutorPrivate *member_; + +#ifdef PADDLE_WITH_CUDA + // ref_cnts_ is only initialized when ParallelExecutor constructs, and then + // keeps unchanged + // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_ + details::DeviceReferenceCountMap ref_cnts_; + details::AtomicDeviceReferenceCountMap cur_ref_cnts_; + details::DeviceGarbageCollectorMap gcs_; + + void ResetReferenceCount() { + for (auto &pair1 : ref_cnts_) { + for (auto &pair2 : *(pair1.second)) { + (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second; + } + } + } +#endif }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 2be655b89a4caf2bf9874dcab6bc0bdb2856a026..1a727a2c8c759d010606d5b605823b7252b35c69 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -31,9 +31,21 @@ DEFINE_bool( "Delete local scope eagerly. It will reduce GPU memory usage but " "slow down the destruction of variables.(around 1% performance harm)"); +DEFINE_double( + eager_delete_tensor_gb, -1.0, + "Memory size threshold (GB) when the garbage collector clear tensors." + "Disabled when this value is less than 0"); + namespace paddle { namespace framework { +int64_t GetEagerDeletionThreshold() { + return FLAGS_eager_delete_tensor_gb < 0 + ? 
-1 + : static_cast(FLAGS_eager_delete_tensor_gb * + (static_cast(1) << 30)); +} + Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index b6165a595d537c314a95685e8b1edbc42e387ab7..e42fff1d79d92fb7ed61768a614d8cd98f6775a0 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -26,6 +26,8 @@ limitations under the License. */ namespace paddle { namespace framework { +int64_t GetEagerDeletionThreshold(); + class Scope; /** diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 4cf95fa0ae07823289fbf337062190f05e6c6bcf..f1d268548578fea12082e2edb213a3749eccbfaf 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -151,6 +151,8 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } + void clear() { holder_ = nullptr; } + private: /** * @note Placeholder hides type T, so it doesn't appear as a template diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index efb91bcf75a3cb99a67d5a3251b1d42fc4b04170..6698efd1fa773127a84b4bcb28f57f4226dd7ae2 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -17,9 +17,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # paddle_fluid_origin exclude inference api interface cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) -#if(APPLE) - add_subdirectory(api) -#endif() +add_subdirectory(api) # Create static library cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor) @@ -57,5 +55,7 @@ endif() if(WITH_TESTING) # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) - add_subdirectory(tests/api) + if(WITH_INFERENCE_API_TEST) + add_subdirectory(tests/api) + endif() endif() diff --git 
a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 6dc39cae0522efd48c2e2921611adebd6937ddf7..8a8aeb5e09a0d9a6746f6d6d61c547363e0e2d30 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -69,8 +69,9 @@ class DfgPassManagerImpl final : public DfgPassManager { if (FLAGS_IA_enable_tensorrt_subgraph_engine) { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( - {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax", - "depthwise_conv2d", "batch_norm", "concat"}); + {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "depthwise_conv2d", "batch_norm", "concat", "tanh", + "elementwise_add", "dropout"}); if (!node->IsFunction()) return false; const auto* func = static_cast(node); diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 399afbe64a56393176795ecdd1ac70bfedd5c91a..9bdbefc07cbc4bf7a4714927c84855837610430e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry { "mul_gru_fuse_pass", // "seq_concat_fc_fuse_pass", // "fc_fuse_pass", // +#ifdef PADDLE_WITH_MKLDNN + "conv_relu_mkldnn_fuse_pass", // +#endif }}; std::unordered_set disabled_ir_passes_; diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 100a7504b8526b3587858dd7783913757ba09895..8c7d58678fd29cb25d13d64a08e6c6f26f242d8b 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT } return false; }; + for (auto &node : graph) { for (auto *in : node->inlinks) { // The Value that is written by nodes inside a sub-graph shouldn't be the @@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector 
&graph) { // NOLINT std::vector(outputs.begin(), outputs.end())); } +// Filter the Intermediate results of the subgraph node. void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { std::vector op_nodes; for (auto &node : GraphTraits(*graph).nodes_in_TS()) { @@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { for (auto *out : op_nodes[i]->outlinks) { if (follow_up_input_names.count(out->name())) { filtered_subgraph_outlinks.push_back(out); + } else { + out->SetDeleted(); } } - PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL); + // The filtered_subgraph_outlinks may be empty. op_nodes[i]->outlinks = filtered_subgraph_outlinks; } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 8579845d51e80d73d220465d25b70944f5ad9bf2..5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, // collect inputs std::unordered_set input_names; + std::unordered_set input_names_with_id; for (auto *x : func->inlinks) { input_names.insert(x->name()); + input_names_with_id.insert(x->name() + std::to_string(x->id())); } desc.SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); std::unordered_set output_names; + std::unordered_set output_names_with_id; for (auto *x : func->outlinks) { output_names.insert(x->name()); + output_names_with_id.insert(x->name() + std::to_string(x->id())); } - std::vector output_temp(output_names.begin(), - output_names.end()); - desc.SetOutput("Ys", output_temp); + desc.SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); desc.SetType("tensorrt_engine"); std::unordered_map output_name_map; @@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph 
&graph, std::vector replaced_names; for (int k = 0; k < in_var->arguments_size(); k++) { std::string arg_value = in_var->arguments(k); - if (input_names.count(arg_value)) { + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { replaced_names.push_back(arg_value); } else { - replaced_names.push_back(arg_value + - std::to_string(var2id[arg_value])); + replaced_names.push_back(arg_value_with_id); } } in_var->clear_arguments(); @@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, std::vector replaced_names; for (int k = 0; k < out_var->arguments_size(); k++) { std::string arg_value = out_var->arguments(k); - if (output_names.count(arg_value)) { - output_name_map[arg_value] = - arg_value + std::to_string(var2id[arg_value]); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id.count(arg_value_with_id)) { + output_name_map[arg_value] = arg_value_with_id; } - replaced_names.push_back(arg_value + std::to_string(var2id[arg_value])); + replaced_names.push_back(arg_value_with_id); } out_var->clear_arguments(); for (size_t k = 0; k < replaced_names.size(); k++) { diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 670a8de667494c655bed15aa3e4ce8265448635a..b879067d2f2f6294c50e0adb21f9399a7c36698a 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -74,13 +74,141 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; } +// This is a simple representation of a graph. +// The BriefNode hold the pointer of the Node. +// This is to avoid changing the original graph +// in the process of trt graph analysis. 
+struct BriefNode { + explicit BriefNode(Node *n) { node = n; } + Node *node; + std::vector inlinks; + std::vector outlinks; +}; + +// Union two adjacent BriefNode. +// Suppose we have two adjacent nodes src and dst. +// We will perform the following operations: +// 1. add all inputs(except src) of dst to src inlinks. +// 2. add all outputs of dst to src outlinks. +// 3. change all the dst's inputs and outputs +// corresponding inlinks and outlinks to src node. +// 4. delete all dst's inlinks and outlinks. +void UnionContractedNodes(const std::unordered_map &node_map, + int src_id, int dst_id) { + // merge the two adjacent nodes into one node. + BriefNode *src_node = node_map.at(src_id); + BriefNode *dst_node = node_map.at(dst_id); + + std::unordered_set inputs(src_node->inlinks.begin(), + src_node->inlinks.end()); + std::unordered_set outputs; + + for (auto *n : src_node->outlinks) { + if (n != dst_node) outputs.insert(n); + } + + // Add the inlinks and outlinks of dst node to src node. + std::vector dst_in_nodes = dst_node->inlinks; + for (BriefNode *node : dst_in_nodes) { + if (node != src_node) { + inputs.insert(node); + } + } + + std::vector dst_out_nodes = dst_node->outlinks; + for (BriefNode *node : dst_out_nodes) { + outputs.insert(node); + } + +// update the dst and src node's inlinks and outlinks. 
+#ifdef __clang__ + src_node->inlinks = std::vector(inputs.begin(), inputs.end()); + src_node->outlinks = std::vector(outputs.begin(), outputs.end()); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#else + src_node->inlinks = + std::move(std::vector(inputs.begin(), inputs.end())); + src_node->outlinks = + std::move(std::vector(outputs.begin(), outputs.end())); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#endif + + auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (n == src_node || n == dst_node) { + n = src_node; + } + } + }; + // Change all the dst inputs and outputs corresponding inlink and + // outlink to the src node. + for (auto *node : src_node->inlinks) { + inlink_or_outlink_cleaner(node->outlinks); + } + + for (auto *node : src_node->outlinks) { + inlink_or_outlink_cleaner(node->inlinks); + } +} + +// FlexibleDFS +// If reverse is true, do reverse dfs. +// If enter func is not nullptr, calls enter(node) before visiting any children +// of node. +// If leave func not nullptr, calls leave(node) after visiting all parents of +// node. +void FlexibleDFS(const std::vector &source, bool reverse, + const std::function &enter, + const std::function &leave) { + typedef struct { + const BriefNode *node; + bool leave; + } FNode; + + std::vector stack; + for (auto &node : source) { + stack.push_back(FNode{node, false}); + } + std::unordered_set visited; + while (!stack.empty()) { + auto fnode = stack.back(); + stack.pop_back(); + + if (fnode.leave) { + if (leave && !leave(fnode.node)) return; + } + if (visited.count(fnode.node)) continue; + visited.insert(fnode.node); + + if (enter && !enter(fnode.node)) return; + + if (leave) stack.push_back(FNode{fnode.node, true}); + const std::vector iter_nodes = + reverse == true ? 
fnode.node->inlinks : fnode.node->outlinks; + for (const BriefNode *node : iter_nodes) { + if (!visited.count(node)) { + stack.push_back(FNode{node, false}); + } + } + } +} + std::vector> SubGraphSplitter::ExtractSubGraphs() { + // Run the Extract algorithm to find all subgraphs. std::vector marked_nodes; + // We use brief_node_map to represent the original graph in order to avoid + // changing the original graph. + std::unordered_map brief_node_map; + for (auto &node : GraphTraits(*graph_).nodes_in_TS()) { + brief_node_map[node.id()] = new BriefNode(&node); if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } } + // extract sub-graphs in the marked node set, use Union Find algorithm. node_map_t node_map; // id to ptr for (auto *n : marked_nodes) { @@ -88,11 +216,73 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { n->attr(kUnionFindParent).Int32() = n->id(); node_map[n->id()] = n; } - std::unordered_set visited; - for (auto *n : marked_nodes) { - for (auto *out : n->outlinks) { - if (node_map.count(out->id())) { - UnionFindCombine(node_map, n->id(), out->id()); + + // create breif node map + for (auto &itr : brief_node_map) { + for (Node *node : itr.second->node->inlinks) { + itr.second->inlinks.push_back(brief_node_map[node->id()]); + } + + for (Node *node : itr.second->node->outlinks) { + itr.second->outlinks.push_back(brief_node_map[node->id()]); + } + } + + for (auto &itr : brief_node_map) { + BriefNode *brief_node = itr.second; + + if (!brief_node->node->attr(kMarkerAttrName).Bool()) { + VLOG(4) << brief_node->node->id() << " node not a trt candicate."; + continue; + } + + // Our algorithm must guarantee that: + // 1. The graph is always directed acyclic graph(DAG). + // 2. If there is a path in the subgraph from X to Y (X and Y are both + // nodes in the subgraph), then all paths from X to Y are in the + // subgraph. + // + // In order to achieve the above guarantee. + // For adjacent nodes src -> dst. + // 1. 
Get all dst input nodes except src. + // 2. Reverse DFS from those input nodes + // 3. If there is a path from input nodes to src, + // then the src and dst nodes can not be fused into one node, + // otherwise it can be done. + + while (true) { + std::unordered_set contract_nodes; + for (auto *out : brief_node->outlinks) { + // must be an trt candidate + if (!out->node->attr(kMarkerAttrName).Bool()) continue; + // get all dst input nodes except src. + std::vector source_nodes; + for (auto *n : out->inlinks) { + if (n != brief_node) { + source_nodes.push_back(n); + } + } + + // Reverse DFS from the source_nodes. + bool have_excess_path = false; + FlexibleDFS(source_nodes, true, nullptr, + [&have_excess_path, brief_node](const BriefNode *n) { + if (n == brief_node) { + have_excess_path = true; + return false; + } + return true; + }); + if (have_excess_path) continue; + contract_nodes.insert(out); + } + if (contract_nodes.empty()) break; + + for (auto dst_node : contract_nodes) { + UnionFindCombine(node_map, brief_node->node->id(), + dst_node->node->id()); + UnionContractedNodes(brief_node_map, brief_node->node->id(), + dst_node->node->id()); } } } @@ -128,6 +318,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto io = ExtractInputAndOutputOfSubGraph(subgraph); block_node->inlinks = std::move(io.first); block_node->outlinks = std::move(io.second); + for (auto *node : subgraph) { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 39cc433b40fad17f4f12359d4e907a250a88bd63..531a170512f727d891aa6644ee08a60c25f16876 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) { // At least one nodes should be deleted. 
ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock - ASSERT_EQ(6, count1); + ASSERT_EQ(11, count1); } } // namespace analysis diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 5df486f345a98d7737d326c94e4854d24535ff61..e569df94c54c304852dab7c7496804c1b08d665c 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -69,25 +69,4 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI endfunction() anakin_target(inference_anakin_api) anakin_target(inference_anakin_api_shared) - if (WITH_TESTING) - # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later. - set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com") - set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin") - set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt") - execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}") - execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N") - execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N") - if(WITH_GPU) - set(anakin_test_extra_deps dynload_cuda) - set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin") - execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N") - cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc - ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL) - endif() - cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc - ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin - --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt - DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL) - endif(WITH_TESTING) endif() diff --git 
a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2a9a7aed480e76edbac4d5ba6d7bc3b8b2dc5006..684e0ce0e292d852d4601ebd1ccd920382e42c8b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -77,6 +77,9 @@ bool AnalysisPredictor::Init( OptimizeInferenceProgram(); ctx_ = executor_->Prepare(*inference_program_, 0); + if (config_._use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } VLOG(5) << "to create variables"; PADDLE_ENFORCE(scope_.get()); diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 5f1e1b548c7b7daa66932571d7053701bc0bd1f6..c71769a32f604358fe68c927546591310649f116 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -64,13 +64,15 @@ PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { void PaddleBuf::Resize(size_t length) { // Only the owned memory can be reset, the external memory can't be changed. 
- if (length_ == length) return; + if (length_ >= length) return; if (memory_owned_) { Free(); + data_ = malloc(length); + length_ = length; + memory_owned_ = true; + } else { + PADDLE_THROW("The memory is allocated externally, can not Resized"); } - data_ = new char[length]; - length_ = length; - memory_owned_ = true; } void PaddleBuf::Reset(void* data, size_t length) { @@ -82,8 +84,8 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { - assert(length_ > 0); - delete[] static_cast(data_); + PADDLE_ENFORCE_GT(length_, 0); + free(static_cast(data_)); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 6fe13ed027de403bdc21882c26225bcd4cc7e49a..2e9e10139fa7008a46c3782960dfd44d3228cc26 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init( } ctx_ = executor_->Prepare(*inference_program_, 0); + if (config_._use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } executor_->CreateVariables(*inference_program_, sub_scope_ ? 
sub_scope_ : scope_.get(), 0); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index abee375313850f1490bacec11f737706c061a5e9..d9d6e139b8735c8f07c52f63c70b6b9805e03642 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -153,11 +153,21 @@ CreatePaddlePredictor( } // namespace paddle USE_TRT_CONVERTER(elementwise_add_weight); +USE_TRT_CONVERTER(elementwise_add_tensor); +USE_TRT_CONVERTER(elementwise_sub_tensor); +USE_TRT_CONVERTER(elementwise_div_tensor); +USE_TRT_CONVERTER(elementwise_mul_tensor); +USE_TRT_CONVERTER(elementwise_max_tensor); +USE_TRT_CONVERTER(elementwise_min_tensor); +USE_TRT_CONVERTER(elementwise_pow_tensor); USE_TRT_CONVERTER(mul); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(sigmoid); +USE_TRT_CONVERTER(tanh); USE_TRT_CONVERTER(fc); USE_TRT_CONVERTER(pool2d); USE_TRT_CONVERTER(softmax); USE_TRT_CONVERTER(batch_norm); USE_TRT_CONVERTER(concat); +USE_TRT_CONVERTER(dropout); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index f6893be428feacbba85bab380e22972848eaeb93..8e359a67738c0df180933421b45f15b39fd0e78c 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) { } void PrintTime(int batch_size, int repeat, int num_threads, int tid, - double latency) { + double latency, int epoch = 1) { LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid << ", latency: " << latency << "ms ======"; + if (epoch > 1) { + int samples = batch_size * epoch; + LOG(INFO) << "====== sample number: " << samples + << ", average latency of each sample: " << latency / samples + << "ms ======"; + } } } // namespace inference diff --git 
a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 995da11e4a30eca72a91a53d3293aa8b033b012b..55a07ca705f9fafa9ea223a867300bd14e10c364 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -45,7 +45,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config { bool use_gpu{false}; int device{0}; float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. + // NOTE: NOT use it, just for the internal test, will discard later + bool _use_mkldnn{false}; // Specify the variable's name of each input. bool specify_input_name{false}; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 9d7be2d03cf7bb12afe7e52d9630f184d689dc25..fac1babf6ec6131f84d3e3b9fc6efedd9f9f6cfc 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,7 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc +batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -24,6 +24,8 @@ nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL) nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine 
batch_norm_op SERIAL) - nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL) + +nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index e1cace9cc1b06f036f52e82b7b86c99a02d50f50..e73c5bbf57501e4ff3c080a46d91685035652bfa 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -19,28 +19,64 @@ namespace paddle { namespace inference { namespace tensorrt { -class ReluOpConverter : public OpConverter { +class ActivationOpConverter : public OpConverter { public: - ReluOpConverter() {} + ActivationOpConverter() {} void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
framework::OpDesc op_desc(op, nullptr); - LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose " - "type is Relu"; + LOG(INFO) + << "convert a fluid Activation op to tensorrt activation layer whose " + "type is " + << op_type_; const nvinfer1::ITensor* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]); + + auto op_pair = ops.find(op_type_); + if (op_pair == ops.end()) { + PADDLE_THROW("Wrong activation op type!"); + } + nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), - nvinfer1::ActivationType::kRELU); + op_pair->second); auto output_name = op_desc.Output("Out")[0]; + layer->setName((op_type_ + " (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. engine_->DeclareOutput(output_name); } } + + protected: + std::string op_type_; + static const std::unordered_map ops; +}; + +const std::unordered_map + ActivationOpConverter::ops = { + {"relu", nvinfer1::ActivationType::kRELU}, + {"sigmoid", nvinfer1::ActivationType::kSIGMOID}, + {"tanh", nvinfer1::ActivationType::kTANH}, +}; + +class ReluOpConverter : public ActivationOpConverter { + public: + ReluOpConverter() { op_type_ = "relu"; } +}; + +class SigmoidOpConverter : public ActivationOpConverter { + public: + SigmoidOpConverter() { op_type_ = "sigmoid"; } +}; + +class TanhOpConverter : public ActivationOpConverter { + public: + TanhOpConverter() { op_type_ = "tanh"; } }; } // namespace tensorrt @@ -48,3 +84,5 @@ class ReluOpConverter : public OpConverter { } // namespace paddle REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter); +REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter); +REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc 
b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 94f8b0ae5606d39a722ffe28501645c9b6fc5d2e..3330af2da6c97ad153dcecd86be4b441eac62b5e 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter { scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); + layer->setName(("batch_norm (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Bias").front()] = std::move(combile_bias_tensor); engine_->weight_map[op_desc.Input("Scale").front()] = diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index bb9627bf957b63993b2c8d23e7ec8122eb004eaf..a11dfa1e8f2dacfad067d025678911200db500fb 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter { axis = axis - 1; // Remove batch dim layer->setAxis(axis); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("concat (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. 
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 841a95db38ce7cf0cb5961ff04cb569ee2633e6f..0a37d3968c39d2c244bbd82161afddf6330e421d 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter { layer->setNbGroups(groups); auto output_name = op_desc.Output("Output").front(); + layer->setName(("conv2d (Output: " + output_name + ")").c_str()); engine_->weight_map[op_desc.Input("Filter").front()] = std::move(weight_tensor); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9533ecbcfda4e2500fd201d8efc64fc5bd97169a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * DropoutOp. This Layer doesn't has weights. 
+ */ +class DropoutOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a fluid dropout op to tensorrt dropout layer"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + float dropout_prob = boost::get(op_desc.GetAttr("dropout_prob")); + + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(framework::make_ddim({1})); + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + weight_data[0] = 1 - dropout_prob; + + TensorRTEngine::Weight scale_weights{ + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + weight_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *const_cast(input1), + nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(), + power_weights.get()); + + engine_->weight_map[op_desc.Output("Out").front() + "_dropout"] = + std::move(weight_tensor); + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("dropout (Output: " + output_name + ")").c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(dropout); +REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 60a72b4eb5c75b5cd12305f13763a9a1a567213f..0a6ce568f194f03c7259e1ebf28dd6ce4df2d594 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter { shift_weights.get(), scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise_add (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the @@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter { *const_cast(Y), op_pair->second); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. 
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index ad98d85aae9cf594922aca00c43718ccfbce2278..7c21ecd95da07b498eed2ab1bbdcc0e8cd184787 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter { n_output, tmp_weight.get(), bias.get()); auto output_name = op_desc.Output("Out").front(); + layer->setName(("fc (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); if (test_mode) { diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 73f1b28ddf73403862e55d102a259d7b6cf67b1f..f9bb66a6e9f81a10368db7710108c319860e940a 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter { layer->setPadding(nv_paddings); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool2d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index e82762ea03ecd00bce7cfb83b130a3436ccbfed3..dd3dfb0bc7b609e28462954835a0d40e0a63b6cd 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -20,18 +20,18 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(ReluOpConverter, main) { +void test_activation(std::string act_type) { framework::Scope scope; 
std::unordered_set parameters; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); - validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); + validator.DeclInputVar("act-X", nvinfer1::Dims2(10, 6)); + validator.DeclOutputVar("act-Out", nvinfer1::Dims2(10, 6)); // Prepare Op description framework::OpDesc desc; - desc.SetType("relu"); - desc.SetInput("X", {"relu-X"}); - desc.SetOutput("Out", {"relu-Out"}); + desc.SetType(act_type); + desc.SetInput("X", {"act-X"}); + desc.SetOutput("Out", {"act-Out"}); LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); @@ -40,8 +40,16 @@ TEST(ReluOpConverter, main) { validator.Execute(5); } +TEST(ReluOpConverter, main) { test_activation("relu"); } + +TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); } + +TEST(TanhOpConverter, main) { test_activation("tanh"); } + } // namespace tensorrt } // namespace inference } // namespace paddle USE_OP(relu); +USE_OP(sigmoid); +USE_OP(tanh); diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b8e621b702d977f5868766a6eafb98c8522c3cd --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(DropoutOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(8, parameters, scope, 1000); + + std::vector tensor_shape{8, 10}; + validator.DeclInputVar("dropout-X", tensor_shape, + nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclOutputVar("dropout-Out", nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclOutputVar("mask-Out", nvinfer1::DimsCHW(10, 1, 1)); + + // Prepare Op description + framework::OpDesc desc; + int is_test = 1; + float dropout_prob = 0.4; + + desc.SetType("dropout"); + desc.SetInput("X", {"dropout-X"}); + desc.SetOutput("Mask", {"mask-Out"}); + desc.SetOutput("Out", {"dropout-Out"}); + desc.SetAttr("is_test", is_test); + desc.SetAttr("dropout_prob", dropout_prob); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + std::unordered_set neglected_output = {"mask-Out"}; + + validator.Execute(8, neglected_output); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(dropout); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index d44a2cfa7f2d2f7dde5001006e05cdff1612435b..508ef1ce40aa0882a0f39a85f97511fd9ea2a8a5 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,57 +1,87 @@ -function (inference_download_and_uncompress install_dir url) - get_filename_component(filename ${url} NAME) - message(STATUS "Download inference test stuff ${filename} from ${url}") +set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com") +set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING + "A path setting inference demo download directories.") +set(INFERENCE_EXTRA_DEPS paddle_inference_api 
paddle_fluid_api ir_pass_manager analysis_predictor) +function (inference_download install_dir url filename) + message(STATUS "Download inference test stuff from ${url}/${filename}") execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") message(STATUS "finish downloading ${filename}") -endfunction(inference_download_and_uncompress) +endfunction() + +function (inference_download_and_uncompress install_dir url filename) + inference_download(${install_dir} ${url} ${filename}) + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") +endfunction() -function(download_model_and_data install_dir model_url data_url) - if (NOT EXISTS ${install_dir} AND WITH_INFERENCE) - inference_download_and_uncompress(${install_dir} ${model_url}) - inference_download_and_uncompress(${install_dir} ${data_url}) +function(download_model_and_data install_dir model_name data_name) + if (NOT EXISTS ${install_dir}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_name}) endif() endfunction() +function(inference_analysis_api_test target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) +endfunction() + # RNN1 -set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") -set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") -set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1") -download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL}) -inference_analysis_test(test_analyzer_rnn1 SRCS 
analyzer_rnn1_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --infer_model=${RNN1_INSTALL_DIR}/model - --infer_data=${RNN1_INSTALL_DIR}/data.txt) +if(NOT APPLE) + set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") + download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") + inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) +else() + # TODO: fix this test on MACOS, the reason is that + # fusion_seqexpand_concat_fc_op is not supported on MACOS + message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1") +endif() + +# RNN2 +set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") +download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) # chinese_ner -set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") -set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") -set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner") -download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL}) -inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model - --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) +set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") +download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc) # lac -set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") -set(LAC_DATA_URL 
"http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") -set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac") -download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL}) -inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --infer_model=${LAC_INSTALL_DIR}/model - --infer_data=${LAC_INSTALL_DIR}/data.txt) +set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") +download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) # text_classification -set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") -set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") -set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification") -download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL}) -inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta - --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt - --topn=1 # Just run top 1 batch. 
- ) +set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) + +# ocr +set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") +if (NOT EXISTS ${OCR_INSTALL_DIR}) + inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") +endif() +inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) + +# anakin +if (WITH_ANAKIN AND WITH_MKL) # only needed in CI + # anakin rnn1 + set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin") + set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1") + inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin") + inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt") + cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc + ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin + --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt + DEPS inference_anakin_api_shared SERIAL) + # anakin mobilenet + if(WITH_GPU) + set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet") + inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin") + cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc + ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin + DEPS inference_anakin_api_shared dynload_cuda SERIAL) + endif() +endif() diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc similarity index 100% rename from paddle/fluid/inference/api/api_anakin_engine_tester.cc 
rename to paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc diff --git a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc similarity index 100% rename from paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc rename to paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 7e00cb20ad0ce052a84d5491b0cdf167f0768081..bf893e32569f4b50a583ab6f43cb214ec3620e09 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -12,21 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_string(infer_model, "", "model path for LAC"); -DEFINE_string(infer_data, "", "data file for LAC"); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(burning, 0, "Burning before repeat."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -124,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, void TestLACPrediction(const std::string &model_path, const std::string &data_file, const int batch_size, - const int repeat, bool test_all_data, - bool use_analysis = false) { - NativeConfig config; - config.model_dir = 
model_path; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; + const int repeat, bool use_analysis = false) { + AnalysisConfig cfg; + cfg.model_dir = model_path; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + std::vector input_slots, outputs_slots; DataRecord data(data_file, batch_size); GetOneBatch(&input_slots, &data, batch_size); std::unique_ptr predictor; if (use_analysis) { - AnalysisConfig cfg; - cfg.model_dir = model_path; - cfg.use_gpu = false; - cfg.device = 0; - cfg.specify_input_name = true; - cfg.enable_ir_optim = true; predictor = CreatePaddlePredictor(cfg); } else { predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); } for (int i = 0; i < FLAGS_burning; i++) { predictor->Run(input_slots, &outputs_slots); } Timer timer; - if (test_all_data) { - double sum = 0; - LOG(INFO) << "Total number of samples: " << data.datasets.size(); - for (int i = 0; i < repeat; i++) { - for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { - GetOneBatch(&input_slots, &data, batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs_slots); - sum += timer.toc(); - } + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { + GetOneBatch(&input_slots, &data, batch_size); + input_slots_all.emplace_back(input_slots); } - PrintTime(batch_size, repeat, 1, 0, sum / repeat); - LOG(INFO) << "Average latency of each sample: " - << sum / repeat / data.datasets.size() << " ms"; + LOG(INFO) << "total number of samples: " << data.datasets.size(); + TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads); return; } timer.tic(); @@ -190,19 +166,10 @@ void TestLACPrediction(const std::string &model_path, if (use_analysis) { // run once for comparion as reference auto ref_predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); 
std::vector ref_outputs_slots; ref_predictor->Run(input_slots, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); - auto &ref_out = ref_outputs_slots[0]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - int64_t *pdata_ref = static_cast(ref_out.data.data()); - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(pdata_ref[i], pdata[i]); - } + CompareResult(ref_outputs_slots, outputs_slots); AnalysisPredictor *analysis_predictor = dynamic_cast(predictor.get()); @@ -231,13 +198,13 @@ void TestLACPrediction(const std::string &model_path, TEST(Analyzer_LAC, native) { LOG(INFO) << "LAC with native"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, - FLAGS_repeat, FLAGS_test_all_data); + FLAGS_repeat); } TEST(Analyzer_LAC, analysis) { LOG(INFO) << "LAC with analysis"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, - FLAGS_repeat, FLAGS_test_all_data, true); + FLAGS_repeat, true); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 6e8e43add7d3383fa79efea91c23750be9c8956f..f8c651e32f7e2ce1d8ced0e6774ffd555d351167 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -12,20 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -113,50 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; void TestChineseNERPrediction(bool use_analysis) { - NativeConfig config; - config.prog_file = FLAGS_infer_model + "/__model__"; - config.param_file = FLAGS_infer_model + "/param"; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; + AnalysisConfig cfg; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.param_file = FLAGS_infer_model + "/param"; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; std::vector input_slots, outputs; std::unique_ptr predictor; Timer timer; if (use_analysis) { - AnalysisConfig cfg; - cfg.prog_file = FLAGS_infer_model + "/__model__"; - cfg.param_file = FLAGS_infer_model + "/param"; - cfg.use_gpu = false; - cfg.device = 0; - cfg.specify_input_name = true; - cfg.enable_ir_optim = true; predictor = CreatePaddlePredictor(cfg); } else { predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); } if (FLAGS_test_all_data) { LOG(INFO) << "test all data"; - double sum = 0; - size_t num_samples; - for (int i = 0; i < FLAGS_repeat; i++) { - DataRecord 
data(FLAGS_infer_data, FLAGS_batch_size); - // Just one batch, the num_samples remains the same. - num_samples = data.num_samples; - for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) { - PrepareInputs(&input_slots, &data, FLAGS_batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs); - sum += timer.toc(); - } + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); } - LOG(INFO) << "total number of samples: " << num_samples; - PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); - LOG(INFO) << "average latency of each sample: " - << sum / FLAGS_repeat / num_samples; + LOG(INFO) << "total number of samples: " << data.num_samples; + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); return; } // Prepare inputs. @@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) { if (use_analysis) { // run once for comparion as reference auto ref_predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); std::vector ref_outputs_slots; ref_predictor->Run(input_slots, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs.size()); - auto &ref_out = ref_outputs_slots[0]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - int64_t *pdata_ref = static_cast(ref_out.data.data()); - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(pdata_ref[i], result[i]); - } + CompareResult(ref_outputs_slots, outputs); AnalysisPredictor *analysis_predictor = dynamic_cast(predictor.get()); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 
b8ac468b4e98bcef81cdbbf66e3f1640c03a7ab8..df96be544eaf51c52aa5592966f499fad91aab82 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -12,24 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" - -#include -#include -#include // NOLINT -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -164,26 +147,6 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void CompareResult(const std::vector &outputs, - const std::vector &base_outputs) { - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - auto &out = outputs[i]; - auto &base_out = base_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = 
static_cast(base_out.data.data()); - for (size_t i = 0; i < size; i++) { - EXPECT_NEAR(data[i], base_data[i], 1e-3); - } - } -} // Test with a really complicate model. void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { AnalysisConfig config; @@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { config.ir_passes.clear(); // Do not exclude any pass. int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; auto base_predictor = CreatePaddlePredictor(config); @@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { base_predictor->Run(input_slots, &base_outputs); + std::vector> input_slots_all; + input_slots_all.emplace_back(input_slots); if (num_threads == 1) { - // Prepare inputs. - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictor->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); + TestOneThreadPrediction(config, input_slots_all, &outputs); CompareResult(outputs, base_outputs); } else { - std::vector threads; - std::vector> predictors; - // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled - // because AttentionLSTM's hard code nodeid will be damanged. - for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back( - CreatePaddlePredictor( - config)); - } - for (int tid = 0; tid < num_threads; ++tid) { - threads.emplace_back([&, tid]() { - // Each thread should have local input_slots and outputs. 
- std::vector input_slots; - DataRecord data(FLAGS_infer_data, batch_size); - PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs; - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictors[tid]->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, num_threads, tid, - timer.toc() / num_times); - CompareResult(outputs, base_outputs); - }); - } - for (int i = 0; i < num_threads; ++i) { - threads[i].join(); - } + // only return the output of first thread + TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads); } if (use_analysis && activate_ir) { @@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) { // Directly infer with the original model. TestRNN1Prediction(false, false, i); // Inference with the original model with the analysis turned on, the - // analysis - // module will transform the program to a data flow graph. + // analysis module will transform the program to a data flow graph. TestRNN1Prediction(true, false, i); // Inference with analysis and IR. The IR module will fuse some large // kernels. diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..c40ea58eea9c10a85acf84108f1d081a779f526d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" + +#include +#include +#include // NOLINT +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data path"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector lod; + std::vector> rnn_link_data; + std::vector result_data; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. 
+ if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + // Prepare LoDs + data.lod.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + // calculate lod + data.lod.push_back(data.lod.back() + 11); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + if (num_lines % 2) { // feature + std::vector feature_data; + split(data[1], ' ', &feature_data); + std::vector> link_step_data; + int feature_count = 1; + std::vector feature; + for (auto &step_data : feature_data) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + feature.insert(feature.end(), tmp.begin(), tmp.end()); + if (feature_count % 11 == 0) { // each sample has 11 features + link_step_data.push_back(feature); + feature.clear(); + } + feature_count++; + } + link_step_data_all.push_back(std::move(link_step_data)); + } else { // result + std::vector tmp; + split_to_float(data[1], ',', &tmp); + result_data.insert(result_data.end(), tmp.begin(), tmp.end()); + } + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor feed_tensor; + feed_tensor.name = "feed"; + auto one_batch = data->NextBatch(); + int token_size = one_batch.rnn_link_data.size(); + // each token has 11 features, each feature's dim is 54. + std::vector rnn_link_data_shape({token_size * 11, 54}); + feed_tensor.shape = rnn_link_data_shape; + feed_tensor.lod.assign({one_batch.lod}); + feed_tensor.dtype = PaddleDType::FLOAT32; + TensorAssignData(&feed_tensor, one_batch.rnn_link_data); + // Set inputs. 
+ input_slots->assign({feed_tensor}); +} + +void CompareResult(const std::vector &outputs, + const std::vector &base_result) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_result[i], 1e-3); + } + } +} +// Test with a really complicate model. +void TestRNN2Prediction() { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = true; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + + auto base_predictor = + CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor( + config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs, base_outputs; + + Timer timer1; + timer1.tic(); + for (int i = 0; i < num_times; i++) { + base_predictor->Run(input_slots, &base_outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times); + + Timer timer2; + timer2.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times); + + CompareResult(base_outputs, data.result_data); + CompareResult(outputs, data.result_data); +} + +TEST(Analyzer, rnn2) { TestRNN2Prediction(); } + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc 
b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 65169f8cfcc5bf1e989609666f6e0ba03e42e5ba..1472c475e4a3061ffcad96925ea215a41a7e63eb 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -12,23 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. -#include -#include -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/inference/api/timer.h" - -DEFINE_string(infer_model, "", "Directory of the inference model."); -DEFINE_string(infer_data, "", "Path of the dataset."); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(repeat, 1, "How many times to repeat run."); -DEFINE_int32(topn, -1, "Run top n batches of data to save time"); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -37,24 +21,25 @@ struct DataReader { explicit DataReader(const std::string &path) : file(new std::ifstream(path)) {} - bool NextBatch(PaddleTensor *tensor, int batch_size) { + bool NextBatch(std::vector *input, int batch_size) { PADDLE_ENFORCE_EQ(batch_size, 1); std::string line; - tensor->lod.clear(); - tensor->lod.emplace_back(std::vector({0})); + PaddleTensor tensor; + tensor.dtype = PaddleDType::INT64; + tensor.lod.emplace_back(std::vector({0})); std::vector data; for (int i = 0; i < batch_size; i++) { if (!std::getline(*file, line)) return false; inference::split_to_int64(line, ' ', &data); } - 
tensor->lod.front().push_back(data.size()); + tensor.lod.front().push_back(data.size()); - tensor->data.Resize(data.size() * sizeof(int64_t)); - memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t)); - tensor->shape.clear(); - tensor->shape.push_back(data.size()); - tensor->shape.push_back(1); + tensor.data.Resize(data.size() * sizeof(int64_t)); + memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t)); + tensor.shape.push_back(data.size()); + tensor.shape.push_back(1); + input->assign({tensor}); return true; } @@ -68,32 +53,28 @@ void Main(int batch_size) { config.model_dir = FLAGS_infer_model; config.use_gpu = false; config.enable_ir_optim = true; - auto predictor = - CreatePaddlePredictor( - config); - - std::vector input_slots(1); - // one batch starts - // data -- - auto &input = input_slots[0]; - input.dtype = PaddleDType::INT64; - inference::Timer timer; - double sum = 0; - std::vector output_slots; + std::vector input_slots, output_slots; + DataReader reader(FLAGS_infer_data); + std::vector> input_slots_all; - int num_batches = 0; - for (int t = 0; t < FLAGS_repeat; t++) { - DataReader reader(FLAGS_infer_data); - while (reader.NextBatch(&input, FLAGS_batch_size)) { - if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break; - timer.tic(); - CHECK(predictor->Run(input_slots, &output_slots)); - sum += timer.toc(); + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + int num_batches = 0; + while (reader.NextBatch(&input_slots, FLAGS_batch_size)) { + input_slots_all.emplace_back(input_slots); ++num_batches; } + LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size; + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); + return; } - PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); + + // one batch starts + // data -- + reader.NextBatch(&input_slots, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); + TestPrediction(config, input_slots_all, 
&output_slots, FLAGS_num_threads); // Get output LOG(INFO) << "get outputs " << output_slots.size(); diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..a207c41b7140c806b4c1fdc7f24a317b165c9aef --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +/* + * Use the native and analysis fluid engine to inference the demo. 
+ * ocr, mobilenet and se_resnext50 + */ +void TestVisualPrediction(bool use_mkldnn) { + std::unique_ptr predictor; + AnalysisConfig cfg; + cfg.param_file = FLAGS_infer_model + "/__params__"; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.use_gpu = false; + cfg._use_mkldnn = use_mkldnn; + cfg.device = 0; + cfg.enable_ir_optim = true; + // TODO(TJ): fix fusion gru + cfg.ir_passes.push_back("fc_gru_fuse_pass"); +#ifdef PADDLE_WITH_MKLDNN + // disable mkldnn fuse since it should have some bugs + cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass"); +#endif + predictor = + CreatePaddlePredictor(cfg); + + // Only have single batch of data. + std::string line; + std::ifstream file(FLAGS_infer_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + std::vector outputs_slots; + Timer timer; + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) { + predictor->Run({input}, &outputs_slots); + } + PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0, + timer.toc() / FLAGS_repeat); + + VLOG(3) << "output.size " << outputs_slots.size(); + + // run native as reference + auto ref_predictor = + CreatePaddlePredictor(cfg); + std::vector ref_outputs_slots; + ref_predictor->Run({input}, &ref_outputs_slots); + CompareResult(outputs_slots, ref_outputs_slots); + // print what are fused + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) 
<< "has num ops: " << num_ops; +} + +TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, analysis_mkldnn) { + TestVisualPrediction(/*use_mkldnn*/ true); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..43e97614e3ad9c14c8deee9f340757f373eb593e --- /dev/null +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -0,0 +1,141 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data file"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(burning, 0, "Burning before repeat."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +void CompareResult(const std::vector &outputs, + const std::vector &ref_outputs) { + EXPECT_GT(outputs.size(), 0); + EXPECT_EQ(outputs.size(), ref_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &ref_out = ref_outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t ref_size = + std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, + [](int a, int b) { return a * b; }); + EXPECT_GT(size, 0); + EXPECT_EQ(size, ref_size); + EXPECT_EQ(out.dtype, ref_out.dtype); + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast(out.data.data()); + int64_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3); + } + break; + } + } + } +} 
+ +void TestOneThreadPrediction( + AnalysisConfig config, const std::vector> inputs, + std::vector *outputs) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + auto predictor = + CreatePaddlePredictor( + config); + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs.size(); j++) { + predictor->Run(inputs[j], outputs); + } + } + PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times, + inputs.size()); +} + +void TestMultiThreadPrediction( + AnalysisConfig config, const std::vector> inputs, + std::vector *outputs, int num_threads) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + std::vector threads; + std::vector> predictors; + // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled + // because AttentionLSTM's hard code nodeid will be damanged. + for (int tid = 0; tid < num_threads; ++tid) { + predictors.emplace_back( + CreatePaddlePredictor( + config)); + } + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local inputs and outputs. + // The inputs of each thread are all the same. 
+ std::vector> inputs_tid = inputs; + std::vector outputs_tid; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs_tid.size(); j++) { + predictors[tid]->Run(inputs_tid[j], &outputs_tid); + } + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times, inputs_tid.size()); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + +void TestPrediction(AnalysisConfig config, + const std::vector> inputs, + std::vector *outputs, int num_threads) { + if (num_threads == 1) { + TestOneThreadPrediction(config, inputs, outputs); + } else { + TestMultiThreadPrediction(config, inputs, outputs, num_threads); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index c2f45fdc99b87bc12c2aadf1985de6e98a24fce7..26ef27c3caafadb4801b0ae52133f6175655ce0a 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -167,6 +167,8 @@ void BuddyAllocator::Free(void* p) { } size_t BuddyAllocator::Used() { return total_used_; } +size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; } +size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; } void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index f0c83efc23ce39c4fc89296d672e1e55751851bf..3f86a51f0d0b8504bbc4b0477f123093b343e9cf 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -42,6 +42,8 @@ class BuddyAllocator { void* Alloc(size_t unaligned_size); void Free(void* ptr); size_t Used(); + size_t GetMinChunkSize(); + size_t GetMaxChunkSize(); public: // Disable copy and assignment diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 
7c800b3c164049244770ceb2070b177d8307e85e..283745e977533358ef52521b36e67f0ada950e61 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -119,8 +119,8 @@ void* Alloc(platform::CUDAPlace place, size_t size) { LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " << place.device << ", available " << avail << " bytes"; LOG(WARNING) << "total " << total; - LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); - LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); + LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize(); + LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize(); LOG(WARNING) << "GPU memory used: " << Used(place); platform::SetDeviceId(cur_dev); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 7ec1e78da4ec642cb1e6248edfbcfed748fa11b8..ccb7fa1f8cce8cc757038904bce762af3b5ff30b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -296,6 +296,7 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) +op_library(fusion_lstm_op DEPS cpu_lstm_compute) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 3eb02c6b61ce61140bd777647a12477dd9c3c803..eae65968285703f5882d910e29bc5d8e1511cba6 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); bool fuse_relu = ctx.Attr("fuse_relu"); + bool fuse_eltwise = ctx.Attr("fuse_eltwise"); int groups = ctx.Attr("groups"); - // TODO(pzelazko-intel) add support for group convolution and 
dilation - PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet"); + // TODO: add support for dilation PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); @@ -314,6 +314,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // Get unique name for storing MKLDNN primitives @@ -327,7 +340,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto user_src_md = platform::MKLDNNMemDesc( {src_tz}, platform::MKLDNNGetDataType(), input->format()); auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? filter->format() : mkldnn::memory::format::goihw); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -340,7 +354,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw); std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. // Currently used whenever bias is != nullptr. 
auto dst_md = platform::MKLDNNMemDesc( @@ -352,12 +367,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz = paddle::framework::vectorize2int(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), memory::format::x); - conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, - paddings, mkldnn_engine, fuse_relu); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, fuse_eltwise); } else { - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, - paddings, mkldnn_engine, fuse_relu); + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_eltwise); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -407,16 +423,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } private: - mkldnn::primitive_attr AddRelu() const { - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. + mkldnn::primitive_attr CreatePostOps(bool fuse_relu, + bool fuse_eltwise) const { mkldnn::primitive_attr conv_attr; - constexpr float scale = 1.0f; - constexpr float negative_slope = 0.0f; - constexpr float placeholder = 0.0f; mkldnn::post_ops post_operations; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - negative_slope, placeholder); + // Fusion with Elementwise layer relies on adding a sum post-operation with + // the scale parameter. It is assumed that when fuse_eltwise is true, the + // Output tensor contains the data coming from residual connection. The + // result of this post_op is: Output = scale * Output + Conv_Out. + if (fuse_eltwise) { + post_operations.append_sum(1.0f); + } + // Fusion with ReLU layer is executed through the PostOps feature. 
Create a + // PostOps object and configure it to execute an eltwise relu operation. + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } conv_attr.set_post_ops(post_operations); return conv_attr; } @@ -425,8 +451,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine, - const bool fuse_relu) const { + const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_eltwise) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -435,10 +461,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr; - if (fuse_relu) { - conv_attr = AddRelu(); - } + mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); @@ -452,8 +475,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& bias, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine, - const bool fuse_relu) const { + const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_eltwise) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -462,10 +485,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr; - if (fuse_relu) { - conv_attr = 
AddRelu(); - } + mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 41d4fcf6de7c8fcb3cfbb2063b0a2ac1a2356168..8f84bf71a7f77606bed6672f0830e3fc80165a42 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -164,6 +164,11 @@ void Conv2DOpMaker::Make() { .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_eltwise", + "(bool, default false) Only used in mkldnn kernel. Used " + "whenever convolution output is connected via skip connection " + "to a previous layer.") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 0dee1781623d5a62830545c0952e5aadbe37accb..6abeca1da443248d6ad3c1bcc64dd775d77f4ed8 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" @@ -21,7 +22,7 @@ namespace operators { */ template inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, - const framework::Tensor& gt_boxes, const T* weights, + const framework::Tensor& gt_boxes, const float* weights, const bool normalized, framework::Tensor* box_delta) { auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); @@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num, } } +template +void BboxOverlaps(const framework::Tensor& r_boxes, + const framework::Tensor& c_boxes, + framework::Tensor* overlaps) { + auto r_boxes_et = framework::EigenTensor::From(r_boxes); + auto c_boxes_et = framework::EigenTensor::From(c_boxes); + auto overlaps_et = framework::EigenTensor::From(*overlaps); + int r_num = r_boxes.dims()[0]; + int c_num = c_boxes.dims()[0]; + auto zero = static_cast(0.0); + T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, + inter_area; + for (int i = 0; i < r_num; ++i) { + r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * + (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); + for (int j = 0; j < c_num; ++j) { + c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * + (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); + x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); + y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); + x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); + y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); + inter_w = std::max(x_max - x_min + 1, zero); + inter_h = std::max(y_max - y_min + 1, zero); + inter_area = inter_w * inter_h; + overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc 
b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index be06dc19743cfa6f093bcb3f4e9f91af315d4211..d7a53f1bef98ecda3ba7b36323678a11a632a15c 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Input(RpnRois) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("GtClasses"), "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), "Input(GtBoxes) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("ImScales"), - "Input(ImScales) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); PADDLE_ENFORCE(ctx->HasOutput("Rois"), "Output(Rois) of RpnTargetAssignOp should not be null"); @@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto gt_classes_dims = ctx->GetInputDim("GtClasses"); + auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto im_scales_dims = ctx->GetInputDim("ImScales"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, "The rank of Input(RpnRois) must be 2."); - PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1, - "The rank of Input(GtClasses) must be 1."); PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, "The rank of Input(GtBoxes) must be 2."); - PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1, - "The rank of Input(ImScales) must be 1."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); int class_nums = ctx->Attrs().Get("class_nums"); ctx->SetOutputDim("Rois", {-1, 4}); - ctx->SetOutputDim("LabelsInt32", {-1}); + ctx->SetOutputDim("LabelsInt32", {-1, 1}); ctx->SetOutputDim("BboxTargets", {-1, 4 * 
class_nums}); ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); @@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context, concat_functor(context, inputs, axis, out_tensor); } -template -void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, - Tensor* overlaps) { - auto r_boxes_et = framework::EigenTensor::From(r_boxes); - auto c_boxes_et = framework::EigenTensor::From(c_boxes); - auto overlaps_et = framework::EigenTensor::From(*overlaps); - int r_num = r_boxes.dims()[0]; - int c_num = c_boxes.dims()[0]; - auto zero = static_cast(0.0); - T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, - inter_area; - for (int i = 0; i < r_num; ++i) { - r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * - (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); - for (int j = 0; j < c_num; ++j) { - c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * - (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); - x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); - y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); - x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); - y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); - inter_w = std::max(x_max - x_min + 1, zero); - inter_h = std::max(y_max - y_min + 1, zero); - inter_area = inter_w * inter_h; - overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); - } - } -} - template std::vector> SampleFgBgGt( const platform::CPUDeviceContext& context, Tensor* iou, - const int batch_size_per_im, const float fg_fraction, const float fg_thresh, - const float bg_thresh_hi, const float bg_thresh_lo, - std::minstd_rand engine) { + const Tensor& is_crowd, const int batch_size_per_im, + const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, + const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) { std::vector fg_inds; std::vector bg_inds; std::vector gt_inds; - T* 
proposal_to_gt_overlaps = iou->mutable_data(context.GetPlace()); + int64_t gt_num = is_crowd.numel(); + const int* crowd_data = is_crowd.data(); + T* proposal_to_gt_overlaps = iou->data(); int64_t row = iou->dims()[0]; int64_t col = iou->dims()[1]; float epsilon = 0.00001; @@ -152,6 +125,9 @@ std::vector> SampleFgBgGt( for (int64_t i = 0; i < row; ++i) { const T* v = proposal_to_gt_overlaps + i * col; T max_overlap = *std::max_element(v, v + col); + if ((i < gt_num) && (crowd_data[i])) { + max_overlap = -1.0; + } if (max_overlap > fg_thresh) { for (int64_t j = 0; j < col; ++j) { T val = proposal_to_gt_overlaps[i * col + j]; @@ -170,17 +146,19 @@ std::vector> SampleFgBgGt( } // Reservoir Sampling + std::uniform_real_distribution uniform(0, 1); int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); int fg_rois_this_image = fg_inds.size(); int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); - std::uniform_real_distribution uniform(0, 1); - const int64_t fg_size = static_cast(fg_inds.size()); - if (fg_size > fg_rois_per_this_image) { - for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) { - std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); - std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + if (use_random) { + const int64_t fg_size = static_cast(fg_inds.size()); + if (fg_size > fg_rois_per_this_image) { + for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) { + std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); + std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + } } } } @@ -192,12 +170,14 @@ std::vector> SampleFgBgGt( int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; int bg_rois_this_image = bg_inds.size(); int bg_rois_per_this_image = std::min(bg_rois_per_image, 
bg_rois_this_image); - const int64_t bg_size = static_cast(bg_inds.size()); - if (bg_size > bg_rois_per_this_image) { - for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) - std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + if (use_random) { + const int64_t bg_size = static_cast(bg_inds.size()); + if (bg_size > bg_rois_per_this_image) { + for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) + std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + } } } std::vector new_bg_inds(bg_inds.begin(), @@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( const platform::CPUDeviceContext& context, Tensor* rpn_rois, - Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale, + Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info, const int batch_size_per_im, const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, const std::vector& bbox_reg_weights, const int class_nums, - std::minstd_rand engine) { + std::minstd_rand engine, bool use_random) { auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); - auto im_scale_data = im_scale->data()[0]; - rpn_rois_et = rpn_rois_et / im_scale_data; + auto im_scale = im_info->data()[2]; + rpn_rois_et = rpn_rois_et / im_scale; Tensor boxes; int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; @@ -270,8 +250,8 @@ std::vector SampleRoisForOneImage( // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt( - context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction, - fg_thresh, bg_thresh_hi, bg_thresh_lo, engine); + context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); 
std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; std::vector gt_inds = fg_bg_gt[2]; @@ -291,15 +271,15 @@ std::vector SampleRoisForOneImage( // Compute targets Tensor bbox_targets_single; bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); - BoxToDelta(fg_num, sampled_boxes, sampled_gts, nullptr, false, - &bbox_targets_single); + BoxToDelta(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(), + false, &bbox_targets_single); // Scale rois Tensor sampled_rois; sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); - sampled_rois_et = sampled_boxes_et * im_scale_data; + sampled_rois_et = sampled_boxes_et * im_scale; // Expand box targets Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; @@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* rpn_rois = context.Input("RpnRois"); auto* gt_classes = context.Input("GtClasses"); + auto* is_crowd = context.Input("IsCrowd"); auto* gt_boxes = context.Input("GtBoxes"); - auto* im_scales = context.Input("ImScales"); + auto* im_info = context.Input("ImInfo"); auto* rois = context.Output("Rois"); auto* labels_int32 = context.Output("LabelsInt32"); @@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::vector bbox_reg_weights = context.Attr>("bbox_reg_weights"); int class_nums = context.Attr("class_nums"); + bool use_random = context.Attr("use_random"); PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); PADDLE_ENFORCE_EQ( gt_classes->lod().size(), 1UL, "GenerateProposalLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateProposalLabelsOp is_crowd needs 1 level of LoD"); 
PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD"); int64_t n = static_cast(rpn_rois->lod().back().size() - 1); rois->mutable_data({n * batch_size_per_im, kBoxDim}, context.GetPlace()); - labels_int32->mutable_data({n * batch_size_per_im}, + labels_int32->mutable_data({n * batch_size_per_im, 1}, context.GetPlace()); bbox_targets->mutable_data({n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); @@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::random_device rnd; std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + int seed = rnd(); engine.seed(seed); framework::LoD lod; @@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { auto rpn_rois_lod = rpn_rois->lod().back(); auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back(); for (int i = 0; i < n; ++i) { Tensor rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); Tensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); Tensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor im_scales_slice = im_scales->Slice(i, i + 1); + Tensor im_info_slice = im_info->Slice(i, i + 1); std::vector tensor_output = SampleRoisForOneImage( - dev_ctx, &rpn_rois_slice, >_classes_slice, >_boxes_slice, - &im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh, - bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine); + dev_ctx, &rpn_rois_slice, >_classes_slice, &is_crowd_slice, + >_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, + engine, use_random); Tensor sampled_rois = tensor_output[0]; Tensor sampled_labels_int32 
= tensor_output[1]; Tensor sampled_bbox_targets = tensor_output[2]; @@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { bbox_inside_weights->set_lod(lod); bbox_outside_weights->set_lod(lod); rois->Resize({num_rois, kBoxDim}); - labels_int32->Resize({num_rois}); + labels_int32->Resize({num_rois, 1}); bbox_targets->Resize({num_rois, kBoxDim * class_nums}); bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); @@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { // TODO(buxingyuan): Add Document AddInput("RpnRois", "RpnRois."); AddInput("GtClasses", "GtClasses."); + AddInput("IsCrowd", "IsCrowd."); AddInput("GtBoxes", "GtBoxes."); - AddInput("ImScales", "ImScales."); + AddInput("ImInfo", "ImInfo."); AddOutput("Rois", "Rois."); AddOutput("LabelsInt32", "LabelsInt32."); @@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("bg_thresh_lo", "bg_thresh_lo"); AddAttr>("bbox_reg_weights", "bbox_reg_weights"); AddAttr("class_nums", "class_nums"); - AddAttr("fix_seed", "fix_seed").SetDefault(false); - AddAttr("seed", "seed").SetDefault(0); + AddAttr("use_random", "use_random").SetDefault(true); AddComment(R"DOC( Generate Proposals Labels Operator. 
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index ebe6830eccd87a156768eb0d4b96220bcc9f4edc..c33aa255362bc5234f2813fb93e70c943b03c33f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, } for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len]; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1]; + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; - T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2; - T anchor_center_y = - (anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2; + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; T bbox_center_x = 0, bbox_center_y = 0; T bbox_width = 0, bbox_height = 0; @@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, bbox_center_y = variances_data[i * len + 1] * bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2]) * + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * anchor_width; - bbox_height = std::exp(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3]) * + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * anchor_height; } else { bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; bbox_center_y = bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = 
std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; - bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * + anchor_height; } proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; } // return proposals; } @@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, float min_size, const Tensor &im_info, Tensor *keep) { const T *im_info_data = im_info.data(); T *boxes_data = boxes->mutable_data(ctx.GetPlace()); - min_size *= im_info_data[2]; + T im_scale = im_info_data[2]; keep->Resize({boxes->dims()[0], 1}); + min_size = std::max(min_size, 1.0f); int *keep_data = keep->mutable_data(ctx.GetPlace()); int keep_len = 0; for (int i = 0; i < boxes->dims()[0]; ++i) { T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; T x_ctr = boxes_data[4 * i] + ws / 2; T y_ctr = boxes_data[4 * i + 1] + hs / 2; - if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && - y_ctr <= im_info_data[0]) { + if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { keep_data[keep_len++] = i; } } @@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { const T inter_ymin = 
std::max(box1[1], box2[1]); const T inter_xmax = std::min(box1[2], box2[2]); const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; + const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1); + const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1); const T inter_area = inter_w * inter_h; const T bbox1_area = BBoxArea(box1, normalized); const T bbox2_area = BBoxArea(box2, normalized); diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 88757f25cd9a5789758640de2d9cae0b12350b25..dda423efd35b96f5e1d7c55389818f46ef3d8694 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("DistMat"), - "Input(DistMat) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Anchor"), + "Input(Anchor) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), + "Input(GtBoxes) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(Anchor) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), + "Input(ImInfo) of RpnTargetAssignOp should not be null"); PADDLE_ENFORCE( ctx->HasOutput("LocationIndex"), @@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->HasOutput("TargetLabel"), "Output(TargetLabel) of RpnTargetAssignOp should not be null"); - - auto in_dims = ctx->GetInputDim("DistMat"); - PADDLE_ENFORCE_EQ(in_dims.size(), 2, - "The rank of Input(DistMat) must be 2."); + PADDLE_ENFORCE( + ctx->HasOutput("TargetBBox"), + 
"Output(TargetBBox) of RpnTargetAssignOp should not be null"); + + auto anchor_dims = ctx->GetInputDim("Anchor"); + auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); + auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, + "The rank of Input(Anchor) must be 2."); + PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, + "The rank of Input(GtBoxes) must be 2."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); ctx->SetOutputDim("LocationIndex", {-1}); ctx->SetOutputDim("ScoreIndex", {-1}); @@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( - ctx.Input("DistMat")->type()), + ctx.Input("Anchor")->type()), platform::CPUPlace()); } }; template -class RpnTargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* anchor_t = context.Input("Anchor"); // (H*W*A) * 4 - auto* gt_bbox_t = context.Input("GtBox"); - auto* dist_t = context.Input("DistMat"); +void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +template +std::vector FilterStraddleAnchor( + const platform::CPUDeviceContext& context, const Tensor* anchor, + const float rpn_straddle_thresh, T im_height, T im_width) { + std::vector inds_inside; + int anchor_num = anchor->dims()[0]; + auto* anchor_data = anchor->data(); + if (rpn_straddle_thresh >= 0) { + int index; + for (int i = 0; i < anchor_num; ++i) { + index = i * 4; + if ((anchor_data[index + 0] >= -rpn_straddle_thresh) && + (anchor_data[index + 1] >= -rpn_straddle_thresh) && + (anchor_data[index + 2] < im_width + rpn_straddle_thresh) && + (anchor_data[index + 3] 
< im_height + rpn_straddle_thresh)) { + inds_inside.emplace_back(i); + } + } + } else { + for (int i = 0; i < anchor_num; ++i) { + inds_inside.emplace_back(i); + } + } + int inside_num = inds_inside.size(); + Tensor inds_inside_t; + int* inds_inside_data = + inds_inside_t.mutable_data({inside_num}, context.GetPlace()); + std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); + Tensor inside_anchor_t; + T* inside_anchor_data = + inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); + Gather(anchor->data(), 4, inds_inside_data, inside_num, + inside_anchor_data); + std::vector res; + res.emplace_back(inds_inside_t); + res.emplace_back(inside_anchor_t); + return res; +} + +template +Tensor FilterCrowdGt(const platform::CPUDeviceContext& context, + Tensor* gt_boxes, Tensor* is_crowd) { + int gt_num = gt_boxes->dims()[0]; + std::vector not_crowd_inds; + auto* is_crowd_data = is_crowd->data(); + for (int i = 0; i < gt_num; ++i) { + if (is_crowd_data[i] == 0) { + not_crowd_inds.emplace_back(i); + } + } + int ncrowd_num = not_crowd_inds.size(); + Tensor ncrowd_gt_boxes; + T* ncrowd_gt_boxes_data = + ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); + Gather(gt_boxes->data(), 4, not_crowd_inds.data(), ncrowd_num, + ncrowd_gt_boxes_data); + return ncrowd_gt_boxes; +} + +void ReservoirSampling(const int num, std::vector* inds, + std::minstd_rand engine, bool use_random) { + std::uniform_real_distribution uniform(0, 1); + size_t len = inds->size(); + if (len > static_cast(num)) { + if (use_random) { + for (size_t i = num; i < len; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < num) + std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); + } + } + inds->resize(num); + } +} + +template +void ScoreAssign(const T* anchor_by_gt_overlap_data, + const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max, + const int rpn_batch_size_per_im, const float rpn_fg_fraction, + const float rpn_positive_overlap, + 
const float rpn_negative_overlap, std::vector* fg_inds, + std::vector* bg_inds, std::vector* tgt_lbl, + std::minstd_rand engine, bool use_random) { + float epsilon = 0.00001; + int anchor_num = anchor_to_gt_max.dims()[0]; + int gt_num = gt_to_anchor_max.dims()[0]; + std::vector target_label(anchor_num, -1); + std::vector fg_inds_fake; + std::vector bg_inds_fake; + const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); + const T* gt_to_anchor_max_data = gt_to_anchor_max.data(); + // TODO(buxingyuan): Match with Detectron now + // but it seems here is a bug in two directions assignment + // in which the later one may overwrites the former one. + for (int64_t i = 0; i < anchor_num; ++i) { + bool is_anchors_with_max_overlap = false; + for (int64_t j = 0; j < gt_num; ++j) { + T value = anchor_by_gt_overlap_data[i * gt_num + j]; + T diff = std::abs(value - gt_to_anchor_max_data[j]); + if (diff < epsilon) { + is_anchors_with_max_overlap = true; + break; + } + } + bool is_anchor_great_than_thresh = + (anchor_to_gt_max_data[i] >= rpn_positive_overlap); + if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) { + fg_inds_fake.push_back(i); + } + } - auto* loc_index_t = context.Output("LocationIndex"); - auto* score_index_t = context.Output("ScoreIndex"); - auto* tgt_bbox_t = context.Output("TargetBBox"); - auto* tgt_lbl_t = context.Output("TargetLabel"); + // Reservoir Sampling + int fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); + ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); + fg_num = static_cast(fg_inds_fake.size()); + for (int64_t i = 0; i < fg_num; ++i) { + target_label[fg_inds_fake[i]] = 1; + } - auto lod = dist_t->lod().back(); - int64_t batch_num = static_cast(lod.size() - 1); - int64_t anchor_num = dist_t->dims()[1]; - PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]); + int bg_num = rpn_batch_size_per_im - fg_num; + for (int64_t i = 0; i < anchor_num; ++i) { + if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { + 
bg_inds_fake.push_back(i); + } + } + ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); + bg_num = static_cast(bg_inds_fake.size()); + for (int64_t i = 0; i < bg_num; ++i) { + target_label[bg_inds_fake[i]] = 0; + } - int rpn_batch_size = context.Attr("rpn_batch_size_per_im"); - float pos_threshold = context.Attr("rpn_positive_overlap"); - float neg_threshold = context.Attr("rpn_negative_overlap"); - float fg_fraction = context.Attr("fg_fraction"); + for (int64_t i = 0; i < anchor_num; ++i) { + if (target_label[i] == 1) fg_inds->emplace_back(i); + if (target_label[i] == 0) bg_inds->emplace_back(i); + } + fg_num = fg_inds->size(); + bg_num = bg_inds->size(); + + tgt_lbl->resize(fg_num + bg_num, 0); + std::vector fg_lbl(fg_num, 1); + std::vector bg_lbl(bg_num, 0); + std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data()); + std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num); +} + +template +std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, + const Tensor& anchor_by_gt_overlap, + const int rpn_batch_size_per_im, + const float rpn_positive_overlap, + const float rpn_negative_overlap, + const float rpn_fg_fraction, + std::minstd_rand engine, bool use_random) { + auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); + int anchor_num = anchor_by_gt_overlap.dims()[0]; + int gt_num = anchor_by_gt_overlap.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + std::vector tgt_lbl; + + // Calculate the max IoU between anchors and gt boxes + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + anchor_to_gt_max.mutable_data({anchor_num}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); + gt_to_anchor_max.mutable_data({gt_num}, place); + + auto anchor_by_gt_overlap_et = + framework::EigenMatrix::From(anchor_by_gt_overlap); + auto anchor_to_gt_max_et = + 
framework::EigenVector::Flatten(anchor_to_gt_max); + auto gt_to_anchor_max_et = + framework::EigenVector::Flatten(gt_to_anchor_max); + auto anchor_to_gt_argmax_et = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + anchor_to_gt_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); + anchor_to_gt_argmax_et = + anchor_by_gt_overlap_et.argmax(1).template cast(); + gt_to_anchor_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); + + // Follow the Faster RCNN's implementation + ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, + rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap, + rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine, + use_random); + + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + gt_inds.reserve(fg_num); + for (int i = 0; i < fg_num; ++i) { + gt_inds.emplace_back(argmax[fg_inds[i]]); + } - int fg_num_per_batch = static_cast(rpn_batch_size * fg_fraction); + Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t; + int* loc_index_data = loc_index_t.mutable_data({fg_num}, place); + int* score_index_data = + score_index_t.mutable_data({fg_num + bg_num}, place); + int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); + int* gt_inds_data = gt_inds_t.mutable_data({fg_num}, place); + std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data); + std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); + std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); + std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); + std::vector loc_score_tgtlbl_gt; + loc_score_tgtlbl_gt.emplace_back(loc_index_t); + loc_score_tgtlbl_gt.emplace_back(score_index_t); + loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); + loc_score_tgtlbl_gt.emplace_back(gt_inds_t); + + return loc_score_tgtlbl_gt; +} - int64_t max_num = batch_num * anchor_num; +template +class RpnTargetAssignKernel : public framework::OpKernel { 
+ public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_boxes = context.Input("GtBoxes"); + auto* is_crowd = context.Input("IsCrowd"); + auto* im_info = context.Input("ImInfo"); + + auto* loc_index = context.Output("LocationIndex"); + auto* score_index = context.Output("ScoreIndex"); + auto* tgt_bbox = context.Output("TargetBBox"); + auto* tgt_lbl = context.Output("TargetLabel"); + + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "RpnTargetAssignOp gt_boxes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "RpnTargetAssignOp is_crowd needs 1 level of LoD"); + int64_t anchor_num = static_cast(anchor->dims()[0]); + int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); + + int rpn_batch_size_per_im = context.Attr("rpn_batch_size_per_im"); + float rpn_straddle_thresh = context.Attr("rpn_straddle_thresh"); + float rpn_positive_overlap = context.Attr("rpn_positive_overlap"); + float rpn_negative_overlap = context.Attr("rpn_negative_overlap"); + float rpn_fg_fraction = context.Attr("rpn_fg_fraction"); + bool use_random = context.Attr("use_random"); + + int64_t max_num = batch_num * rpn_batch_size_per_im; auto place = context.GetPlace(); - tgt_bbox_t->mutable_data({max_num, 4}, place); - auto* loc_index = loc_index_t->mutable_data({max_num}, place); - auto* score_index = score_index_t->mutable_data({max_num}, place); + loc_index->mutable_data({max_num}, place); + score_index->mutable_data({max_num}, place); + tgt_bbox->mutable_data({max_num, 4}, place); + tgt_lbl->mutable_data({max_num, 1}, place); - Tensor tmp_tgt_lbl; - auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data({max_num}, place); auto& dev_ctx = context.device_context(); - math::SetConstant iset; - iset(dev_ctx, &tmp_tgt_lbl, static_cast(-1)); std::random_device rnd; std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? 
context.Attr("seed") : rnd(); + int seed = rnd(); engine.seed(seed); - int fg_num = 0; - int bg_num = 0; + framework::LoD lod_loc, loc_score; + std::vector lod0_loc(1, 0); + std::vector lod0_score(1, 0); + + int total_loc_num = 0; + int total_score_num = 0; + auto gt_boxes_lod = gt_boxes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); for (int i = 0; i < batch_num; ++i) { - Tensor dist = dist_t->Slice(lod[i], lod[i + 1]); - Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]); - auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold, - rpn_batch_size, fg_num_per_batch, engine, - tmp_lbl_data + i * anchor_num); - - int cur_fg_num = fg_bg_gt[0].size(); - int cur_bg_num = fg_bg_gt[1].size(); - std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index, - [i, anchor_num](int d) { return d + i * anchor_num; }); - memcpy(score_index, loc_index, cur_fg_num * sizeof(int)); - std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(), - score_index + cur_fg_num, - [i, anchor_num](int d) { return d + i * anchor_num; }); + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + auto* im_info_data = im_info_slice.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + + // Filter straddle anchor + std::vector filter_output = FilterStraddleAnchor( + dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); + Tensor inds_inside = filter_output[0]; + Tensor inside_anchor = filter_output[1]; + + // Filter crowd gt + Tensor ncrowd_gt_boxes = + FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); + auto ncrowd_gt_boxes_et = + framework::EigenTensor::From(ncrowd_gt_boxes); + ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; + + Tensor anchor_by_gt_overlap; + anchor_by_gt_overlap.mutable_data( + {inside_anchor.dims()[0], 
ncrowd_gt_boxes.dims()[0]}, place); + BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); + + auto loc_score_tgtlbl_gt = SampleRpnFgBgGt( + dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im, + rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine, + use_random); + + Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; + Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + + int loc_num = sampled_loc_index.dims()[0]; + int score_num = sampled_score_index.dims()[0]; + // unmap to all anchor + Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + sampled_loc_index_unmap.mutable_data({loc_num}, place); + sampled_score_index_unmap.mutable_data({score_num}, place); + Gather(inds_inside.data(), 1, sampled_loc_index.data(), + loc_num, sampled_loc_index_unmap.data()); + Gather(inds_inside.data(), 1, sampled_score_index.data(), + score_num, sampled_score_index_unmap.data()); // get target bbox deltas - if (cur_fg_num) { - Tensor fg_gt; - T* gt_data = fg_gt.mutable_data({cur_fg_num, 4}, place); - Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num); - T* tgt_data = tgt_bbox.data(); - Gather(anchor_t->data(), 4, - reinterpret_cast(&fg_bg_gt[0][0]), cur_fg_num, - tgt_data); - Gather(gt_bbox.data(), 4, reinterpret_cast(&fg_bg_gt[2][0]), - cur_fg_num, gt_data); - BoxToDelta(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox); - } - - loc_index += cur_fg_num; - score_index += cur_fg_num + cur_bg_num; - fg_num += cur_fg_num; - bg_num += cur_bg_num; - } - - int lbl_num = fg_num + bg_num; - PADDLE_ENFORCE_LE(fg_num, max_num); - PADDLE_ENFORCE_LE(lbl_num, max_num); - - tgt_bbox_t->Resize({fg_num, 4}); - loc_index_t->Resize({fg_num}); - score_index_t->Resize({lbl_num}); - auto* lbl_data = tgt_lbl_t->mutable_data({lbl_num, 1}, place); - Gather(tmp_lbl_data, 1, score_index_t->data(), lbl_num, - lbl_data); - } - - private: - void 
ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max, - const int row, const int col, const float pos_threshold, - const float neg_threshold, int64_t* target_label, - std::vector* fg_inds, std::vector* bg_inds) const { - float epsilon = 0.0001; - for (int64_t i = 0; i < row; ++i) { - const T* v = dist_data + i * col; - T max = *std::max_element(v, v + col); - for (int64_t j = 0; j < col; ++j) { - if (std::abs(max - v[j]) < epsilon) { - target_label[j] = 1; - } - } - } - - // Pick the fg/bg - const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); - for (int64_t j = 0; j < col; ++j) { - if (anchor_to_gt_max_data[j] >= pos_threshold) { - target_label[j] = 1; - } else if (anchor_to_gt_max_data[j] < neg_threshold) { - target_label[j] = 0; - } - if (target_label[j] == 1) { - fg_inds->push_back(j); - } else if (target_label[j] == 0) { - bg_inds->push_back(j); - } + Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + auto* sampled_anchor_data = + sampled_anchor.mutable_data({loc_num, 4}, place); + auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); + Gather(anchor->data(), 4, sampled_loc_index_unmap.data(), + loc_num, sampled_anchor_data); + Gather(ncrowd_gt_boxes.data(), 4, sampled_gt_index.data(), + loc_num, sampled_gt_data); + sampled_tgt_bbox.mutable_data({loc_num, 4}, place); + BoxToDelta(loc_num, sampled_anchor, sampled_gt, nullptr, false, + &sampled_tgt_bbox); + + // Add anchor offset + int anchor_offset = i * anchor_num; + auto sampled_loc_index_unmap_et = + framework::EigenTensor::From(sampled_loc_index_unmap); + sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; + auto sampled_score_index_unmap_et = + framework::EigenTensor::From(sampled_score_index_unmap); + sampled_score_index_unmap_et = + sampled_score_index_unmap_et + anchor_offset; + AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); + AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); + AppendRpns(tgt_bbox, total_loc_num 
* 4, &sampled_tgt_bbox); + AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); + total_loc_num += loc_num; + + total_score_num += score_num; + lod0_loc.emplace_back(total_loc_num); + lod0_score.emplace_back(total_score_num); } - } - - void ReservoirSampling(const int num, std::minstd_rand engine, - std::vector* inds) const { - std::uniform_real_distribution uniform(0, 1); - size_t len = inds->size(); - if (len > static_cast(num)) { - for (size_t i = num; i < len; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < num) - std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); - } - inds->resize(num); - } - } - // std::vector> RpnTargetAssign( - std::vector> SampleFgBgGt( - const platform::CPUDeviceContext& ctx, const Tensor& dist, - const float pos_threshold, const float neg_threshold, - const int rpn_batch_size, const int fg_num, std::minstd_rand engine, - int64_t* target_label) const { - auto* dist_data = dist.data(); - int row = dist.dims()[0]; - int col = dist.dims()[1]; - - std::vector fg_inds; - std::vector bg_inds; - std::vector gt_inds; - - // Calculate the max IoU between anchors and gt boxes - // Map from anchor to gt box that has highest overlap - auto place = ctx.GetPlace(); - Tensor anchor_to_gt_max, anchor_to_gt_argmax; - anchor_to_gt_max.mutable_data({col}, place); - int* argmax = anchor_to_gt_argmax.mutable_data({col}, place); - - auto x = framework::EigenMatrix::From(dist); - auto x_col_max = framework::EigenVector::Flatten(anchor_to_gt_max); - auto x_col_argmax = - framework::EigenVector::Flatten(anchor_to_gt_argmax); - x_col_max = x.maximum(Eigen::DSizes(0)); - x_col_argmax = x.argmax(0).template cast(); - - // Follow the Faster RCNN's implementation - ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold, - neg_threshold, target_label, &fg_inds, &bg_inds); - // Reservoir Sampling - ReservoirSampling(fg_num, engine, &fg_inds); - int fg_num2 = static_cast(fg_inds.size()); - int bg_num = rpn_batch_size - 
fg_num2; - ReservoirSampling(bg_num, engine, &bg_inds); - - gt_inds.reserve(fg_num2); - for (int i = 0; i < fg_num2; ++i) { - gt_inds.emplace_back(argmax[fg_inds[i]]); - } - std::vector> fg_bg_gt; - fg_bg_gt.emplace_back(fg_inds); - fg_bg_gt.emplace_back(bg_inds); - fg_bg_gt.emplace_back(gt_inds); - - return fg_bg_gt; + PADDLE_ENFORCE_LE(total_loc_num, max_num); + PADDLE_ENFORCE_LE(total_score_num, max_num); + + lod_loc.emplace_back(lod0_loc); + loc_score.emplace_back(lod0_score); + loc_index->set_lod(lod_loc); + score_index->set_lod(loc_score); + tgt_bbox->set_lod(lod_loc); + tgt_lbl->set_lod(loc_score); + loc_index->Resize({total_loc_num}); + score_index->Resize({total_score_num}); + tgt_bbox->Resize({total_loc_num, 4}); + tgt_lbl->Resize({total_score_num, 1}); } }; @@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Anchor", "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); - AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4]."); - AddInput( - "DistMat", - "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " - "[K, M]. It is pair-wise distance matrix between the entities " - "represented by each row and each column. For example, assumed one " - "entity is A with shape [K], another entity is B with shape [M]. The " - "DistMat[i][j] is the distance between A[i] and B[j]. The bigger " - "the distance is, the better macthing the pairs are. Please note, " - "This tensor can contain LoD information to represent a batch of " - "inputs. One instance of this batch can contain different numbers of " - "entities."); + AddInput("GtBoxes", + "(LoDTensor) input groud-truth bbox with shape [K, 4]."); + AddInput("IsCrowd", + "(LoDTensor) input which indicates groud-truth is crowd."); + AddInput("ImInfo", + "(LoDTensor) input image information with shape [N, 3]. 
" + "N is the batch size, each image information includes height, " + "width and scale."); + AddAttr("rpn_batch_size_per_im", + "Total number of RPN examples per image.") + .SetDefault(256); + AddAttr( + "rpn_straddle_thresh", + "Remove RPN anchors that go outside the image by straddle_thresh " + "pixels, " + "Set to -1 or a large value, e.g. 100000, to disable pruning anchors."); AddAttr( "rpn_positive_overlap", "Minimum overlap required between an anchor and ground-truth " @@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { "box for the (anchor, gt box) pair to be a negative examples.") .SetDefault(0.3); AddAttr( - "fg_fraction", + "rpn_fg_fraction", "Target fraction of RoI minibatch that " "is labeled foreground (i.e. class > 0), 0-th class is background.") .SetDefault(0.25); - AddAttr("rpn_batch_size_per_im", - "Total number of RPN examples per image.") - .SetDefault(256); - AddAttr("fix_seed", - "A flag indicating whether to use a fixed seed to generate " - "random mask. NOTE: DO NOT set this flag to true in " - "training. Setting this flag to true is only useful in " - "unittest.") - .SetDefault(false); - AddAttr("seed", "RpnTargetAssign random seed.").SetDefault(0); + AddAttr("use_random", + "A flag indicating whether to use a ReservoirSampling. " + "NOTE: DO NOT set this flag to false in training. 
" + "Setting this flag to false is only useful in unittest.") + .SetDefault(true); AddOutput( "LocationIndex", "(Tensor), The indexes of foreground anchors in all RPN anchors, the " @@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { "ScoreIndex is [F + B], F and B are sampled foreground and backgroud " " number."); AddOutput("TargetBBox", - "(Tensor), The target bbox deltas with shape " + "(Tensor), The target bbox deltas with shape " "[F, 4], F is the sampled foreground number."); AddOutput( "TargetLabel", - "(Tensor), The target labels of each anchor with shape " + "(Tensor), The target labels of each anchor with shape " "[F + B, 1], F and B are sampled foreground and backgroud number."); AddComment(R"DOC( -This operator can be, for given the IoU between the ground truth bboxes and the +This operator can be, for a given set of ground truth bboxes and the anchors, to assign classification and regression targets to each prediction. -The Score index and LocationIndex will be generated according to the DistMat. +The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU. 
The rest anchors would not contibute to the RPN training loss ScoreIndex is composed of foreground anchor indexes(positive labels) and diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 07ac20797ddab54296a45e99915588a40cc6f3c7..13682b78f0eccf049daa315f3a26aafd22e42a41 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -125,7 +125,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { + framework::AsyncIO([var_name_val, s, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -166,7 +166,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, s, this] { + s, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -290,12 +290,18 @@ void GRPCClient::Proceed() { c->Finish(false); } - delete c; + bool notify = false; { std::lock_guard lk(sync_mutex_); req_count_--; + notify = (req_count_ <= 0 || !c->status_.ok()); + } + + delete c; + + if (notify) { + sync_cond_.notify_all(); } - sync_cond_.notify_all(); } VLOG(3) << "GRPCClient Proceed end"; } diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index 2fab02e32fe18ee04f86a69bb5bae1cbe7c6762c..d2b0eb6ca6de1984dc7cfc2a662c88d5e56e1e05 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -82,8 +82,10 @@ class ProtoEncodeHelper { : base_(buf), p_(buf), limit_(base_ + max_size) {} ~ProtoEncodeHelper() { +#define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do 
operations that went over max_size promised - PADDLE_ENFORCE_LE(p_, limit_); + paddle::platform::throw_on_error(p_ <= limit_); +#undef REPLACE_ENFORCE_GLOG } const char* data() const { return base_; } diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3c3f9d17c871ac1cb4df83db17cf489d5b9e0563..3dbbd75b1e945208395c42ace3235db7891936c5 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -56,7 +56,7 @@ class VarHandle { const std::string& name, const platform::DeviceContext* p_ctx = nullptr, const framework::Scope* p_scope = nullptr) - : ok_(kVarHandleDefaultState) { + : status_(kDefaultState) { ep_ = ep; ctx_ = p_ctx; scope_ = p_scope; @@ -68,18 +68,20 @@ class VarHandle { public: bool Wait() { + int ret = kDefaultState; { std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; }); + wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); + ret = status_; } - VLOG(7) << "VarHandle wait:" << ok_; - return ok_ != 0; + VLOG(7) << "VarHandle wait:" << ret; + return ret != kErrorState; } void Finish(bool ok) { { std::unique_lock lk(sync_mutex_); - ok_ = ok; + status_ = ok ? 
kFinishState : kErrorState; } VLOG(7) << "VarHandle finish:" << ok; wait_cond_.notify_all(); @@ -87,8 +89,8 @@ class VarHandle { std::string String() const { std::ostringstream s; - s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_ - << "]"; + s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" + << status_ << "]"; return s.str(); } @@ -111,9 +113,13 @@ class VarHandle { protected: std::mutex sync_mutex_; std::condition_variable wait_cond_; - int ok_; - static const int kVarHandleDefaultState = -1; + enum VarHandleStatus { + kDefaultState = -1, + kErrorState = 0, + kFinishState = 1, + }; + VarHandleStatus status_; private: DISABLE_COPY_AND_ASSIGN(VarHandle); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 55e465e3af08c012b8cff7714452ed32b32a5556..8ca79d20ec4f6412b00dbf3990068f81b65e2efd 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fusion_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_lstm_compute.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" @@ -269,7 +270,6 @@ class FuisonLSTMKernel : public framework::OpKernel { blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ wh_data, D4, static_cast(1), out, D4) -// gates: W_ch, W_ih, W_fh, W_oh #define GET_Ct(ct_1, gates, ct) \ /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ act_cand(D, gates, gates); \ @@ -395,11 +395,22 @@ class FuisonLSTMKernel : public framework::OpKernel { } } } else { + // TODO(TJ): unly workaround, clean me + std::function compute_ctht; + if (platform::jit::MayIUse(platform::jit::avx) && + act_gate_str == "sigmoid" && act_cand_str == "tanh" && + act_cell_str == "tanh" && D == 8) { + compute_ctht = math::lstm_compute_ctht; + } else { + compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { + COMPUTE_CtHt(gates, ct_1, ct, ht); + }; + } for (int i = 0; i < N; ++i) { PROCESS_H0C0 for (int step = tstart; step < seq_len; ++step) { GEMM_WH_ADDON(1, prev_h_data, xx_data); - COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data); + compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data); MOVE_ONE_STEP; } } @@ -532,12 +543,23 @@ class FuisonLSTMKernel : public framework::OpKernel { MOVE_ONE_STEP; } } else { + // TODO(TJ): unly workaround, clean me + std::function compute_ctht; + if (platform::jit::MayIUse(platform::jit::avx) && + act_gate_str == "sigmoid" && act_cand_str == "tanh" && + act_cell_str == "tanh" && D == 8) { + compute_ctht = math::lstm_compute_ctht; + } else { + compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { + COMPUTE_CtHt(gates, ct_1, ct, ht); + }; + } for (int step = tstart; step < max_seq_len; ++step) { const int cur_bs = batch_starts[step + 1] - 
batch_starts[step]; GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); DEFINE_CUR; for (int i = 0; i < cur_bs; ++i) { - COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data, cur_h_out_data); MOVE_ONE_BATCH; } diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 966d78b84130c172c41e8049bf6bb1dc659d7d48..dc008d16971bc762b401ddece56f9ec56f7a47d6 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -59,17 +59,16 @@ static void ParallelExecuteBlocks( framework::ProgramDesc *program, framework::Scope *scope) { std::vector> fs; for (size_t idx : parallel_blkids) { - fs.push_back( - framework::Async([&executor, &prepared, &program, &scope, idx]() { - int run_block = idx; // thread local - try { - VLOG(3) << "running server block: " << run_block - << "pointer: " << prepared[run_block].get(); - executor->RunPreparedContext(prepared[run_block].get(), scope); - } catch (const std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - })); + fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() { + int run_block = idx; // thread local + try { + VLOG(3) << "running server block: " << run_block + << "pointer: " << prepared[run_block].get(); + executor->RunPreparedContext(prepared[run_block].get(), scope); + } catch (const std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + })); } for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d7f0f3c6280db7d121bf8821ec6d578e22a33da6..91101356436c26171eaca2fe01dfd4d937e71717 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -45,6 +45,8 @@ math_library(im2col) if (NOT WIN32) # windows do not support avx functions yet. 
math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) +# TODO(TJ): ugly workaround, clean me +cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info) endif (NOT WIN32) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..58e6512021203664573a0478dade052f92dd70bb --- /dev/null +++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/cpu_lstm_compute.h" + +namespace paddle { +namespace operators { +namespace math {} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..28b6f71729edf1b8cc5d610d76af78dea213313e --- /dev/null +++ b/paddle/fluid/operators/math/cpu_lstm_compute.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/cpu_info.h" +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { + +// TODO(TJ): ugly workaround, clean me +template +void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) { + // gates: W_ch, W_ih, W_fh, W_oh + vec_sigmoid(24, gates + 8, gates + 8); + vec_tanh(8, gates, gates); + const T *i = gates + 8, *f = gates + 16, *o = gates + 24; + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int d = 0; d < 8; ++d) { + // C_t = C_t-1 * fgated + cand_gated * igated + ct[d] = ct_1[d] * f[d] + gates[d] * i[d]; + // H_t = act_cell(C_t) * ogated + T tmp = ct[d] * 2; + tmp = static_cast(0) - ((tmp < min) ? min : ((tmp > max) ? 
max : tmp)); + vec_exp(1, &tmp, &tmp); + tmp = static_cast(2) / (static_cast(1) + tmp) - static_cast(1); + ht[d] = tmp * o[d]; + } +} + +#ifdef __AVX__ +namespace detail { +namespace forward { +namespace avx { +__m256 Sigmoid(const __m256 a); +__m256 Tanh(const __m256 a); +} // namespace avx +} // namespace forward +} // namespace detail + +template <> +void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, + float* ht) { + namespace act = detail::forward::avx; + // gates: W_ch, W_ih, W_fh, W_oh + __m256 c, i, f, o; + c = _mm256_loadu_ps(gates); + i = _mm256_loadu_ps(gates + 8); + f = _mm256_loadu_ps(gates + 16); + o = _mm256_loadu_ps(gates + 24); + + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); + i = _mm256_loadu_ps(ct_1); + f = _mm256_mul_ps(i, act::Sigmoid(f)); + f = _mm256_add_ps(c, f); + _mm256_storeu_ps(ct, f); + + /* H_t = act_cell(C_t) * ogated */ + o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); + _mm256_storeu_ps(ht, o); +} +#endif + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 9560e3a3c15ca63892fbe3552679a22f027f11e2..6a059968b79189458349e466079cc7a663a8e5ff 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" #ifdef __AVX__ #include #endif @@ -476,7 +477,7 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - LOG(FATAL) << "Not support type: " << type; + PADDLE_THROW("Not support type: %s", type); } }; diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index f25d3d3f1ee1f89d46b8e7c88ca68048f5203544..69318a6598c8c69eceab7216df6382537153d34f 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -103,6 +103,58 @@ class MaxSeqPoolGradFunctor { } }; +template +class LastSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, + framework::Tensor* output) { + // Create pointers to input and output data + auto* in_data = input.data(); + auto* out_data = output->data(); + + // Calculate the size of each item in sequence + int64_t item_size = input.numel() / input.dims()[0]; + auto lod = input.lod()[0]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + // Calculate the length of each sequence + int64_t seq_len = static_cast(lod[i + 1] - lod[i]); + // Point to the begin of next sequence + in_data += seq_len * item_size; + // Copy the last item of sequence to output + std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T)); + out_data += item_size; + } + } +}; + +template +class FirstSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, + framework::Tensor* output) { + // Create pointers to input and output data + auto* in_data = input.data(); + auto* out_data = output->data(); + + // Calculate the size of each item in sequence + int64_t item_size = input.numel() / input.dims()[0]; + auto lod = input.lod()[0]; + int seq_num = 
static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + // Calculate the length of each sequence + int64_t seq_len = static_cast(lod[i + 1] - lod[i]); + // Copy the first item of sequence to output + std::memcpy(out_data, in_data, item_size * sizeof(T)); + // Point to the next sequence + in_data += seq_len * item_size; + out_data += item_size; + } + } +}; + template class SequencePoolFunctor { public: @@ -116,6 +168,16 @@ class SequencePoolFunctor { max_pool(context, input, output, index); return; } + if (pooltype == "LAST") { + math::LastSeqPoolFunctor last_pool; + last_pool(context, input, output); + return; + } + if (pooltype == "FIRST") { + math::FirstSeqPoolFunctor first_pool; + first_pool(context, input, output); + return; + } auto lod = input.lod()[0]; auto& place = *context.eigen_device(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { @@ -133,10 +195,6 @@ class SequencePoolFunctor { } else if (pooltype == "SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); - } else if (pooltype == "LAST") { - out_e.device(place) = in_e.chip(h - 1, 0); - } else if (pooltype == "FIRST") { - out_e.device(place) = in_e.chip(0, 0); } else { PADDLE_THROW("unsupported pooling pooltype"); } diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index 058115cb624627d81b31d0903f7d615d19708c77..b2543d3d0d80f0573f2cbc755318c1b5a0982324 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -71,8 +71,7 @@ class MaxOutOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of MaxoutOp" - "should not be null."); + "Input(X) of MaxoutOpshould not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of MaxoutOp should not be null."); auto in_x_dims = ctx->GetInputDim("X"); @@ -90,9 +89,10 @@ 
class MaxOutOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MaxOutOpGrad must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Input(X@GRAD) should not be null."); + "Output(Grad@X) of MaxOutOpGrad should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 23d9ea88f6701f9f9e5e02948e996878a849ddd6..e0c4c81bdd5b5d0af3bafe632a2fa033efd08050 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -26,10 +26,13 @@ class PReluOp : public framework::OperatorWithKernel { std::string mode = ctx->Attrs().Get("mode"); auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PreluOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Alpha"), + "Input(Alpha) of PreluOp should not be null"); - PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PreluOp should not be null"); if (mode == "all") { PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1, "For mode 'all', size of weight Alpha must be one."); diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index 23e5fc1112d0b1e634d0ab288721cbba57b3ffe5..13df1d4b4bb6c240610f96ccc8f223fc984d63f7 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -42,7 +42,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase { 
auto *out_tensor = out_var->GetMutable(); auto &mem_tensor = mem_var->Get(); - out_tensor->ShareDataWith(mem_tensor); + framework::TensorCopySync(mem_tensor, dev_place, out_tensor); out_tensor->set_lod(mem_tensor.lod()); } }; @@ -50,8 +50,10 @@ class RNNMemoryHelperOp : public framework::OperatorBase { class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), ""); - PADDLE_ENFORCE(ctx->HasOutput("Out"), ""); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of rnn_memory_helper op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output of rnn_memory_helper op should not be null."); ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -107,7 +109,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { } else { auto &out_grad_tensor = out_grad_var->Get(); auto *in_grad_tensor = in_grad_var->GetMutable(); - in_grad_tensor->ShareDataWith(out_grad_tensor); + framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor); in_grad_tensor->set_lod(out_grad_tensor.lod()); } } @@ -133,8 +135,11 @@ class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { auto x_grad_name = framework::GradVarName("X"); - PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), ""); - PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), + "Gradient of Input(X) in rnn_memory_helper_grad of should " + "not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of rnn_memory_helper_grad of should not be null."); ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ x_grad_name); } diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 4bd23d594134f227e86b01fd75b7e202dd76c11b..e55462d6cfe389033a9c24a464fbf5b5d699f34f 
100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -25,7 +25,7 @@ class SliceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Input"), "Input (Input) of slice op should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -58,7 +58,7 @@ class SliceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace()); @@ -87,13 +87,13 @@ Slice Operator. Produces a slice of the input tensor along multiple axes. Similar to numpy: https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html -Slice uses `axes`, `starts` and `ends` attributes to specify the start and +Slice uses `axes`, `starts` and `ends` attributes to specify the start and end dimension for each axis in the list of axes, it uses this information -to slice the input data tensor. If a negative value is passed for any of -the start or end indices, it represents number of elements before the end +to slice the input data tensor. If a negative value is passed for any of +the start or end indices, it represents number of elements before the end of that dimension. If the value passed to start or end is larger than -the n (the number of elements in this dimension), it represents n. -For slicing to the end of a dimension with unknown size, it is recommended +the n (the number of elements in this dimension), it represents n. +For slicing to the end of a dimension with unknown size, it is recommended to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1]. 
Following examples will explain how slice works: @@ -119,15 +119,54 @@ Following examples will explain how slice works: } }; +class SliceOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("Input"); + auto x_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +class SliceOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); + bind->SetInput("Input", Input("Input")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + bind->SetAttrMap(Attrs()); + bind->SetType("slice_grad"); + return std::unique_ptr(bind); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker, - paddle::framework::EmptyGradOpMaker); + ops::SliceOpGradMaker); +REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad); REGISTER_OP_CPU_KERNEL( slice, ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, ops::SliceKernel); + +REGISTER_OP_CPU_KERNEL( + slice_grad, ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index 8c1767c70b19d1386af9610ef3405eb487a39878..5efecb78d1a4eaffc3a9c62e1e82a9bcb5922748 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -20,3 +20,10 @@ 
REGISTER_OP_CUDA_KERNEL( ops::SliceKernel, ops::SliceKernel, ops::SliceKernel); + +REGISTER_OP_CUDA_KERNEL( + slice_grad, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel); diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index ba231aee176564b91a642912ce0b32bcdef8cfc1..f38d08d7640794bd9a456a6c4ee1da2e04e96b37 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "paddle/fluid/framework/op_registry.h" @@ -84,5 +85,79 @@ class SliceKernel : public framework::OpKernel { out_t.device(place) = in_t.slice(offsets, extents); } }; + +template +class SliceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + size_t rank = ctx.Input(framework::GradVarName("Out")) + ->dims() + .size(); + switch (rank) { + case 1: + SliceCompute<1>(ctx); + break; + case 2: + SliceCompute<2>(ctx); + break; + case 3: + SliceCompute<3>(ctx); + break; + case 4: + SliceCompute<4>(ctx); + break; + case 5: + SliceCompute<5>(ctx); + break; + case 6: + SliceCompute<6>(ctx); + break; + } + } + + private: + template + void SliceCompute(const framework::ExecutionContext& context) const { + auto& place = + *context.template device_context().eigen_device(); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_input = + context.Output(framework::GradVarName("Input")); + d_input->mutable_data(context.GetPlace()); + auto out_dims = d_out->dims(); + auto in_dims = d_input->dims(); + auto axes = context.Attr>("axes"); + auto starts = context.Attr>("starts"); + + auto offsets = Eigen::array(); + auto extents = Eigen::array(); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = out_dims[i]; + } + int start; + for (size_t i = 0; i < axes.size(); ++i) { + start = starts[i]; + if (start < 0) { + 
start = (start + in_dims[axes[i]]); + } + start = std::max(start, 0); + offsets[axes[i]] = start; + } + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = offsets[i]; + paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; + } + auto d_in_t = + framework::EigenTensor::From( + *d_input); + auto d_out_t = + framework::EigenTensor::From( + *d_out); + d_in_t.device(place) = d_out_t.pad(paddings, 0); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index bc556ab3643cefa3e45d2a8a3835937753af723f..79e75ea9a035b654f0bb7026d3a491bebe0b23c4 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -160,11 +160,7 @@ class TensorRTEngineKernel : public framework::OpKernel { fluid_t->mutable_data(platform::CUDAPlace( boost::get(context.GetPlace()).device)), size * sizeof(float)); - //} else { - // engine->GetOutputInGPU( - // y, fluid_t->mutable_data(platform::CUDAPlace()), - // size * sizeof(float)); - //} + output_index += 1; } diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 65a3bc928e47ac60f06e7efc75f42703e45acbb4..791138a8c0eb3c477942a8b723206a8f8a3eac77 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -63,7 +63,7 @@ class WhileOp : public framework::OperatorBase { while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); - executor.RunPreparedContext(ctx.get(), ¤t_scope, false); + executor.RunPreparedContext(ctx.get(), ¤t_scope, false, true, true); if (is_test) { scope.DeleteScope(¤t_scope); } @@ -169,7 +169,8 @@ class WhileGradOp : public framework::OperatorBase { } } } - executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false); + executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true, + true); auto &pg_names = 
Outputs(kXGRAD); auto &p_names = Inputs(kX); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index e25efebe6c3555958f4f75e2b87b7dc45d4a4177..5af8af640e43a5b2e5ee9856f09f66a9fdf4463c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -51,7 +51,7 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc init.cc DEPS malloc +cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c6f1d1f3d544117311821d980300dffea03891a5..dfc079e986e93c7f02f17b299e5d6293edbedd05 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -210,11 +210,14 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) if (dynload::HasCUDNN()) { cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } + + callback_manager_.reset(new StreamCallbackManager(stream_)); } CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); + WaitStreamCallback(); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); eigen_stream_.reset(); eigen_device_.reset(); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3ed49fc4233d4c0cd6cc16319eda08480ab9b434..79539195157d74d4d757edee5e008cbb76c93ee2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -31,6 +31,9 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/stream_callback_manager.h" +#endif #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -112,6 +115,17 @@ class CUDADeviceContext : public DeviceContext { PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } + template + void AddStreamCallback(Callback&& callback) const { + std::lock_guard guard(callback_mtx_); + callback_manager_->AddCallback(callback); + } + + void WaitStreamCallback() const { + std::lock_guard guard(callback_mtx_); + callback_manager_->Wait(); + } + private: CUDAPlace place_; @@ -125,7 +139,12 @@ class CUDADeviceContext : public DeviceContext { int multi_process; int max_threads_per_mp; - std::mutex mtx_; + mutable std::mutex mtx_; + + // This lock is only used by callback + // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes + mutable std::mutex callback_mtx_; + std::unique_ptr callback_manager_; }; template <> diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..6c984065aa5fa1a8875aebe84051ab396bc417ec --- /dev/null +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "ThreadPool.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +using StreamCallback = std::function; + +class StreamCallbackManager; + +struct StreamCallbackContext { + template + inline StreamCallbackContext(const StreamCallbackManager *manager, + Callback &&callback) + : manager_(manager), callback_(callback) {} + + const StreamCallbackManager *manager_; // do not own + StreamCallback callback_; +}; + +class StreamCallbackManager { + public: + explicit inline StreamCallbackManager(cudaStream_t stream = nullptr) + : stream_(stream), thread_pool_(new ThreadPool(1)) {} + + template + inline void AddCallback(Callback &&callback) const { + AddCallbackWithStreamAndErrorInfo( + [=](cudaStream_t, cudaError_t) { callback(); }); + } + + template + inline void AddCallbackWithStreamAndErrorInfo(Callback &&callback) const { + auto *stream_callback_context = new StreamCallbackContext(this, callback); + PADDLE_ENFORCE(cudaStreamAddCallback( + stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); + } + + void Wait() const { thread_pool_.reset(new ThreadPool(1)); } + + private: + const cudaStream_t stream_; + mutable std::unique_ptr thread_pool_; + + // cudaStreamCallback cannot call CUDA API inside, so we have to use + // thread_pool here + static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, + void *user_data) { + auto *callback_context_ptr = + reinterpret_cast(user_data); + callback_context_ptr->manager_->thread_pool_->enqueue([=]() { + std::unique_ptr callback_context( + callback_context_ptr); + callback_context->callback_(stream, status); + }); + } +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 20fc08e21dadc12be8903476df374abb5caecf61..8bc30fc123163983f4bddc19af489920db93e0c0 100644 --- a/paddle/fluid/pybind/pybind.cc 
+++ b/paddle/fluid/pybind/pybind.cc @@ -683,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 719411bf6677c923397748289b95415c47fa299a..8572dc1e8e543b552e3ed5a180ec942faf90a624 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,6 +1,5 @@ cc_library(stringpiece SRCS piece.cc) cc_library(pretty_log SRCS pretty_log.cc) -cc_test(test_pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ad095b92711dccb44f26748bcfa89a0b4123c6e7..77b9b36e68c88eab35bcc1a88ce08a7b5940d55f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -33,6 +33,7 @@ function print_usage() { ${BLUE}single_test${NONE}: run a single unit test ${BLUE}bind_test${NONE}: parallel tests bind to different GPU ${BLUE}doc${NONE}: generate paddle documents + ${BLUE}gen_doc_lib${NONE}: generate paddle documents library ${BLUE}html${NONE}: convert C++ source code into HTML ${BLUE}dockerfile${NONE}: generate paddle release dockerfile ${BLUE}capi${NONE}: generate paddle CAPI package @@ -67,26 +68,44 @@ function cmake_gen() { # Support build for all python versions, currently # including cp27-cp27m and cp27-cp27mu. 
PYTHON_FLAGS="" - if [ "$1" != "" ]; then - echo "using python abi: $1" - if [ "$1" == "cp27-cp27m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} - export PATH=/opt/python/cp27-cp27m/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python - -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" - elif [ "$1" == "cp27-cp27mu" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} - export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python - -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" - elif [ "$1" == "cp35-cp35m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} - export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} - export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 + SYSTEM=`uname -s` + if [ "$SYSTEM" == "Darwin" ]; then + if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then + echo "using python abi: $1" + if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 + export PATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib" + else + exit 1 + 
fi + # TODO: qiyang add python3 part here + fi + else + if [ "$1" != "" ]; then + echo "using python abi: $1" + if [ "$1" == "cp27-cp27m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} + export PATH=/opt/python/cp27-cp27m/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python + -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" + elif [ "$1" == "cp27-cp27mu" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} + export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python + -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + elif [ "$1" == "cp35-cp35m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" + fi fi fi @@ -116,6 +135,8 @@ function cmake_gen() { -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_INFERENCE=${WITH_INFERENCE:-ON} + -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} + -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DPY_VERSION=${PY_VERSION:-2.7} ======================================== @@ -146,6 +167,8 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \ + 
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ + -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DPY_VERSION=${PY_VERSION:-2.7} } @@ -200,6 +223,19 @@ EOF make install -j `nproc` } +function build_mac() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + cat < 0), 0-th class is background. rpn_positive_overlap(float): Minimum overlap required between an anchor and ground-truth box for the (anchor, gt box) pair to be a positive @@ -129,45 +137,48 @@ def rpn_target_assign(loc, Examples: .. code-block:: python - loc = layers.data(name='location', shape=[2, 80], + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], append_batch_size=False, dtype='float32') - scores = layers.data(name='scores', shape=[2, 40], + cls_logits = layers.data(name='cls_logits', shape=[100, 1], append_batch_size=False, dtype='float32') anchor_box = layers.data(name='anchor_box', shape=[20, 4], append_batch_size=False, dtype='float32') - gt_box = layers.data(name='gt_box', shape=[10, 4], + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], append_batch_size=False, dtype='float32') loc_pred, score_pred, loc_target, score_target = - fluid.layers.detection_output(loc=location, - scores=scores, + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, anchor_box=anchor_box, - gt_box=gt_box) + gt_boxes=gt_boxes) """ helper = LayerHelper('rpn_target_assign', **locals()) - # Compute overlaps between the prior boxes and the gt boxes overlaps - iou = iou_similarity(x=gt_box, y=anchor_box) # Assign target label to anchors loc_index = helper.create_tmp_variable(dtype='int32') score_index = helper.create_tmp_variable(dtype='int32') - target_label = helper.create_tmp_variable(dtype='int64') + target_label = helper.create_tmp_variable(dtype='int32') target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) helper.append_op( type="rpn_target_assign", - inputs={'Anchor': 
anchor_box, - 'GtBox': gt_box, - 'DistMat': iou}, + inputs={ + 'Anchor': anchor_box, + 'GtBoxes': gt_boxes, + 'IsCrowd': is_crowd, + 'ImInfo': im_info + }, outputs={ 'LocationIndex': loc_index, 'ScoreIndex': score_index, 'TargetLabel': target_label, - 'TargetBBox': target_bbox, + 'TargetBBox': target_bbox }, attrs={ 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, 'rpn_positive_overlap': rpn_positive_overlap, 'rpn_negative_overlap': rpn_negative_overlap, - 'fg_fraction': fg_fraction + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random }) loc_index.stop_gradient = True @@ -175,12 +186,12 @@ def rpn_target_assign(loc, target_label.stop_gradient = True target_bbox.stop_gradient = True - scores = nn.reshape(x=scores, shape=(-1, 1)) - loc = nn.reshape(x=loc, shape=(-1, 4)) - predicted_scores = nn.gather(scores, score_index) - predicted_location = nn.gather(loc, loc_index) + cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1)) + bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) + predicted_cls_logits = nn.gather(cls_logits, score_index) + predicted_bbox_pred = nn.gather(bbox_pred, loc_index) - return predicted_scores, predicted_location, target_label, target_bbox + return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox def detection_output(loc, @@ -1258,15 +1269,17 @@ def anchor_generator(input, def generate_proposal_labels(rpn_rois, gt_classes, + is_crowd, gt_boxes, - im_scales, + im_info, batch_size_per_im=256, fg_fraction=0.25, fg_thresh=0.25, bg_thresh_hi=0.5, bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=None): + class_nums=None, + use_random=True): """ ** Generate proposal labels Faster-RCNN ** TODO(buxingyuan): Add Document @@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois, inputs={ 'RpnRois': rpn_rois, 'GtClasses': gt_classes, + 'IsCrowd': is_crowd, 'GtBoxes': gt_boxes, - 'ImScales': im_scales + 'ImInfo': im_info }, outputs={ 'Rois': rois, @@ -1302,7 
+1316,8 @@ def generate_proposal_labels(rpn_rois, 'bg_thresh_hi': bg_thresh_hi, 'bg_thresh_lo': bg_thresh_lo, 'bbox_reg_weights': bbox_reg_weights, - 'class_nums': class_nums + 'class_nums': class_nums, + 'use_random': use_random }) rois.stop_gradient = True diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index bd9f8b3c356ca1e43923d42beebaa5ab98158084..44af29d3390e35129d0ee65b31eacad6b28a9d60 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -142,11 +142,6 @@ class ParallelExecutor(object): main = main if main else framework.default_main_program() if scope == None: scope = executor.global_scope() - # FIXME(Yancey1989): it's a temporary approach to determinate the distribute - # train program, call self.bcast_param() at the end of each mini-batch. - self.is_dist = True if "recv" in [ - op.type for op in main.global_block().ops - ] else False if share_vars_from and not isinstance(share_vars_from, ParallelExecutor): @@ -286,21 +281,11 @@ class ParallelExecutor(object): self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - if self.is_dist: - self._bcast_params() - if return_numpy: return executor.as_numpy(arr) return [arr[i] for i in range(len(arr))] - def _bcast_params(self): - """ - Broadcast the parameters to other devices. It is used during - distributed training. 
- """ - self.executor._bcast_params(set(self.persistable_vars)) - @property def device_count(self): return len(self._act_places) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index e2564763d19d180f7c6933429dddf58c77be7bb8..56129641ce5900d82aedf243d2fa1eadfd6b8d86 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -148,51 +148,60 @@ class TestAnchorGenerator(unittest.TestCase): class TestGenerateProposalLabels(unittest.TestCase): def test_generate_proposal_labels(self): - rpn_rois = layers.data( - name='rpn_rois', - shape=[4, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - gt_classes = layers.data( - name='gt_classes', - shape=[6], - dtype='int32', - lod_level=1, - append_batch_size=False) - gt_boxes = layers.data( - name='gt_boxes', - shape=[6, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - im_scales = layers.data( - name='im_scales', - shape=[1], - dtype='float32', - lod_level=1, - append_batch_size=False) - class_nums = 5 - rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( - rpn_rois=rpn_rois, - gt_classes=gt_classes, - gt_boxes=gt_boxes, - im_scales=im_scales, - batch_size_per_im=2, - fg_fraction=0.5, - fg_thresh=0.5, - bg_thresh_hi=0.5, - bg_thresh_lo=0.0, - bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=class_nums) - assert rois.shape[1] == 4 - assert rois.shape[0] == labels_int32.shape[0] - assert rois.shape[0] == bbox_targets.shape[0] - assert rois.shape[0] == bbox_inside_weights.shape[0] - assert rois.shape[0] == bbox_outside_weights.shape[0] - assert bbox_targets.shape[1] == 4 * class_nums - assert bbox_inside_weights.shape[1] == 4 * class_nums - assert bbox_outside_weights.shape[1] == 4 * class_nums + program = Program() + with program_guard(program): + rpn_rois = layers.data( + name='rpn_rois', + shape=[4, 4], + dtype='float32', + 
lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + is_crowd = layers.data( + name='is_crowd', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_boxes = layers.data( + name='gt_boxes', + shape=[6, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + class_nums = 5 + rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + rpn_rois=rpn_rois, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_boxes=gt_boxes, + im_info=im_info, + batch_size_per_im=2, + fg_fraction=0.5, + fg_thresh=0.5, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=class_nums) + assert rois.shape[1] == 4 + assert rois.shape[0] == labels_int32.shape[0] + assert rois.shape[0] == bbox_targets.shape[0] + assert rois.shape[0] == bbox_inside_weights.shape[0] + assert rois.shape[0] == bbox_outside_weights.shape[0] + assert bbox_targets.shape[1] == 4 * class_nums + assert bbox_inside_weights.shape[1] == 4 * class_nums + assert bbox_outside_weights.shape[1] == 4 * class_nums class TestMultiBoxHead(unittest.TestCase): @@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase): def test_rpn_target_assign(self): program = Program() with program_guard(program): - loc_shape = [10, 50, 4] - score_shape = [10, 50, 2] + bbox_pred_shape = [10, 50, 4] + cls_logits_shape = [10, 50, 2] anchor_shape = [50, 4] - loc = layers.data( - name='loc', - shape=loc_shape, + bbox_pred = layers.data( + name='bbox_pred', + shape=bbox_pred_shape, append_batch_size=False, dtype='float32') - scores = layers.data( - name='scores', - shape=score_shape, + cls_logits = layers.data( + name='cls_logits', + shape=cls_logits_shape, append_batch_size=False, 
dtype='float32') anchor_box = layers.data( @@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase): shape=anchor_shape, append_batch_size=False, dtype='float32') - gt_box = layers.data( - name='gt_box', shape=[4], lod_level=1, dtype='float32') - + gt_boxes = layers.data( + name='gt_boxes', shape=[4], lod_level=1, dtype='float32') + is_crowd = layers.data( + name='is_crowd', + shape=[10], + dtype='int32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign( - loc=loc, - scores=scores, + bbox_pred=bbox_pred, + cls_logits=cls_logits, anchor_box=anchor_box, anchor_var=anchor_var, - gt_box=gt_box, + gt_boxes=gt_boxes, + is_crowd=is_crowd, + im_info=im_info, rpn_batch_size_per_im=256, - fg_fraction=0.25, + rpn_straddle_thresh=0.0, + rpn_fg_fraction=0.5, rpn_positive_overlap=0.7, rpn_negative_overlap=0.3) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8ac1cb164e158cf38d1c0570f5bf37ee6a6badae..958e72ce27f38e48da17ca738c24e665645ae033 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -29,6 +29,19 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test +if(APPLE) + message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext") + # this op is not support on mac + list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) + # TODO: add the unitest back when it fixed + list(REMOVE_ITEM TEST_OPS 
test_detection_map_op) + list(REMOVE_ITEM TEST_OPS test_desc_clone) + list(REMOVE_ITEM TEST_OPS test_debugger) + list(REMOVE_ITEM TEST_OPS test_program_code) + list(REMOVE_ITEM TEST_OPS test_dist_transformer) + list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) +endif() + function(py_test_modules TARGET_NAME) if(WITH_TESTING) set(options SERIAL) @@ -46,6 +59,7 @@ function(py_test_modules TARGET_NAME) endfunction() list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) +list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) @@ -61,11 +75,12 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) + py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) -py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) -py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 
e3db316698398ff693157d583ad1410d10dcf81d..3ec79f8ef6e6f70f1365eaa32352c284d294a1ea 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -92,7 +92,7 @@ class TrainTaskConfig(object): src_vocab_fpath = data_path + "vocab.bpe.32000" trg_vocab_fpath = data_path + "vocab.bpe.32000" train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de" - val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de" + val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut" pool_size = 2000 sort_type = None local = True @@ -624,11 +624,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, init = True # Validate and save the model for inference. - if TrainTaskConfig.val_file_pattern is not None: - val_avg_cost, val_ppl = test() - print("[%f]" % val_avg_cost) - else: - assert (False) + if batch_id == 0 or batch_id == 4: + if TrainTaskConfig.val_file_pattern is not None: + val_avg_cost, val_ppl = test() + print("[%f]" % val_avg_cost) + else: + assert (False) #import transformer_reader as reader @@ -1701,8 +1702,9 @@ class DistTransformer2x2(TestDistRunnerBase): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, place, args): - + def run_trainer(self, use_cuda, args): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + TrainTaskConfig.use_gpu = use_cuda sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( args.is_dist, not args.sync_mode) diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py index 08579c7dd62ea6aea87b053345211914a6be6237..82e704169e4b828549dab4c47b0fa46d9afd8f7e 100644 --- a/python/paddle/fluid/tests/unittests/test_desc_clone.py +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -109,15 +109,20 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): return t +from 
paddle.fluid.transpiler.details import op_to_code + + def operator_equal(a, b): + if op_to_code(a) != op_to_code(b): + raise ValueError("In operator_equal not equal\n") + for k, v in six.iteritems(a.__dict__): if isinstance(v, fluid.framework.Program) or \ isinstance(v, fluid.framework.Block): continue elif isinstance(v, core.OpDesc): - if v.serialize_to_string() != b.__dict__[k].serialize_to_string(): - raise ValueError("In operator_equal not equal:{0}\n".format(k)) + continue elif isinstance(v, collections.OrderedDict): v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0]) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index c0f5da5a1ae43847dff6348ea5f3e3bfd5e89ab9..37cad73019c529f64868b6ad3c6e2fffe59cc0d8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -61,9 +61,10 @@ class TestDistRunnerBase(object): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, place, args): + def run_trainer(self, use_cuda, args): import paddle import paddle.fluid as fluid + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=2) if args.mem_opt: @@ -91,7 +92,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce exe = fluid.ParallelExecutor( - True, + use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, build_strategy=build_stra) @@ -142,9 +143,8 @@ def runtime_main(test_class): if args.role == "pserver" and args.is_dist: model.run_pserver(args) else: - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - model.run_trainer(p, args) + use_cuda = True if core.is_compiled_with_cuda() else False + model.run_trainer(use_cuda, args) import paddle.compat as cpt @@ -225,11 +225,12 @@ class TestDistBase(unittest.TestCase): 
def check_with_place(self, model_file, delta=1e-3, check_error_log=False): # TODO(typhoonzero): should auto adapt GPU count on the machine. required_envs = { - "PATH": os.getenv("PATH"), - "PYTHONPATH": os.getenv("PYTHONPATH"), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_fraction_of_gpu_memory_to_use": "0.15", - "FLAGS_cudnn_deterministic": "1" + "FLAGS_cudnn_deterministic": "1", + "CPU_NUM": "1" } if check_error_log: diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index a8e6ce4cfe18384e405f1602429628914d2c2e00..e55f8707a9a8ac2b0d69c65b15e6593025511999 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle from test_dist_base import TestDistBase @@ -44,6 +45,14 @@ def download_files(): test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de' test_md5 = '9dd74a266dbdb25314183899f269b4a2' paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5) + # cut test data for faster CI + orig_path = os.path.join(paddle.dataset.common.DATA_HOME, + "test_dist_transformer", + "newstest2013.tok.bpe.32000.en-de") + head_path = os.path.join(paddle.dataset.common.DATA_HOME, + "test_dist_transformer", + "newstest2013.tok.bpe.32000.en-de.cut") + os.system("head -n10 %s > %s" % (orig_path, head_path)) class TestDistTransformer2x2Sync(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index b85501ef6b80d1f5004aa0dd08c3123d3bda48a5..a198b25520f97ce23b9c1ebb9cd82fc458222d73 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ 
b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase): t = self._transpiler_instance(config) - trainer_main = t.get_trainer_program() + trainer_main = t.get_trainer_program(wait_port=False) trainer_startup = fluid.default_startup_program() assert (src.num_blocks == 1) diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py similarity index 77% rename from python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py rename to python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 6dc101b6dad8813893c6a891da0e16f952bb4c2d..2d5cd3b24bff52d82353ccf3fd2ecb69166c66c6 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -20,10 +20,10 @@ import paddle.fluid as fluid from op_test import OpTest -def generate_proposal_labels_in_python( - rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, - class_nums): +def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes, + im_info, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, + bbox_reg_weights, class_nums): rois = [] labels_int32 = [] bbox_targets = [] @@ -31,13 +31,13 @@ def generate_proposal_labels_in_python( bbox_outside_weights = [] lod = [] assert len(rpn_rois) == len( - im_scales), 'batch size of rpn_rois and ground_truth is not matched' + im_info), 'batch size of rpn_rois and ground_truth is not matched' - for im_i in range(len(im_scales)): + for im_i in range(len(im_info)): frcn_blobs = _sample_rois( - rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i], - batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, - bg_thresh_lo, bbox_reg_weights, class_nums) + rpn_rois[im_i], 
gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i], + im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh, + bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums) lod.append(frcn_blobs['rois'].shape[0]) @@ -50,13 +50,14 @@ def generate_proposal_labels_in_python( return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod -def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, - bbox_reg_weights, class_nums): +def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, + batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, + bg_thresh_lo, bbox_reg_weights, class_nums): rois_per_image = int(batch_size_per_im) fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) # Roidb + im_scale = im_info[2] inv_im_scale = 1. / im_scale rpn_rois = rpn_rois * inv_im_scale @@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ overlapped_boxes_ind] + crowd_ind = np.where(is_crowd)[0] + gt_overlaps[crowd_ind] = -1 + max_overlaps = gt_overlaps.max(axis=1) max_classes = gt_overlaps.argmax(axis=1) @@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, fg_inds = np.where(max_overlaps >= fg_thresh)[0] fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) # Sample foreground if there are too many - if fg_inds.shape[0] > fg_rois_per_this_image: - fg_inds = np.random.choice( - fg_inds, size=fg_rois_per_this_image, replace=False) + # if fg_inds.shape[0] > fg_rois_per_this_image: + # fg_inds = np.random.choice( + # fg_inds, size=fg_rois_per_this_image, replace=False) + fg_inds = fg_inds[:fg_rois_per_this_image] # Background bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= @@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, bg_rois_per_this_image = 
np.minimum(bg_rois_per_this_image, bg_inds.shape[0]) # Sample background if there are too many - if bg_inds.shape[0] > bg_rois_per_this_image: - bg_inds = np.random.choice( - bg_inds, size=bg_rois_per_this_image, replace=False) + # if bg_inds.shape[0] > bg_rois_per_this_image: + # bg_inds = np.random.choice( + # bg_inds, size=bg_rois_per_this_image, replace=False) + bg_inds = bg_inds[:bg_rois_per_this_image] keep_inds = np.append(fg_inds, bg_inds) sampled_labels = max_classes[keep_inds] @@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest): self.inputs = { 'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod), 'GtClasses': (self.gt_classes[0], self.gts_lod), + 'IsCrowd': (self.is_crowd[0], self.gts_lod), 'GtBoxes': (self.gt_boxes[0], self.gts_lod), - 'ImScales': self.im_scales[0] + 'ImInfo': self.im_info } self.attrs = { 'batch_size_per_im': self.batch_size_per_im, @@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest): 'bg_thresh_hi': self.bg_thresh_hi, 'bg_thresh_lo': self.bg_thresh_lo, 'bbox_reg_weights': self.bbox_reg_weights, - 'class_nums': self.class_nums + 'class_nums': self.class_nums, + 'use_random': False } self.outputs = { - 'Rois': (self.rois[0], [self.lod]), - 'LabelsInt32': (self.labels_int32[0], [self.lod]), - 'BboxTargets': (self.bbox_targets[0], [self.lod]), - 'BboxInsideWeights': (self.bbox_inside_weights[0], [self.lod]), - 'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]), + 'Rois': (self.rois, [self.lod]), + 'LabelsInt32': (self.labels_int32, [self.lod]), + 'BboxTargets': (self.bbox_targets, [self.lod]), + 'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]), + 'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]), } def test_check_output(self): @@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest): self.set_data() def init_test_params(self): - self.batch_size_per_im = 10 - self.fg_fraction = 1.0 + self.batch_size_per_im = 512 + self.fg_fraction = 0.25 self.fg_thresh = 0.5 self.bg_thresh_hi = 
0.5 self.bg_thresh_lo = 0.0 @@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest): def init_test_input(self): np.random.seed(0) - image_nums = 1 gt_nums = 6 # Keep same with batch_size_per_im for unittest - proposal_nums = self.batch_size_per_im - gt_nums - images_shape = [] - self.im_scales = [] - for i in range(image_nums): - images_shape.append(np.random.randint(200, size=2)) - self.im_scales.append(np.ones((1)).astype(np.float32)) + proposal_nums = 2000 #self.batch_size_per_im - gt_nums + images_shape = [[64, 64]] + self.im_info = np.ones((len(images_shape), 3)).astype(np.float32) + for i in range(len(images_shape)): + self.im_info[i, 0] = images_shape[i][0] + self.im_info[i, 1] = images_shape[i][1] + self.im_info[i, 2] = 0.8 #scale self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape, proposal_nums) @@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest): images_shape, self.class_nums, gt_nums) self.gt_classes = [gt['gt_classes'] for gt in ground_truth] self.gt_boxes = [gt['boxes'] for gt in ground_truth] + self.is_crowd = [gt['is_crowd'] for gt in ground_truth] def init_test_output(self): self.rois, self.labels_int32, self.bbox_targets, \ self.bbox_inside_weights, self.bbox_outside_weights, \ self.lod = generate_proposal_labels_in_python( - self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales, + self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info, self.batch_size_per_im, self.fg_fraction, self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo, self.bbox_reg_weights, self.class_nums ) + self.rois = np.vstack(self.rois) + self.labels_int32 = np.hstack(self.labels_int32) + self.labels_int32 = self.labels_int32[:, np.newaxis] + self.bbox_targets = np.vstack(self.bbox_targets) + self.bbox_inside_weights = np.vstack(self.bbox_inside_weights) + self.bbox_outside_weights = np.vstack(self.bbox_outside_weights) def _generate_proposals(images_shape, proposal_nums): @@ -280,7 +295,7 @@ def 
_generate_proposals(images_shape, proposal_nums): for i, image_shape in enumerate(images_shape): proposals = _generate_boxes(image_shape, proposal_nums) rpn_rois.append(proposals) - num_proposals += len(proposals) + num_proposals = len(proposals) rpn_rois_lod.append(num_proposals) return rpn_rois, [rpn_rois_lod] @@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums): gt_classes = np.random.randint( low=1, high=class_nums, size=gt_nums).astype(np.int32) gt_boxes = _generate_boxes(image_shape, gt_nums) - ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes)) + is_crowd = np.zeros((gt_nums), dtype=np.int32) + is_crowd[0] = 1 + ground_truth.append( + dict( + gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd)) num_gts += len(gt_classes) gts_lod.append(num_gts) return ground_truth, [gts_lod] diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py similarity index 88% rename from python/paddle/fluid/tests/unittests/test_generate_proposals.py rename to python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 3fbd2ce95a4f22b91cd4955f914e12f422b0ee83..86e27fe29ed945ec77fbbcdbd1c7cc6ecfba0fd5 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances): #anchor_loc: width, height, center_x, center_y anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) - anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] - anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] - anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2 - anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2 + anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1 + anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1 + anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * 
anchor_loc[:, 0] + anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1] #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32) @@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances): i, 0] + anchor_loc[i, 2] pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[ i, 1] + anchor_loc[i, 3] - pred_bbox[i, 2] = math.exp(variances[i, 2] * - bbox_deltas[i, 2]) * anchor_loc[i, 0] - pred_bbox[i, 3] = math.exp(variances[i, 3] * - bbox_deltas[i, 3]) * anchor_loc[i, 1] + pred_bbox[i, 2] = math.exp( + min(variances[i, 2] * bbox_deltas[i, 2], math.log( + 1000 / 16.0))) * anchor_loc[i, 0] + pred_bbox[i, 3] = math.exp( + min(variances[i, 3] * bbox_deltas[i, 3], math.log( + 1000 / 16.0))) * anchor_loc[i, 1] else: for i in range(bbox_deltas.shape[0]): pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[ i, 2] pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[ i, 3] - pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0] - pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1] + pred_bbox[i, 2] = math.exp( + min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i, + 0] + pred_bbox[i, 3] = math.exp( + min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i, + 1] proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 - proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 + proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1 + proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1 return proposals @@ -170,13 +176,16 @@ def filter_boxes(boxes, min_size, im_info): """Only keep boxes with both sides >= min_size and center within the image. 
""" # Scale min_size to match image scale - min_size *= im_info[2] + im_scale = im_info[2] + min_size = max(min_size, 1.0) ws = boxes[:, 2] - boxes[:, 0] + 1 hs = boxes[:, 3] - boxes[:, 1] + 1 + ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1 + hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1 x_ctr = boxes[:, 0] + ws / 2. y_ctr = boxes[:, 1] + hs / 2. - keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) & - (y_ctr < im_info[0]))[0] + keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) & + (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0] return keep @@ -204,7 +213,7 @@ def iou(box_a, box_b): xb = min(xmax_a, xmax_b) yb = min(ymax_a, ymax_b) - inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0) iou_ratio = inter_area / (area_a + area_b - inter_area) diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index 8ad11d76f683d556f05cafc3251acc942efef72f..e97a05b6f929821f82d96b462598a5ff03cf0a48 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -13,7 +13,7 @@ # limitations under the License. 
from __future__ import print_function - +import os import paddle.fluid as fluid import paddle import numpy as np @@ -41,6 +41,8 @@ class TestReaderReset(unittest.TestCase): self.data_file_name, reader, feeder) def setUp(self): + # set parallel threads to fit 20 batches in line 49 + os.environ['CPU_NUM'] = str(20) self.use_cuda = fluid.core.is_compiled_with_cuda() self.data_file_name = './reader_reset_test.recordio' self.ins_shape = [3] diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py index bd548009b3ada9512e4b5f7d7b61b67b0717a39b..f63dbcd3d7f6bfce3ccc1c42ae41afe42bfad003 100644 --- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py @@ -19,48 +19,58 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest from test_anchor_generator_op import anchor_generator_in_python -from test_generate_proposal_labels import _generate_groundtruth -from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta - - -def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im, - rpn_positive_overlap, rpn_negative_overlap, fg_fraction): - iou = np.transpose(gt_anchor_iou) - anchor_to_gt_max = iou.max(axis=1) - anchor_to_gt_argmax = iou.argmax(axis=1) - - gt_to_anchor_argmax = iou.argmax(axis=0) - gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])] - anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0] - - tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1 - tgt_lbl[anchors_with_max_overlap] = 1 - tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1 - - num_fg = int(fg_fraction * rpn_batch_size_per_im) - fg_inds = np.where(tgt_lbl == 1)[0] - if len(fg_inds) > num_fg: +from test_generate_proposal_labels_op import _generate_groundtruth +from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta + + +def 
rpn_target_assign(anchor_by_gt_overlap, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) + anchor_to_gt_max = anchor_by_gt_overlap[np.arange( + anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax] + + gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) + gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange( + anchor_by_gt_overlap.shape[1])] + anchors_with_max_overlap = np.where( + anchor_by_gt_overlap == gt_to_anchor_max)[0] + + labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1 + labels[anchors_with_max_overlap] = 1 + labels[anchor_to_gt_max >= rpn_positive_overlap] = 1 + + num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg and use_random: disable_inds = np.random.choice( fg_inds, size=(len(fg_inds) - num_fg), replace=False) - tgt_lbl[disable_inds] = -1 - fg_inds = np.where(tgt_lbl == 1)[0] + else: + disable_inds = fg_inds[num_fg:] + labels[disable_inds] = -1 + fg_inds = np.where(labels == 1)[0] - num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1) + num_bg = rpn_batch_size_per_im - np.sum(labels == 1) bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] - tgt_lbl[bg_inds] = 0 - if len(bg_inds) > num_bg: + if len(bg_inds) > num_bg and use_random: enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] - tgt_lbl[enable_inds] = 0 - bg_inds = np.where(tgt_lbl == 0)[0] - tgt_lbl[bg_inds] = 0 + else: + enable_inds = bg_inds[:num_bg] + labels[enable_inds] = 0 + fg_inds = np.where(labels == 1)[0] + bg_inds = np.where(labels == 0)[0] loc_index = fg_inds score_index = np.hstack((fg_inds, bg_inds)) - tgt_lbl = np.expand_dims(tgt_lbl, axis=1) + labels = labels[score_index] + assert not np.any(labels == -1), "Wrong labels with -1" gt_inds = anchor_to_gt_argmax[fg_inds] - return loc_index, score_index, tgt_lbl, gt_inds + 
return loc_index, score_index, labels, gt_inds def get_anchor(n, c, h, w): @@ -75,85 +85,129 @@ def get_anchor(n, c, h, w): return anchors -def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im, - rpn_positive_overlap, rpn_negative_overlap, fg_fraction): - - loc_indexes = [] - score_indexes = [] - tmp_tgt_labels = [] - tgt_bboxes = [] - anchor_num = anchor.shape[0] - +def rpn_target_assign_in_python(all_anchors, + gt_boxes, + is_crowd, + im_info, + lod, + rpn_straddle_thresh, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_num = all_anchors.shape[0] batch_size = len(lod) - 1 for i in range(batch_size): + im_height = im_info[i][0] + im_width = im_info[i][1] + im_scale = im_info[i][2] + if rpn_straddle_thresh >= 0: + # Only keep anchors inside the image by a margin of straddle_thresh + inds_inside = np.where( + (all_anchors[:, 0] >= -rpn_straddle_thresh) & + (all_anchors[:, 1] >= -rpn_straddle_thresh) & ( + all_anchors[:, 2] < im_width + rpn_straddle_thresh) & ( + all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0] + # keep only inside anchors + inside_anchors = all_anchors[inds_inside, :] + else: + inds_inside = np.arange(all_anchors.shape[0]) + inside_anchors = all_anchors + b, e = lod[i], lod[i + 1] - iou_slice = iou[b:e, :] - bboxes_slice = gt_boxes[b:e, :] + gt_boxes_slice = gt_boxes[b:e, :] * im_scale + is_crowd_slice = is_crowd[b:e] - loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign( - iou_slice, rpn_batch_size_per_im, rpn_positive_overlap, - rpn_negative_overlap, fg_fraction) + not_crowd_inds = np.where(is_crowd_slice == 0)[0] + gt_boxes_slice = gt_boxes_slice[not_crowd_inds] + iou = _bbox_overlaps(inside_anchors, gt_boxes_slice) - fg_bboxes = bboxes_slice[gt_inds] - fg_anchors = anchor[loc_idx] - box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.]) + loc_inds, score_inds, labels, gt_inds = rpn_target_assign( + iou, rpn_batch_size_per_im, 
rpn_positive_overlap, + rpn_negative_overlap, rpn_fg_fraction, use_random) + # unmap to all anchor + loc_inds = inds_inside[loc_inds] + score_inds = inds_inside[score_inds] + + sampled_gt = gt_boxes_slice[gt_inds] + sampled_anchor = all_anchors[loc_inds] + box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.]) if i == 0: - loc_indexes = loc_idx - score_indexes = score_idx - tmp_tgt_labels = tgt_lbl + loc_indexes = loc_inds + score_indexes = score_inds + tgt_labels = labels tgt_bboxes = box_deltas else: loc_indexes = np.concatenate( - [loc_indexes, loc_idx + i * anchor_num]) + [loc_indexes, loc_inds + i * anchor_num]) score_indexes = np.concatenate( - [score_indexes, score_idx + i * anchor_num]) - tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl]) + [score_indexes, score_inds + i * anchor_num]) + tgt_labels = np.concatenate([tgt_labels, labels]) tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) - tgt_labels = tmp_tgt_labels[score_indexes] return loc_indexes, score_indexes, tgt_bboxes, tgt_labels class TestRpnTargetAssignOp(OpTest): def setUp(self): n, c, h, w = 2, 4, 14, 14 - anchor = get_anchor(n, c, h, w) + all_anchors = get_anchor(n, c, h, w) gt_num = 10 - anchor = anchor.reshape(-1, 4) - anchor_num = anchor.shape[0] - - im_shapes = [[64, 64], [64, 64]] - gt_box, lod = _generate_groundtruth(im_shapes, 3, 4) - bbox = np.vstack([v['boxes'] for v in gt_box]) - - iou = _bbox_overlaps(bbox, anchor) - - anchor = anchor.astype('float32') - bbox = bbox.astype('float32') - iou = iou.astype('float32') - - loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob( - anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25) + all_anchors = all_anchors.reshape(-1, 4) + anchor_num = all_anchors.shape[0] + + images_shape = [[64, 64], [64, 64]] + #images_shape = [[64, 64]] + groundtruth, lod = _generate_groundtruth(images_shape, 3, 4) + lod = [0, 4, 8] + #lod = [0, 4] + + im_info = np.ones((len(images_shape), 3)).astype(np.float32) + for i in 
range(len(images_shape)): + im_info[i, 0] = images_shape[i][0] + im_info[i, 1] = images_shape[i][1] + im_info[i, 2] = 0.8 #scale + gt_boxes = np.vstack([v['boxes'] for v in groundtruth]) + is_crowd = np.hstack([v['is_crowd'] for v in groundtruth]) + + all_anchors = all_anchors.astype('float32') + gt_boxes = gt_boxes.astype('float32') + + rpn_straddle_thresh = 0.0 + rpn_batch_size_per_im = 256 + rpn_positive_overlap = 0.7 + rpn_negative_overlap = 0.3 + rpn_fg_fraction = 0.5 + use_random = False + + loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python( + all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh, + rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap, + rpn_fg_fraction, use_random) + labels = labels[:, np.newaxis] self.op_type = "rpn_target_assign" self.inputs = { - 'Anchor': anchor, - 'GtBox': (bbox, [[4, 4]]), - 'DistMat': (iou, [[4, 4]]), + 'Anchor': all_anchors, + 'GtBoxes': (gt_boxes, [[4, 4]]), + 'IsCrowd': (is_crowd, [[4, 4]]), + 'ImInfo': (im_info, [[1, 1]]) } self.attrs = { - 'rpn_batch_size_per_im': 25600, - 'rpn_positive_overlap': 0.95, - 'rpn_negative_overlap': 0.03, - 'fg_fraction': 0.25, - 'fix_seed': True + 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, + 'rpn_positive_overlap': rpn_positive_overlap, + 'rpn_negative_overlap': rpn_negative_overlap, + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random } self.outputs = { 'LocationIndex': loc_index.astype('int32'), 'ScoreIndex': score_index.astype('int32'), 'TargetBBox': tgt_bbox.astype('float32'), - 'TargetLabel': tgt_lbl.astype('int64'), + 'TargetLabel': labels.astype('int32') } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index 134df38eea6655857db04dfdc19dd7f7897946f4..4e6ed3a74b344da068bbfb60707838a1b4fc40fd 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -41,6 +41,9 @@ class TestSliceOp(OpTest): def test_check_output(self): self.check_output() + def test_check_grad_normal(self): + self.check_grad(['Input'], 'Out', max_relative_error=0.006) + class TestCase1(TestSliceOp): def config(self): diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index b75373cf24a7344bf59b3c6fcb9c4c3969be6503..43fd9d425bffb1e0198f4e845da959570a964990 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,8 +30,10 @@ class TestWhileOp(unittest.TestCase): "d1", shape=[10], append_batch_size=False, dtype='float32') d2 = layers.data( "d2", shape=[10], append_batch_size=False, dtype='float32') + i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True + init = layers.zeros(shape=[10], dtype='float32') mem_array = layers.array_write(x=init, i=i) data_array = layers.array_write(x=d0, i=i) @@ -45,11 +47,19 @@ class TestWhileOp(unittest.TestCase): i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = True - array_len = layers.fill_constant(shape=[1], dtype='int64', value=3) + array_len = layers.fill_constant(shape=[1], dtype='int64', value=1) array_len.stop_gradient = True cond = layers.less_than(x=i, y=array_len) + j = layers.fill_constant(shape=[1], dtype='int64', value=1) + j.stop_gradient = True + + array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) + array_len2.stop_gradient = True + cond2 = layers.less_than(x=j, y=array_len2) + while_op = layers.While(cond=cond) + while_op2 = layers.While(cond=cond2) with while_op.block(): d = 
layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) @@ -59,7 +69,16 @@ class TestWhileOp(unittest.TestCase): layers.array_write(result, i=i, array=mem_array) layers.less_than(x=i, y=array_len, cond=cond) - sum_result = layers.array_read(array=mem_array, i=i) + with while_op2.block(): + d2 = layers.array_read(array=data_array, i=j) + prev2 = layers.array_read(array=mem_array, i=j) + result2 = layers.sums(input=[d2, prev2]) + + j = layers.increment(x=j, in_place=True) + layers.array_write(result2, i=j, array=mem_array) + layers.less_than(x=j, y=array_len2, cond=cond2) + + sum_result = layers.array_read(array=mem_array, i=j) loss = layers.mean(sum_result) append_backward(loss) diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py index 5e98266a761c7e01bd6668e85e6adeb54103ca80..f33c05ed2f48c2498b98fc486d6ff7471088d77e 100644 --- a/python/paddle/fluid/transpiler/details/__init__.py +++ b/python/paddle/fluid/transpiler/details/__init__.py @@ -16,3 +16,4 @@ from __future__ import print_function from .program_utils import * from .ufind import * +from .checkport import * diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py new file mode 100644 index 0000000000000000000000000000000000000000..7bad4b427a2d53bd14c7a1f870ce74a883158d04 --- /dev/null +++ b/python/paddle/fluid/transpiler/details/checkport.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time +import socket +from contextlib import closing + + +def wait_server_ready(endpoints): + """ + Wait until parameter servers are ready, use connext_ex to detect + port readiness. + + Args: + endpoints (list): endpoints string list, like: + ["127.0.0.1:8080", "127.0.0.1:8081"] + + Examples: + .. code-block:: python + + wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) + """ + while True: + all_ok = True + for ep in endpoints: + ip_port = ep.split(":") + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + if not all_ok: + sys.stderr.write("pserver not ready, wait 3 sec to retry...\n") + sys.stderr.flush() + time.sleep(3) + else: + break diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index a83aa0f11eed9bfc1674d8d75dcfacc297f056b0..200175cfe87e24a53e1e229e41d1ff2a25fd66ec 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -113,27 +113,32 @@ def op_to_code(op): inputs_str += ", " inputs_str += "}" + attr_names = sorted(op.attr_names) attrs_str = "" - for i in range(0, len(op.attr_names)): - name = op.attr_names[i] + for i in range(0, len(attr_names)): + name = attr_names[i] attr_type = op.desc.attr_type(name) if attr_type == core.AttrType.BLOCK: a = "{name} = block[{value}]".format( name=name, type=attr_type, 
value=op.block_attr_id(name)) attrs_str += a + if i != len(attr_names) - 1: + attrs_str += ", " continue if attr_type == core.AttrType.BLOCKS: a = "{name} = blocks{value}".format( name=name, type=attr_type, value=op.blocks_attr_ids(name)) attrs_str += a + if i != len(attr_names) - 1: + attrs_str += ", " continue a = "{name} = {value}".format( name=name, type=attr_type, value=op.desc.attr(name)) attrs_str += a - if i != len(op.attr_names) - 1: + if i != len(attr_names) - 1: attrs_str += ", " if outputs_str != "{}": diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d4d218d547a394a56c040ade2a9ba703b691b86b..53c9cbe23dd82af866658fe46d1d631b0a3b26f3 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -381,7 +381,7 @@ class DistributeTranspiler(object): pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints) - def get_trainer_program(self): + def get_trainer_program(self, wait_port=True): """ Get transpiled trainer side program. 
@@ -393,6 +393,9 @@ class DistributeTranspiler(object): delete_ops(self.origin_program.global_block(), self.optimize_ops) self.origin_program.__str__() + if wait_port: + wait_server_ready(self.pserver_endpoints) + return self.origin_program def _get_trainer_startup_program(self, recv_vars, eplist): diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index adad2428f7fdc554cf4efd652f52b5c5de0ab527..49ba2cfd55bc881ed753fcefbd41f5b8fd4ebaf7 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -65,8 +65,43 @@ class InferenceTranspiler(object): if use_mkldnn: self._fuse_conv_bias_mkldnn(program) self._fuse_conv_relu_mkldnn(program) + self._fuse_conv_eltwise_mkldnn(program) + self._fuse_conv_relu_mkldnn( + program) # ResNet residual block merging self._fuse_bn_relu_mkldnn(program) + def _fuse_conv_eltwise_mkldnn(self, program): + ''' + Transpile the program fusing elementwise_add into conv for MKLDNN + program. Elementwise add following convolution OP can be fused by adding + 'fuse_eltwise' attribute to convolution OP and replacing its output + Tensor with second parameter of elementwise_add. + The result of fuse is: + - before: + - conv->elementwise_add->any_other_op + - after: + - conv->any_other_op + :param program: program to transpile + :type program: Program + ''' + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops): + current_op = self.block.ops[i] + if current_op.type in ['conv2d']: + next_op = self.block.ops[i + 1] + if next_op.type == 'elementwise_add': + self._fuse_conv_eltwise(current_op, next_op) + self.block._remove_op(i + 1) # Remove elementwise_add + i = i + 1 + self._adjust_input() + self._remove_unused_var() + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. 
+ # And a better solution will be considered later. + program = program.clone() + def _fuse_conv_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program. @@ -88,9 +123,9 @@ class InferenceTranspiler(object): if current_op.type in ['conv2d']: next_op = self.block.ops[i + 1] if next_op.type == 'relu': - # modify conv OP to include relu + # modify bnorm OP to include relu current_op.set_attr("fuse_relu", True) - # remove conv OP + # remove relu OP self.block._remove_op(i + 1) i = i + 1 @@ -409,6 +444,20 @@ class InferenceTranspiler(object): outputs={"Output": out_var}, attrs=attrs) + def _fuse_conv_eltwise(self, conv_op, eltwise_op): + ''' + fuse the conv op with elementwise_add + + :param conv_op: convolution operator + :type conv_op: Operator + :param eltwise_op: operator adding data from skip connection + :type eltwise_op: Operator + ''' + + conv_op.set_attr("fuse_eltwise", True) + self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0] + self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0] + def _adjust_input(self): for i in range(len(self.block.ops)): current_op = self.block.ops[i]