diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index ed696e82f8723eba573e8affd3f25e2aa6426e63..0d5c9652de6b814627e54018366137e214726619 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -140,5 +140,11 @@ def parse_args(): '--use_lars', action='store_true', help='If set, use lars for optimizers, ONLY support resnet module.') + parser.add_argument( + '--reduce_strategy', + type=str, + choices=['reduce', 'all_reduce'], + default='all_reduce', + help='Specify the reduce strategy, can be reduce, all_reduce') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 25622ee06c69e13181f34dfffadd5e299d31c8a8..ddd9fe809853a830ca676cc98f1819f683866def 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -170,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, strategy = fluid.ExecutionStrategy() strategy.num_threads = args.cpus strategy.allow_op_delay = False + build_strategy = fluid.BuildStrategy() + if args.reduce_strategy == "reduce": + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.AllReduce + avg_loss = train_args[0] if args.update_method == "pserver": @@ -184,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, avg_loss.name, main_program=train_prog, exec_strategy=strategy, + build_strategy=build_strategy, num_trainers=num_trainers, trainer_id=trainer_id) diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index cef8657ee629dcbc19221fd3440844a56627e920..f123e07fb711bd8ff67c1ecf5ec9a02c1e79eb1d 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -67,11 +67,14 @@ def cnn_model(data): def get_model(args, is_train, main_prog, startup_prog): # NOTE: mnist is small, we don't implement data sharding yet. 
-    filelist = [
-        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-    ]
+    opt = None
+    data_file_handle = None
     with fluid.program_guard(main_prog, startup_prog):
         if args.use_reader_op:
+            filelist = [
+                os.path.join(args.data_path, f)
+                for f in os.listdir(args.data_path)
+            ]
             data_file_handle = fluid.layers.open_files(
                 filenames=filelist,
                 shapes=[[-1, 1, 28, 28], (-1, 1)],
@@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog):
         if is_train:
             opt = fluid.optimizer.AdamOptimizer(
                 learning_rate=0.001, beta1=0.9, beta2=0.999)
-            opt.minimize()
+            opt.minimize(avg_cost)
         if args.memory_optimize:
             fluid.memory_optimize(main_prog)
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index d71b855612ae32083b2b2e3448db3749c340633b..1b3bfe659c7d97b58dc4121387d4db22266381c5 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -207,7 +207,7 @@ def get_model(args, is_train, main_prog, startup_prog):
     total_images = 1281167 / trainer_count
-    step = int(total_images / args.batch_size + 1)
+    step = int(total_images / (args.batch_size * args.gpus) + 1)
     epochs = [30, 60, 90]
     bd = [step * e for e in epochs]
     base_lr = args.learning_rate
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index ac19b1651893f18b14c62a0986df75bed25d7e80..8f65a737c43a124c05574d6eb9c3050fdab5299a 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
   DOC "Path to TensorRT library.")
 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+  if(WITH_DSO)
     set(TENSORRT_FOUND ON)
+  endif(WITH_DSO)
 else()
   set(TENSORRT_FOUND OFF)
 endif()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 842fde1ec5f16aeec28a790f1869eaed64e3516c..e362d3486487dd0b55e3e40d1c1358f2e5604ac5 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
 paddle.fluid.InferenceTranspiler.__init__
 paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
@@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg
 paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
 paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence',
'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) -paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) +paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) -paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None)) +paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 
'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index bf493a3fa44e48deec734250d04b2a413c3ed9da..7c5f5bd80a937bf1a1c891155764833d7b21c5c2 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent r("all_reduce", nullptr); + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf..4fdab5cd94358d08eac7f8b041bf16d09042f0bd 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -15,12 +15,15 @@ #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (places_.size() == 1) return; // The input and output may have dummy vars. 
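Both op handles above now open their profiler scope under the op's `Name()` with a concrete device context, instead of a hard-coded literal and `nullptr`, so events group per op instance in the timeline. As a self-contained stand-in, this is the RAII shape being relied on (an illustration of the idiom only, not Paddle's RecordEvent internals):

```cpp
#include <chrono>
#include <iostream>
#include <string>

// Stand-in for platform::RecordEvent: measures from construction to
// destruction, which is why the op handles declare it as the first
// statement of RunImpl() and never touch it again.
class ScopedEvent {
 public:
  explicit ScopedEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedEvent() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedEvent event("all_reduce");  // closes when the scope ends
  // ... the op body would run here ...
  return 0;
}
```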
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index d44ebbae4d4be9c79e629303805c94030b8879db..250e093a5f789dba6b06df4889c060c294d469fe 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -348,14 +348,31 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl(
   size_t cur_device_id = 0;
   bool is_forwarding = true;
+  bool is_dist_train = false;
   for (ir::Node *node : sorted_ops) {
     if (boost::get(
             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast(OpRole::kRPC)) {
-      CreateRPCOp(&result, node);
+      int op_dev_id = CreateRPCOp(&result, node);
+      PADDLE_ENFORCE(op_dev_id != -1,
+                     "Can not schedule the RPC operator to the right place.");
+      if (node->Op()->Type() == "recv") {
+        auto recv_vars_attr =
+            boost::get>(node->Op()->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        PADDLE_ENFORCE(recv_vars_attr.size() == 2UL);  // [parameter, gradient]
+        if (recv_vars_attr[0].find(".block") == std::string::npos) {
+          bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
+        }
+      }
+      is_dist_train = true;
     } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
-      CreateDistTrainOp(&result, node);
+      int op_dev_id = CreateDistTrainOp(&result, node);
+      if (node->Op()->Type() == "concat") {
+        auto origin_param_name = node->Op()->OutputArgumentNames()[0];
+        bcast_var_name_set[op_dev_id].emplace(origin_param_name);
+      }
     } else if (IsScaleLossOp(node)) {
       // user can customize loss@grad if not use_default_grad_scale_
       if (strategy_.gradient_scale_ !=
@@ -414,7 +431,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl(
               CreateReduceOp(&result, g_name, cur_device_id);
               graph->Get(kShardedVarDevice)
                   .emplace(g_name, cur_device_id);
-              bcast_var_name_set[cur_device_id].emplace(p_name);
+              if (!is_dist_train) {
+                bcast_var_name_set[cur_device_id].emplace(p_name);
+              }
               break;
             case BuildStrategy::ReduceStrategy::kAllReduce:
               if (IsSparseGradient(g_name)) {
@@ -436,14 +455,19 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl(
       }
     }
   }
   bool use_gpu = false;
 #ifdef PADDLE_WITH_CUDA
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
-  if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-    // Insert BCast Ops
+  // Insert broadcast operators principle:
+  // 1. Broadcast optimized parameters in Reduce strategy;
+  // 2. No need to broadcast optimized parameters in AllReduce strategy,
+  //    because the optimization sub-graph runs on every GPU;
+  // 3. Always broadcast received parameters in distributed training.
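Condensed, the three principles above reduce to the single predicate that the `if` right below implements. A standalone sketch, with illustrative names rather than the pass's actual helpers:

```cpp
#include <iostream>

enum class ReduceStrategy { kAllReduce, kReduce };

// Mirrors the insertion rule: broadcast ops are needed on GPU under the
// Reduce strategy (each parameter is optimized on one device only), and
// always in distributed training (received parameters live on one device);
// under AllReduce every device runs the optimizer, so nothing to broadcast.
bool ShouldInsertBroadcastOps(bool use_gpu, ReduceStrategy reduce,
                              bool is_dist_train) {
  return (use_gpu && reduce == ReduceStrategy::kReduce) || is_dist_train;
}

int main() {
  std::cout << ShouldInsertBroadcastOps(true, ReduceStrategy::kAllReduce, false)  // 0
            << ShouldInsertBroadcastOps(true, ReduceStrategy::kReduce, false)     // 1
            << ShouldInsertBroadcastOps(false, ReduceStrategy::kAllReduce, true)  // 1
            << "\n";
  return 0;
}
```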
+ if ((use_gpu && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || + is_dist_train) { for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; for (auto &bcast_name : to_bcast_set) { @@ -675,8 +699,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; std::vector input_var_names; std::vector output_var_names; @@ -719,6 +743,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, node->Op()->Type()); CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { @@ -737,8 +762,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. @@ -824,6 +849,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); } } + return op_dev_id; } bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index ac6d9c5a64cfde60f75c76dae0a30cc7d735e996..1ca8c4b855f9468589e537245380451a91a50b14 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { bool IsScaleLossOp(ir::Node *node) const; - void CreateRPCOp(ir::Graph *result, ir::Node *node) const; - void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; /** * Is this operator as the end-point operator before/after send operator. diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 6c7e5c1fb06620b1c071b00fcfcc1b4a29bf8d62..7fc06f234d42a992328c0b6164f17945d8075c28 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -27,7 +27,8 @@ namespace framework { namespace details { void ReduceOpHandle::RunImpl() { - platform::RecordEvent r("reduce", nullptr); + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (places_.size() == 1) return; // the input and output may have dummy var. 
auto in_var_handles = DynamicCast(inputs_); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 609e18581957f62b040e04e937873b7a8fa5785a..ba243979b34aa1f683de707525403becaf0a1c00 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() { ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(1) << place_ << "RUN Scale loss grad op"; + VLOG(10) << place_ << "RUN Scale loss grad op"; }); #endif } diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index 4408cb45acb3d46e1addf5c25c238af50e5f5e5f..09c5ec59d66445bdbd5349447b125be89cb2efdf 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -58,7 +58,7 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( desc.SetInput("Input", std::vector({conv_relu_i_in})); desc.SetInput("Filter", std::vector({conv_relu_w_in})); desc.SetInput("Bias", std::vector({conv_relu_b_in})); - desc.SetOutput("Out", std::vector({conv_relu_out})); + desc.SetOutput("Output", std::vector({conv_relu_out})); desc.SetType("conv2d"); for (auto& attr : conv->Op()->GetAttrMap()) { desc.SetAttr(attr.first, attr.second); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 371384dc56eec91db1f621c0ebb65113e7a5a5cc..1a8d9cefbfa570d2ac3f4fc32d50d705ddc67a75 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -429,7 +429,7 @@ struct LSTM : public PatternBase { struct GRU : public PatternBase { GRU(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "lstm") {} + : PatternBase(pattern, name_scope, "gru") {} PDNode* operator()(PDNode* x); diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 399afbe64a56393176795ecdd1ac70bfedd5c91a..9bdbefc07cbc4bf7a4714927c84855837610430e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry { "mul_gru_fuse_pass", // "seq_concat_fc_fuse_pass", // "fc_fuse_pass", // +#ifdef PADDLE_WITH_MKLDNN + "conv_relu_mkldnn_fuse_pass", // +#endif }}; std::unordered_set disabled_ir_passes_; diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index c3a2dbf9d18f80f93381d485a3870e43411bd992..b879067d2f2f6294c50e0adb21f9399a7c36698a 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -120,13 +120,20 @@ void UnionContractedNodes(const std::unordered_map &node_map, outputs.insert(node); } - // update the dst and src node's inlinks and outlinks. +// update the dst and src node's inlinks and outlinks. 
+#ifdef __clang__
+  src_node->inlinks = std::vector(inputs.begin(), inputs.end());
+  src_node->outlinks = std::vector(outputs.begin(), outputs.end());
+  dst_node->inlinks.clear();
+  dst_node->outlinks.clear();
+#else
   src_node->inlinks = std::move(std::vector(inputs.begin(), inputs.end()));
   src_node->outlinks = std::move(std::vector(outputs.begin(), outputs.end()));
   dst_node->inlinks.clear();
   dst_node->outlinks.clear();
+#endif
   auto inlink_or_outlink_cleaner = [&](std::vector &nodes) {
     for (auto *&n : nodes) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 2a9a7aed480e76edbac4d5ba6d7bc3b8b2dc5006..684e0ce0e292d852d4601ebd1ccd920382e42c8b 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -77,6 +77,9 @@ bool AnalysisPredictor::Init(
   OptimizeInferenceProgram();
   ctx_ = executor_->Prepare(*inference_program_, 0);
+  if (config_._use_mkldnn) {
+    executor_->EnableMKLDNN(*inference_program_);
+  }
   VLOG(5) << "to create variables";
   PADDLE_ENFORCE(scope_.get());
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 5f1e1b548c7b7daa66932571d7053701bc0bd1f6..c71769a32f604358fe68c927546591310649f116 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include 
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -64,13 +64,15 @@ PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
 void PaddleBuf::Resize(size_t length) {
   // Only the owned memory can be reset, the external memory can't be changed.
-  if (length_ == length) return;
+  if (length_ >= length) return;
   if (memory_owned_) {
     Free();
+    data_ = malloc(length);
+    length_ = length;
+    memory_owned_ = true;
+  } else {
+    PADDLE_THROW("The memory is allocated externally, can not be resized");
   }
-  data_ = new char[length];
-  length_ = length;
-  memory_owned_ = true;
 }
 void PaddleBuf::Reset(void* data, size_t length) {
@@ -82,8 +84,8 @@ void PaddleBuf::Reset(void* data, size_t length) {
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
-    assert(length_ > 0);
-    delete[] static_cast(data_);
+    PADDLE_ENFORCE_GT(length_, 0);
+    free(static_cast(data_));
     data_ = nullptr;
     length_ = 0;
   }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 6fe13ed027de403bdc21882c26225bcd4cc7e49a..2e9e10139fa7008a46c3782960dfd44d3228cc26 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init(
   }
   ctx_ = executor_->Prepare(*inference_program_, 0);
+  if (config_._use_mkldnn) {
+    executor_->EnableMKLDNN(*inference_program_);
+  }
   executor_->CreateVariables(*inference_program_,
                              sub_scope_ ? sub_scope_ : scope_.get(), 0);
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index f6893be428feacbba85bab380e22972848eaeb93..8e359a67738c0df180933421b45f15b39fd0e78c 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
 }
 void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-               double latency) {
+               double latency, int epoch = 1) {
   LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
             << ", threads: " << num_threads << ", thread id: " << tid
             << ", latency: " << latency << "ms ======";
+  if (epoch > 1) {
+    int samples = batch_size * epoch;
+    LOG(INFO) << "====== sample number: " << samples
+              << ", average latency of each sample: " << latency / samples
+              << "ms ======";
+  }
 }
 }  // namespace inference
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 995da11e4a30eca72a91a53d3293aa8b033b012b..55a07ca705f9fafa9ea223a867300bd14e10c364 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -45,7 +45,7 @@ class PaddleBuf {
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
   // Own memory.
-  PaddleBuf(size_t length)
+  explicit PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
   // Resize to `length` bytes.
   void Resize(size_t length);
@@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config {
   bool use_gpu{false};
   int device{0};
   float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
+  // NOTE: Do not use this; it is only for internal tests and will be discarded later.
+  bool _use_mkldnn{false};
   // Specify the variable's name of each input.
   bool specify_input_name{false};
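The PaddleBuf changes are easy to misread, so here is the resulting contract in one place: `Resize()` is now a no-op whenever the owned buffer is already large enough, reallocates with malloc (matching the free-based `Free()`) when it must grow, and throws for externally owned memory, which only `Reset()` may re-point. A minimal sketch against the header as patched:

```cpp
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  // Owned memory, via the now-explicit size constructor.
  paddle::PaddleBuf owned(16);
  owned.Resize(8);   // no-op: 16 >= 8, the capacity is kept
  owned.Resize(64);  // grows: the old block is freed, 64 bytes are malloc'ed

  // Externally owned memory: Resize() refuses, Reset() just re-points.
  char backing[32];
  paddle::PaddleBuf external(backing, sizeof(backing));
  // external.Resize(64);      // would PADDLE_THROW: memory not owned
  external.Reset(backing, 16);  // fine: records the new view, no allocation
  return 0;
}
```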
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index d44a2cfa7f2d2f7dde5001006e05cdff1612435b..2d89fa89e72712dc236f91cf265ebac4b0198650 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -1,57 +1,77 @@
-function (inference_download_and_uncompress install_dir url)
-  get_filename_component(filename ${url} NAME)
-  message(STATUS "Download inference test stuff ${filename} from ${url}")
+set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
+set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo")
+set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
+function (inference_download_and_uncompress install_dir filename)
+  message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}")
   execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
-  execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
+  execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}")
   execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
   message(STATUS "finish downloading ${filename}")
 endfunction(inference_download_and_uncompress)
-function(download_model_and_data install_dir model_url data_url)
+function(download_model_and_data install_dir model_name data_name)
   if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
-    inference_download_and_uncompress(${install_dir} ${model_url})
-    inference_download_and_uncompress(${install_dir} ${data_url})
+    inference_download_and_uncompress(${install_dir} ${model_name})
+    inference_download_and_uncompress(${install_dir} ${data_name})
   endif()
 endfunction()
 # RNN1
-set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
-set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
-set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1")
-download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL})
-inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
-  EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
-  ARGS --infer_model=${RNN1_INSTALL_DIR}/model
-  --infer_data=${RNN1_INSTALL_DIR}/data.txt)
+# TODO: fix this test on MACOS
+message(WARNING "These tests have been disabled on OSX until they are fixed: \n test_analyzer_rnn1")
+if(NOT APPLE)
+  set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
+  download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
+  inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+    ARGS --infer_model=${RNN1_INSTALL_DIR}/model
+    --infer_data=${RNN1_INSTALL_DIR}/data.txt)
+endif(NOT APPLE)
+
+# RNN2
+set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
+download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
+inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc
+  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+  ARGS --infer_model=${RNN2_INSTALL_DIR}/model
+  --infer_data=${RNN2_INSTALL_DIR}/data.txt)
 # chinese_ner
-set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
-set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
-set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner")
-download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL}) +set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") +download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) # lac -set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") -set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") -set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac") -download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL}) +set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") +download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${LAC_INSTALL_DIR}/model --infer_data=${LAC_INSTALL_DIR}/data.txt) # text_classification -set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") -set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") -set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification") -download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL}) -inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta - --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt - --topn=1 # Just run top 1 batch. 
- ) +set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") +inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/model + --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt) + +# ocr +set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz") +set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr") +if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE) + get_filename_component(filename ${OCR_MODEL_URL} NAME) + message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}") + execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}") + execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}") + execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}") + message(STATUS "finish downloading ${filename}") +endif() +inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model + --infer_data=${OCR_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 7e00cb20ad0ce052a84d5491b0cdf167f0768081..bf893e32569f4b50a583ab6f43cb214ec3620e09 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -12,21 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_string(infer_model, "", "model path for LAC"); -DEFINE_string(infer_data, "", "data file for LAC"); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(burning, 0, "Burning before repeat."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -124,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, void TestLACPrediction(const std::string &model_path, const std::string &data_file, const int batch_size, - const int repeat, bool test_all_data, - bool use_analysis = false) { - NativeConfig config; - config.model_dir = model_path; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; + const int repeat, bool use_analysis = false) { + AnalysisConfig cfg; + cfg.model_dir = model_path; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + std::vector input_slots, outputs_slots; DataRecord data(data_file, batch_size); GetOneBatch(&input_slots, &data, batch_size); std::unique_ptr predictor; if (use_analysis) { - AnalysisConfig cfg; - cfg.model_dir = model_path; - cfg.use_gpu = false; - cfg.device = 0; - cfg.specify_input_name = true; - cfg.enable_ir_optim = true; predictor = CreatePaddlePredictor(cfg); } else { predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); } for (int i = 0; i < FLAGS_burning; i++) { predictor->Run(input_slots, &outputs_slots); } Timer timer; - if (test_all_data) { - double sum = 0; - LOG(INFO) << "Total number of samples: " << data.datasets.size(); - for (int i = 0; i < repeat; i++) { - for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { - GetOneBatch(&input_slots, &data, batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs_slots); - sum += timer.toc(); - } + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { + GetOneBatch(&input_slots, &data, batch_size); + input_slots_all.emplace_back(input_slots); } - PrintTime(batch_size, repeat, 1, 0, sum / repeat); - LOG(INFO) << "Average latency of each sample: " - << sum / repeat / data.datasets.size() << " ms"; + LOG(INFO) << "total number of samples: " << data.datasets.size(); + TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads); return; } timer.tic(); @@ -190,19 +166,10 @@ void TestLACPrediction(const std::string &model_path, if (use_analysis) { // run once for comparion as reference auto ref_predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); std::vector ref_outputs_slots; ref_predictor->Run(input_slots, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); - auto &ref_out = ref_outputs_slots[0]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - int64_t *pdata_ref = 
static_cast(ref_out.data.data()); - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(pdata_ref[i], pdata[i]); - } + CompareResult(ref_outputs_slots, outputs_slots); AnalysisPredictor *analysis_predictor = dynamic_cast(predictor.get()); @@ -231,13 +198,13 @@ void TestLACPrediction(const std::string &model_path, TEST(Analyzer_LAC, native) { LOG(INFO) << "LAC with native"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, - FLAGS_repeat, FLAGS_test_all_data); + FLAGS_repeat); } TEST(Analyzer_LAC, analysis) { LOG(INFO) << "LAC with analysis"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, - FLAGS_repeat, FLAGS_test_all_data, true); + FLAGS_repeat, true); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 6e8e43add7d3383fa79efea91c23750be9c8956f..f8c651e32f7e2ce1d8ced0e6774ffd555d351167 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -12,20 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -113,50 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; void TestChineseNERPrediction(bool use_analysis) { - NativeConfig config; - config.prog_file = FLAGS_infer_model + "/__model__"; - config.param_file = FLAGS_infer_model + "/param"; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; + AnalysisConfig cfg; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.param_file = FLAGS_infer_model + "/param"; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; std::vector input_slots, outputs; std::unique_ptr predictor; Timer timer; if (use_analysis) { - AnalysisConfig cfg; - cfg.prog_file = FLAGS_infer_model + "/__model__"; - cfg.param_file = FLAGS_infer_model + "/param"; - cfg.use_gpu = false; - cfg.device = 0; - cfg.specify_input_name = true; - cfg.enable_ir_optim = true; predictor = CreatePaddlePredictor(cfg); } else { predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); } if (FLAGS_test_all_data) { LOG(INFO) << "test all data"; - double sum = 0; - size_t num_samples; - for (int i = 0; i < FLAGS_repeat; i++) { - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - // Just one batch, the num_samples remains the same. 
- num_samples = data.num_samples; - for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) { - PrepareInputs(&input_slots, &data, FLAGS_batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs); - sum += timer.toc(); - } + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); } - LOG(INFO) << "total number of samples: " << num_samples; - PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); - LOG(INFO) << "average latency of each sample: " - << sum / FLAGS_repeat / num_samples; + LOG(INFO) << "total number of samples: " << data.num_samples; + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); return; } // Prepare inputs. @@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) { if (use_analysis) { // run once for comparion as reference auto ref_predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); std::vector ref_outputs_slots; ref_predictor->Run(input_slots, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs.size()); - auto &ref_out = ref_outputs_slots[0]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - int64_t *pdata_ref = static_cast(ref_out.data.data()); - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(pdata_ref[i], result[i]); - } + CompareResult(ref_outputs_slots, outputs); AnalysisPredictor *analysis_predictor = dynamic_cast(predictor.get()); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index b8ac468b4e98bcef81cdbbf66e3f1640c03a7ab8..df96be544eaf51c52aa5592966f499fad91aab82 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -12,24 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/analysis/analyzer.h" - -#include -#include -#include // NOLINT -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -164,26 +147,6 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void CompareResult(const std::vector &outputs, - const std::vector &base_outputs) { - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - auto &out = outputs[i]; - auto &base_out = base_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = static_cast(base_out.data.data()); - for (size_t i = 0; i < size; i++) { - EXPECT_NEAR(data[i], base_data[i], 1e-3); - } - } -} // Test with a really complicate model. void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { AnalysisConfig config; @@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { config.ir_passes.clear(); // Do not exclude any pass. int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; auto base_predictor = CreatePaddlePredictor(config); @@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { base_predictor->Run(input_slots, &base_outputs); + std::vector> input_slots_all; + input_slots_all.emplace_back(input_slots); if (num_threads == 1) { - // Prepare inputs. - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictor->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); + TestOneThreadPrediction(config, input_slots_all, &outputs); CompareResult(outputs, base_outputs); } else { - std::vector threads; - std::vector> predictors; - // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled - // because AttentionLSTM's hard code nodeid will be damanged. - for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back( - CreatePaddlePredictor( - config)); - } - for (int tid = 0; tid < num_threads; ++tid) { - threads.emplace_back([&, tid]() { - // Each thread should have local input_slots and outputs. 
-      std::vector input_slots;
-      DataRecord data(FLAGS_infer_data, batch_size);
-      PrepareInputs(&input_slots, &data, batch_size);
-      std::vector outputs;
-      Timer timer;
-      timer.tic();
-      for (int i = 0; i < num_times; i++) {
-        predictors[tid]->Run(input_slots, &outputs);
-      }
-      PrintTime(batch_size, num_times, num_threads, tid,
-                timer.toc() / num_times);
-      CompareResult(outputs, base_outputs);
-    });
-    }
-    for (int i = 0; i < num_threads; ++i) {
-      threads[i].join();
-    }
+    // only return the output of the first thread
+    TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
   }
   if (use_analysis && activate_ir) {
@@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) {
     // Directly infer with the original model.
     TestRNN1Prediction(false, false, i);
     // Inference with the original model with the analysis turned on, the
-    // analysis
-    // module will transform the program to a data flow graph.
+    // analysis module will transform the program to a data flow graph.
     TestRNN1Prediction(true, false, i);
     // Inference with analysis and IR. The IR module will fuse some large
     // kernels.
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c40ea58eea9c10a85acf84108f1d081a779f526d
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+
+#include 
+#include 
+#include   // NOLINT
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data path");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+
+namespace paddle {
+namespace inference {
+
+using namespace framework;  // NOLINT
+
+struct DataRecord {
+  std::vector>> link_step_data_all;
+  std::vector lod;
+  std::vector> rnn_link_data;
+  std::vector result_data;
+  size_t batch_iter{0};
+  size_t batch_size{1};
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE: skip the final batch if not enough data is provided.
+    if (batch_end <= link_step_data_all.size()) {
+      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
+                                     link_step_data_all.begin() + batch_end);
+      // Prepare LoDs
+      data.lod.push_back(0);
+      CHECK(!data.link_step_data_all.empty()) << "empty";
+      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
+        for (const auto &d : data.link_step_data_all[j]) {
+          data.rnn_link_data.push_back(d);
+          // calculate lod
+          data.lod.push_back(data.lod.back() + 11);
+        }
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
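Each record in this dataset is a flattened run of 11 time steps, so `NextBatch()` above advances the level-0 LoD by 11 for every feature group it appends. The offsets it accumulates, sketched in isolation (a standalone helper, not part of the tester):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Level-0 LoD offsets when every sequence contributes 11 rows:
// 3 sequences -> {0, 11, 22, 33}.
std::vector<size_t> BuildLod(size_t num_sequences) {
  std::vector<size_t> lod = {0};
  for (size_t i = 0; i < num_sequences; ++i) {
    lod.push_back(lod.back() + 11);
  }
  return lod;
}

int main() {
  for (size_t offset : BuildLod(3)) std::cout << offset << " ";  // 0 11 22 33
  std::cout << "\n";
  return 0;
}
```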
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector data;
+      split(line, ':', &data);
+      if (num_lines % 2) {  // feature
+        std::vector feature_data;
+        split(data[1], ' ', &feature_data);
+        std::vector> link_step_data;
+        int feature_count = 1;
+        std::vector feature;
+        for (auto &step_data : feature_data) {
+          std::vector tmp;
+          split_to_float(step_data, ',', &tmp);
+          feature.insert(feature.end(), tmp.begin(), tmp.end());
+          if (feature_count % 11 == 0) {  // each sample has 11 features
+            link_step_data.push_back(feature);
+            feature.clear();
+          }
+          feature_count++;
+        }
+        link_step_data_all.push_back(std::move(link_step_data));
+      } else {  // result
+        std::vector tmp;
+        split_to_float(data[1], ',', &tmp);
+        result_data.insert(result_data.end(), tmp.begin(), tmp.end());
+      }
+    }
+  }
+};
+void PrepareInputs(std::vector *input_slots, DataRecord *data,
+                   int batch_size) {
+  PaddleTensor feed_tensor;
+  feed_tensor.name = "feed";
+  auto one_batch = data->NextBatch();
+  int token_size = one_batch.rnn_link_data.size();
+  // each token has 11 features, each feature's dim is 54.
+  std::vector rnn_link_data_shape({token_size * 11, 54});
+  feed_tensor.shape = rnn_link_data_shape;
+  feed_tensor.lod.assign({one_batch.lod});
+  feed_tensor.dtype = PaddleDType::FLOAT32;
+  TensorAssignData(&feed_tensor, one_batch.rnn_link_data);
+  // Set inputs.
+  input_slots->assign({feed_tensor});
+}
+
+void CompareResult(const std::vector &outputs,
+                   const std::vector &base_result) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_GT(size, 0);
+    float *data = static_cast(out.data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(data[i], base_result[i], 1e-3);
+    }
+  }
+}
+// Test with a really complicated model.
+void TestRNN2Prediction() { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = true; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + + auto base_predictor = + CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor( + config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs, base_outputs; + + Timer timer1; + timer1.tic(); + for (int i = 0; i < num_times; i++) { + base_predictor->Run(input_slots, &base_outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times); + + Timer timer2; + timer2.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times); + + CompareResult(base_outputs, data.result_data); + CompareResult(outputs, data.result_data); +} + +TEST(Analyzer, rnn2) { TestRNN2Prediction(); } + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 65169f8cfcc5bf1e989609666f6e0ba03e42e5ba..1472c475e4a3061ffcad96925ea215a41a7e63eb 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -12,23 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. 
-#include -#include -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/inference/api/timer.h" - -DEFINE_string(infer_model, "", "Directory of the inference model."); -DEFINE_string(infer_data, "", "Path of the dataset."); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(repeat, 1, "How many times to repeat run."); -DEFINE_int32(topn, -1, "Run top n batches of data to save time"); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -37,24 +21,25 @@ struct DataReader { explicit DataReader(const std::string &path) : file(new std::ifstream(path)) {} - bool NextBatch(PaddleTensor *tensor, int batch_size) { + bool NextBatch(std::vector *input, int batch_size) { PADDLE_ENFORCE_EQ(batch_size, 1); std::string line; - tensor->lod.clear(); - tensor->lod.emplace_back(std::vector({0})); + PaddleTensor tensor; + tensor.dtype = PaddleDType::INT64; + tensor.lod.emplace_back(std::vector({0})); std::vector data; for (int i = 0; i < batch_size; i++) { if (!std::getline(*file, line)) return false; inference::split_to_int64(line, ' ', &data); } - tensor->lod.front().push_back(data.size()); + tensor.lod.front().push_back(data.size()); - tensor->data.Resize(data.size() * sizeof(int64_t)); - memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t)); - tensor->shape.clear(); - tensor->shape.push_back(data.size()); - tensor->shape.push_back(1); + tensor.data.Resize(data.size() * sizeof(int64_t)); + memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t)); + tensor.shape.push_back(data.size()); + tensor.shape.push_back(1); + input->assign({tensor}); return true; } @@ -68,32 +53,28 @@ void Main(int batch_size) { config.model_dir = FLAGS_infer_model; config.use_gpu = false; config.enable_ir_optim = true; - auto predictor = - CreatePaddlePredictor( - config); - - std::vector input_slots(1); - // one batch starts - // data -- - auto &input = input_slots[0]; - input.dtype = PaddleDType::INT64; - inference::Timer timer; - double sum = 0; - std::vector output_slots; + std::vector input_slots, output_slots; + DataReader reader(FLAGS_infer_data); + std::vector> input_slots_all; - int num_batches = 0; - for (int t = 0; t < FLAGS_repeat; t++) { - DataReader reader(FLAGS_infer_data); - while (reader.NextBatch(&input, FLAGS_batch_size)) { - if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break; - timer.tic(); - CHECK(predictor->Run(input_slots, &output_slots)); - sum += timer.toc(); + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + int num_batches = 0; + while (reader.NextBatch(&input_slots, FLAGS_batch_size)) { + input_slots_all.emplace_back(input_slots); ++num_batches; } + LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size; + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); + return; } - PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); + + // one batch starts + // data -- + reader.NextBatch(&input_slots, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); // Get output LOG(INFO) << "get outputs " << output_slots.size(); diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc 
b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a207c41b7140c806b4c1fdc7f24a317b165c9aef
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include 
+#include 
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct Record {
+  std::vector data;
+  std::vector shape;
+};
+
+Record ProcessALine(const std::string &line) {
+  VLOG(3) << "process a line";
+  std::vector columns;
+  split(line, '\t', &columns);
+  CHECK_EQ(columns.size(), 2UL)
+      << "data format error, should be \t";
+
+  Record record;
+  std::vector data_strs;
+  split(columns[0], ' ', &data_strs);
+  for (auto &d : data_strs) {
+    record.data.push_back(std::stof(d));
+  }
+
+  std::vector shape_strs;
+  split(columns[1], ' ', &shape_strs);
+  for (auto &s : shape_strs) {
+    record.shape.push_back(std::stoi(s));
+  }
+  VLOG(3) << "data size " << record.data.size();
+  VLOG(3) << "data shape size " << record.shape.size();
+  return record;
+}
+
+/*
+ * Use the native and analysis fluid engines to run inference on the demo
+ * models: ocr, mobilenet and se_resnext50.
+ */
+void TestVisualPrediction(bool use_mkldnn) {
+  std::unique_ptr predictor;
+  AnalysisConfig cfg;
+  cfg.param_file = FLAGS_infer_model + "/__params__";
+  cfg.prog_file = FLAGS_infer_model + "/__model__";
+  cfg.use_gpu = false;
+  cfg._use_mkldnn = use_mkldnn;
+  cfg.device = 0;
+  cfg.enable_ir_optim = true;
+  // TODO(TJ): fix fusion gru
+  cfg.ir_passes.push_back("fc_gru_fuse_pass");
+#ifdef PADDLE_WITH_MKLDNN
+  // disable the mkldnn fuse pass since it seems to have some bugs
+  cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
+#endif
+  predictor =
+      CreatePaddlePredictor(cfg);
+
+  // Only have single batch of data.
+  std::string line;
+  std::ifstream file(FLAGS_infer_data);
+  std::getline(file, line);
+  auto record = ProcessALine(line);
+  file.close();
+
+  // Inference.
+ PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + std::vector outputs_slots; + Timer timer; + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) { + predictor->Run({input}, &outputs_slots); + } + PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0, + timer.toc() / FLAGS_repeat); + + VLOG(3) << "output.size " << outputs_slots.size(); + + // run native as reference + auto ref_predictor = + CreatePaddlePredictor(cfg); + std::vector ref_outputs_slots; + ref_predictor->Run({input}, &ref_outputs_slots); + CompareResult(outputs_slots, ref_outputs_slots); + // print what are fused + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; +} + +TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, analysis_mkldnn) { + TestVisualPrediction(/*use_mkldnn*/ true); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..43e97614e3ad9c14c8deee9f340757f373eb593e --- /dev/null +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -0,0 +1,141 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
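The header whose body follows centralizes the shared gflags definitions (model/data paths, batch size, repeat count, thread count) and the single/multi-thread prediction drivers that the analyzer testers share, so each tester shrinks to data loading plus a call to TestPrediction. A hypothetical minimal tester built on it could read roughly as below; TestPrediction and the FLAGS_* names come from this header, everything else is assumed:

    // Hypothetical tester sketch on top of tester_helper.h.
    #include "paddle/fluid/inference/tests/api/tester_helper.h"

    namespace paddle {
    namespace inference {

    TEST(Analyzer_demo, profile) {
      AnalysisConfig cfg;
      cfg.model_dir = FLAGS_infer_model;  // supplied via --infer_model=...
      cfg.use_gpu = false;
      cfg.enable_ir_optim = true;

      std::vector<std::vector<PaddleTensor>> inputs;  // fill from a DataReader
      std::vector<PaddleTensor> outputs;
      TestPrediction(cfg, inputs, &outputs, FLAGS_num_threads);
    }

    }  // namespace inference
    }  // namespace paddle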
+
+#pragma once
+
+#include
+#include  // NOLINT
+#include
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data file");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+
+namespace paddle {
+namespace inference {
+
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &ref_outputs) {
+  EXPECT_GT(outputs.size(), 0);
+  EXPECT_EQ(outputs.size(), ref_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &ref_out = ref_outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_GT(size, 0);
+    EXPECT_EQ(size, ref_size);
+    EXPECT_EQ(out.dtype, ref_out.dtype);
+    switch (out.dtype) {
+      case PaddleDType::INT64: {
+        int64_t *pdata = static_cast<int64_t *>(out.data.data());
+        int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+      case PaddleDType::FLOAT32: {
+        float *pdata = static_cast<float *>(out.data.data());
+        float *pdata_ref = static_cast<float *>(ref_out.data.data());
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
+        }
+        break;
+      }
+    }
+  }
+}
+
+void TestOneThreadPrediction(
+    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
+    std::vector<PaddleTensor> *outputs) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  auto predictor =
+      CreatePaddlePredictor(
+          config);
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < num_times; i++) {
+    for (size_t j = 0; j < inputs.size(); j++) {
+      predictor->Run(inputs[j], outputs);
+    }
+  }
+  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
+            inputs.size());
+}
+
+void TestMultiThreadPrediction(
+    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
+    std::vector<PaddleTensor> *outputs, int num_threads) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  std::vector<std::thread> threads;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
+  // because AttentionLSTM's hard-coded node id would be damaged.
+  for (int tid = 0; tid < num_threads; ++tid) {
+    predictors.emplace_back(
+        CreatePaddlePredictor(
+            config));
+  }
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // Each thread should have local inputs and outputs.
+      // The inputs of each thread are all the same.
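One detail of the threading harness worth calling out: the worker lambda just opened captures by reference except for tid ([&, tid]), so the predictors vector is shared while each worker keeps its own index, its own copy of the inputs, and its own outputs. A standalone illustration of that capture pattern:

    // Illustration only: results shared by reference, tid frozen per worker.
    #include <iostream>
    #include <thread>
    #include <vector>

    int main() {
      std::vector<int> results(4, 0);
      std::vector<std::thread> workers;
      for (int tid = 0; tid < 4; ++tid) {
        workers.emplace_back([&, tid]() { results[tid] = tid * tid; });
      }
      for (auto &w : workers) w.join();
      for (int r : results) std::cout << r << ' ';  // prints: 0 1 4 9
      std::cout << '\n';
      return 0;
    }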
+ std::vector> inputs_tid = inputs; + std::vector outputs_tid; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs_tid.size(); j++) { + predictors[tid]->Run(inputs_tid[j], &outputs_tid); + } + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times, inputs_tid.size()); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + +void TestPrediction(AnalysisConfig config, + const std::vector> inputs, + std::vector *outputs, int num_threads) { + if (num_threads == 1) { + TestOneThreadPrediction(config, inputs, outputs); + } else { + TestMultiThreadPrediction(config, inputs, outputs, num_threads); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 7ec1e78da4ec642cb1e6248edfbcfed748fa11b8..ccb7fa1f8cce8cc757038904bce762af3b5ff30b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -296,6 +296,7 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) +op_library(fusion_lstm_op DEPS cpu_lstm_compute) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 3eb02c6b61ce61140bd777647a12477dd9c3c803..eae65968285703f5882d910e29bc5d8e1511cba6 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); bool fuse_relu = ctx.Attr("fuse_relu"); + bool fuse_eltwise = ctx.Attr("fuse_eltwise"); int groups = ctx.Attr("groups"); - // TODO(pzelazko-intel) add support for group convolution and dilation - PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet"); + // TODO: add support for dilation PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); @@ -314,6 +314,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // Get unique name for storing MKLDNN primitives @@ -327,7 +340,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto user_src_md = platform::MKLDNNMemDesc( {src_tz}, platform::MKLDNNGetDataType(), input->format()); auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? 
filter->format() : mkldnn::memory::format::goihw); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -340,7 +354,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw); std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. // Currently used whenever bias is != nullptr. auto dst_md = platform::MKLDNNMemDesc( @@ -352,12 +367,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz = paddle::framework::vectorize2int(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), memory::format::x); - conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, - paddings, mkldnn_engine, fuse_relu); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, fuse_eltwise); } else { - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, - paddings, mkldnn_engine, fuse_relu); + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_eltwise); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -407,16 +423,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } private: - mkldnn::primitive_attr AddRelu() const { - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. + mkldnn::primitive_attr CreatePostOps(bool fuse_relu, + bool fuse_eltwise) const { mkldnn::primitive_attr conv_attr; - constexpr float scale = 1.0f; - constexpr float negative_slope = 0.0f; - constexpr float placeholder = 0.0f; mkldnn::post_ops post_operations; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - negative_slope, placeholder); + // Fusion with Elementwise layer relies on adding a sum post-operation with + // the scale parameter. It is assumed that when fuse_eltwise is true, the + // Output tensor contains the data coming from residual connection. The + // result of this post_op is: Output = scale * Output + Conv_Out. + if (fuse_eltwise) { + post_operations.append_sum(1.0f); + } + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. 
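The ordering in CreatePostOps matters: when both fusions are requested, the sum post-op is appended before the eltwise one, so the residual data already sitting in the output buffer is accumulated first and ReLU is then applied to that sum, i.e. dst = relu(1.0 * dst + conv_out). A condensed sketch of the attribute construction, assuming the mkldnn 0.x C++ API this kernel is written against:

    // Sketch (mkldnn 0.x API): chain sum + eltwise_relu post-ops so the
    // primitive computes dst = relu(1.0f * dst_residual + conv_out).
    #include <mkldnn.hpp>

    mkldnn::primitive_attr MakeFusedConvAttr() {
      mkldnn::post_ops ops;
      ops.append_sum(1.0f);  // accumulate the residual already in dst
      ops.append_eltwise(1.0f /*scale*/, mkldnn::algorithm::eltwise_relu,
                         0.0f /*negative_slope*/, 0.0f /*placeholder*/);
      mkldnn::primitive_attr attr;
      attr.set_post_ops(ops);
      return attr;
    }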
+ if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } conv_attr.set_post_ops(post_operations); return conv_attr; } @@ -425,8 +451,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine, - const bool fuse_relu) const { + const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_eltwise) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -435,10 +461,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr; - if (fuse_relu) { - conv_attr = AddRelu(); - } + mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); @@ -452,8 +475,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& bias, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine, - const bool fuse_relu) const { + const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_eltwise) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -462,10 +485,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr; - if (fuse_relu) { - conv_attr = AddRelu(); - } + mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 41d4fcf6de7c8fcb3cfbb2063b0a2ac1a2356168..8f84bf71a7f77606bed6672f0830e3fc80165a42 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -164,6 +164,11 @@ void Conv2DOpMaker::Make() { .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_eltwise", + "(bool, default false) Only used in mkldnn kernel. Used " + "whenever convolution output is connected via skip connection " + "to a previous layer.") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 0dee1781623d5a62830545c0952e5aadbe37accb..6abeca1da443248d6ad3c1bcc64dd775d77f4ed8 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" @@ -21,7 +22,7 @@ namespace operators { */ template inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, - const framework::Tensor& gt_boxes, const T* weights, + const framework::Tensor& gt_boxes, const float* weights, const bool normalized, framework::Tensor* box_delta) { auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); @@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num, } } +template +void BboxOverlaps(const framework::Tensor& r_boxes, + const framework::Tensor& c_boxes, + framework::Tensor* overlaps) { + auto r_boxes_et = framework::EigenTensor::From(r_boxes); + auto c_boxes_et = framework::EigenTensor::From(c_boxes); + auto overlaps_et = framework::EigenTensor::From(*overlaps); + int r_num = r_boxes.dims()[0]; + int c_num = c_boxes.dims()[0]; + auto zero = static_cast(0.0); + T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, + inter_area; + for (int i = 0; i < r_num; ++i) { + r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * + (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); + for (int j = 0; j < c_num; ++j) { + c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * + (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); + x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); + y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); + x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); + y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); + inter_w = std::max(x_max - x_min + 1, zero); + inter_h = std::max(y_max - y_min + 1, zero); + inter_area = inter_w * inter_h; + overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index be06dc19743cfa6f093bcb3f4e9f91af315d4211..d7a53f1bef98ecda3ba7b36323678a11a632a15c 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Input(RpnRois) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("GtClasses"), "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), "Input(GtBoxes) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("ImScales"), - "Input(ImScales) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); PADDLE_ENFORCE(ctx->HasOutput("Rois"), "Output(Rois) of RpnTargetAssignOp should not be null"); @@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto gt_classes_dims = ctx->GetInputDim("GtClasses"); + auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto im_scales_dims = ctx->GetInputDim("ImScales"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, "The rank of Input(RpnRois) must be 2."); - PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1, - "The rank of Input(GtClasses) must be 1."); PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, "The rank of 
Input(GtBoxes) must be 2."); - PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1, - "The rank of Input(ImScales) must be 1."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); int class_nums = ctx->Attrs().Get("class_nums"); ctx->SetOutputDim("Rois", {-1, 4}); - ctx->SetOutputDim("LabelsInt32", {-1}); + ctx->SetOutputDim("LabelsInt32", {-1, 1}); ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); @@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context, concat_functor(context, inputs, axis, out_tensor); } -template -void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, - Tensor* overlaps) { - auto r_boxes_et = framework::EigenTensor::From(r_boxes); - auto c_boxes_et = framework::EigenTensor::From(c_boxes); - auto overlaps_et = framework::EigenTensor::From(*overlaps); - int r_num = r_boxes.dims()[0]; - int c_num = c_boxes.dims()[0]; - auto zero = static_cast(0.0); - T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, - inter_area; - for (int i = 0; i < r_num; ++i) { - r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * - (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); - for (int j = 0; j < c_num; ++j) { - c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * - (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); - x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); - y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); - x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); - y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); - inter_w = std::max(x_max - x_min + 1, zero); - inter_h = std::max(y_max - y_min + 1, zero); - inter_area = inter_w * inter_h; - overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); - } - } -} - template std::vector> SampleFgBgGt( const platform::CPUDeviceContext& context, Tensor* iou, - const int batch_size_per_im, const float fg_fraction, const float fg_thresh, - const float bg_thresh_hi, const float bg_thresh_lo, - std::minstd_rand engine) { + const Tensor& is_crowd, const int batch_size_per_im, + const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, + const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) { std::vector fg_inds; std::vector bg_inds; std::vector gt_inds; - T* proposal_to_gt_overlaps = iou->mutable_data(context.GetPlace()); + int64_t gt_num = is_crowd.numel(); + const int* crowd_data = is_crowd.data(); + T* proposal_to_gt_overlaps = iou->data(); int64_t row = iou->dims()[0]; int64_t col = iou->dims()[1]; float epsilon = 0.00001; @@ -152,6 +125,9 @@ std::vector> SampleFgBgGt( for (int64_t i = 0; i < row; ++i) { const T* v = proposal_to_gt_overlaps + i * col; T max_overlap = *std::max_element(v, v + col); + if ((i < gt_num) && (crowd_data[i])) { + max_overlap = -1.0; + } if (max_overlap > fg_thresh) { for (int64_t j = 0; j < col; ++j) { T val = proposal_to_gt_overlaps[i * col + j]; @@ -170,17 +146,19 @@ std::vector> SampleFgBgGt( } // Reservoir Sampling + std::uniform_real_distribution uniform(0, 1); int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); int fg_rois_this_image = fg_inds.size(); int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); - std::uniform_real_distribution uniform(0, 1); - const int64_t fg_size = static_cast(fg_inds.size()); - if (fg_size > fg_rois_per_this_image) { - for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { - int 
rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) { - std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); - std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + if (use_random) { + const int64_t fg_size = static_cast(fg_inds.size()); + if (fg_size > fg_rois_per_this_image) { + for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) { + std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); + std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + } } } } @@ -192,12 +170,14 @@ std::vector> SampleFgBgGt( int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; int bg_rois_this_image = bg_inds.size(); int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image); - const int64_t bg_size = static_cast(bg_inds.size()); - if (bg_size > bg_rois_per_this_image) { - for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) - std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + if (use_random) { + const int64_t bg_size = static_cast(bg_inds.size()); + if (bg_size > bg_rois_per_this_image) { + for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) + std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + } } } std::vector new_bg_inds(bg_inds.begin(), @@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( const platform::CPUDeviceContext& context, Tensor* rpn_rois, - Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale, + Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info, const int batch_size_per_im, const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, const std::vector& bbox_reg_weights, const int class_nums, - std::minstd_rand engine) { + std::minstd_rand engine, bool use_random) { auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); - auto im_scale_data = im_scale->data()[0]; - rpn_rois_et = rpn_rois_et / im_scale_data; + auto im_scale = im_info->data()[2]; + rpn_rois_et = rpn_rois_et / im_scale; Tensor boxes; int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; @@ -270,8 +250,8 @@ std::vector SampleRoisForOneImage( // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt( - context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction, - fg_thresh, bg_thresh_hi, bg_thresh_lo, engine); + context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; std::vector gt_inds = fg_bg_gt[2]; @@ -291,15 +271,15 @@ std::vector SampleRoisForOneImage( // Compute targets Tensor bbox_targets_single; bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); - BoxToDelta(fg_num, sampled_boxes, sampled_gts, nullptr, false, - &bbox_targets_single); + BoxToDelta(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(), + false, &bbox_targets_single); // Scale rois Tensor sampled_rois; sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); - 
sampled_rois_et = sampled_boxes_et * im_scale_data; + sampled_rois_et = sampled_boxes_et * im_scale; // Expand box targets Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; @@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* rpn_rois = context.Input("RpnRois"); auto* gt_classes = context.Input("GtClasses"); + auto* is_crowd = context.Input("IsCrowd"); auto* gt_boxes = context.Input("GtBoxes"); - auto* im_scales = context.Input("ImScales"); + auto* im_info = context.Input("ImInfo"); auto* rois = context.Output("Rois"); auto* labels_int32 = context.Output("LabelsInt32"); @@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::vector bbox_reg_weights = context.Attr>("bbox_reg_weights"); int class_nums = context.Attr("class_nums"); + bool use_random = context.Attr("use_random"); PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); PADDLE_ENFORCE_EQ( gt_classes->lod().size(), 1UL, "GenerateProposalLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateProposalLabelsOp is_crowd needs 1 level of LoD"); PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD"); int64_t n = static_cast(rpn_rois->lod().back().size() - 1); rois->mutable_data({n * batch_size_per_im, kBoxDim}, context.GetPlace()); - labels_int32->mutable_data({n * batch_size_per_im}, + labels_int32->mutable_data({n * batch_size_per_im, 1}, context.GetPlace()); bbox_targets->mutable_data({n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); @@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::random_device rnd; std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? 
context.Attr("seed") : rnd(); + int seed = rnd(); engine.seed(seed); framework::LoD lod; @@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { auto rpn_rois_lod = rpn_rois->lod().back(); auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back(); for (int i = 0; i < n; ++i) { Tensor rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); Tensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); Tensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor im_scales_slice = im_scales->Slice(i, i + 1); + Tensor im_info_slice = im_info->Slice(i, i + 1); std::vector tensor_output = SampleRoisForOneImage( - dev_ctx, &rpn_rois_slice, >_classes_slice, >_boxes_slice, - &im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh, - bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine); + dev_ctx, &rpn_rois_slice, >_classes_slice, &is_crowd_slice, + >_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, + engine, use_random); Tensor sampled_rois = tensor_output[0]; Tensor sampled_labels_int32 = tensor_output[1]; Tensor sampled_bbox_targets = tensor_output[2]; @@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { bbox_inside_weights->set_lod(lod); bbox_outside_weights->set_lod(lod); rois->Resize({num_rois, kBoxDim}); - labels_int32->Resize({num_rois}); + labels_int32->Resize({num_rois, 1}); bbox_targets->Resize({num_rois, kBoxDim * class_nums}); bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); @@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { // TODO(buxingyuan): Add Document AddInput("RpnRois", "RpnRois."); AddInput("GtClasses", "GtClasses."); + AddInput("IsCrowd", "IsCrowd."); AddInput("GtBoxes", "GtBoxes."); - AddInput("ImScales", "ImScales."); + AddInput("ImInfo", "ImInfo."); AddOutput("Rois", "Rois."); AddOutput("LabelsInt32", "LabelsInt32."); @@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("bg_thresh_lo", "bg_thresh_lo"); AddAttr>("bbox_reg_weights", "bbox_reg_weights"); AddAttr("class_nums", "class_nums"); - AddAttr("fix_seed", "fix_seed").SetDefault(false); - AddAttr("seed", "seed").SetDefault(0); + AddAttr("use_random", "use_random").SetDefault(true); AddComment(R"DOC( Generate Proposals Labels Operator. 
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index ebe6830eccd87a156768eb0d4b96220bcc9f4edc..c33aa255362bc5234f2813fb93e70c943b03c33f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, } for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len]; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1]; + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; - T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2; - T anchor_center_y = - (anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2; + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; T bbox_center_x = 0, bbox_center_y = 0; T bbox_width = 0, bbox_height = 0; @@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, bbox_center_y = variances_data[i * len + 1] * bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2]) * + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * anchor_width; - bbox_height = std::exp(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3]) * + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * anchor_height; } else { bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; bbox_center_y = bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; - bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * + anchor_height; } proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; } // return proposals; } @@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, float min_size, const Tensor &im_info, Tensor *keep) { const T *im_info_data = im_info.data(); T *boxes_data = boxes->mutable_data(ctx.GetPlace()); - min_size *= im_info_data[2]; + T im_scale = im_info_data[2]; keep->Resize({boxes->dims()[0], 1}); + min_size = std::max(min_size, 1.0f); int *keep_data = keep->mutable_data(ctx.GetPlace()); int keep_len = 0; for (int i = 0; i < boxes->dims()[0]; ++i) { T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; T x_ctr = 
boxes_data[4 * i] + ws / 2;
     T y_ctr = boxes_data[4 * i + 1] + hs / 2;
-    if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
-        y_ctr <= im_info_data[0]) {
+    if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
+        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
       keep_data[keep_len++] = i;
     }
   }
@@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
   const T inter_ymin = std::max(box1[1], box2[1]);
   const T inter_xmax = std::min(box1[2], box2[2]);
   const T inter_ymax = std::min(box1[3], box2[3]);
-  const T inter_w = inter_xmax - inter_xmin;
-  const T inter_h = inter_ymax - inter_ymin;
+  const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
+  const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
   const T inter_area = inter_w * inter_h;
   const T bbox1_area = BBoxArea(box1, normalized);
   const T bbox2_area = BBoxArea(box2, normalized);
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index 88757f25cd9a5789758640de2d9cae0b12350b25..dda423efd35b96f5e1d7c55389818f46ef3d8694 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
-                   "Input(DistMat) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Anchor"),
+                   "Input(Anchor) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
+                   "Input(GtBoxes) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
+                   "Input(IsCrowd) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
+                   "Input(ImInfo) of RpnTargetAssignOp should not be null");

     PADDLE_ENFORCE(
         ctx->HasOutput("LocationIndex"),
@@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(
         ctx->HasOutput("TargetLabel"),
         "Output(TargetLabel) of RpnTargetAssignOp should not be null");
-
-    auto in_dims = ctx->GetInputDim("DistMat");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
-                      "The rank of Input(DistMat) must be 2.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("TargetBBox"),
+        "Output(TargetBBox) of RpnTargetAssignOp should not be null");
+
+    auto anchor_dims = ctx->GetInputDim("Anchor");
+    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
+    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
+    PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
+                      "The rank of Input(Anchor) must be 2.");
+    PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
+                      "The rank of Input(GtBoxes) must be 2.");
+    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                      "The rank of Input(ImInfo) must be 2.");

     ctx->SetOutputDim("LocationIndex", {-1});
     ctx->SetOutputDim("ScoreIndex", {-1});
@@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(
-            ctx.Input<LoDTensor>("DistMat")->type()),
+            ctx.Input<Tensor>("Anchor")->type()),
         platform::CPUPlace());
   }
 };

 template <typename T>
-class RpnTargetAssignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* anchor_t = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
-    auto* gt_bbox_t = context.Input<Tensor>("GtBox");
-    auto* dist_t = context.Input<LoDTensor>("DistMat");
+void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) {
+  auto* out_data = out->data<T>();
+  auto* to_add_data = to_add->data<T>();
+  memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
+}
+
+template <typename T>
+std::vector<Tensor> FilterStraddleAnchor(
+    const platform::CPUDeviceContext& context, const Tensor* anchor,
+    const float rpn_straddle_thresh, T im_height, T im_width) {
+  std::vector<int> inds_inside;
+  int anchor_num = anchor->dims()[0];
+  auto* anchor_data = anchor->data<T>();
+  if (rpn_straddle_thresh >= 0) {
+    int index;
+    for (int i = 0; i < anchor_num; ++i) {
+      index = i * 4;
+      if ((anchor_data[index + 0] >= -rpn_straddle_thresh) &&
+          (anchor_data[index + 1] >= -rpn_straddle_thresh) &&
+          (anchor_data[index + 2] < im_width + rpn_straddle_thresh) &&
+          (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) {
+        inds_inside.emplace_back(i);
+      }
+    }
+  } else {
+    for (int i = 0; i < anchor_num; ++i) {
+      inds_inside.emplace_back(i);
+    }
+  }
+  int inside_num = inds_inside.size();
+  Tensor inds_inside_t;
+  int* inds_inside_data =
+      inds_inside_t.mutable_data<int>({inside_num}, context.GetPlace());
+  std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data);
+  Tensor inside_anchor_t;
+  T* inside_anchor_data =
+      inside_anchor_t.mutable_data<T>({inside_num, 4}, context.GetPlace());
+  Gather<T>(anchor->data<T>(), 4, inds_inside_data, inside_num,
+            inside_anchor_data);
+  std::vector<Tensor> res;
+  res.emplace_back(inds_inside_t);
+  res.emplace_back(inside_anchor_t);
+  return res;
+}
+
+template <typename T>
+Tensor FilterCrowdGt(const platform::CPUDeviceContext& context,
+                     Tensor* gt_boxes, Tensor* is_crowd) {
+  int gt_num = gt_boxes->dims()[0];
+  std::vector<int> not_crowd_inds;
+  auto* is_crowd_data = is_crowd->data<int>();
+  for (int i = 0; i < gt_num; ++i) {
+    if (is_crowd_data[i] == 0) {
+      not_crowd_inds.emplace_back(i);
+    }
+  }
+  int ncrowd_num = not_crowd_inds.size();
+  Tensor ncrowd_gt_boxes;
+  T* ncrowd_gt_boxes_data =
+      ncrowd_gt_boxes.mutable_data<T>({ncrowd_num, 4}, context.GetPlace());
+  Gather<T>(gt_boxes->data<T>(), 4, not_crowd_inds.data(), ncrowd_num,
+            ncrowd_gt_boxes_data);
+  return ncrowd_gt_boxes;
+}
+
+void ReservoirSampling(const int num, std::vector<int>* inds,
+                       std::minstd_rand engine, bool use_random) {
+  std::uniform_real_distribution<float> uniform(0, 1);
+  size_t len = inds->size();
+  if (len > static_cast<size_t>(num)) {
+    if (use_random) {
+      for (size_t i = num; i < len; ++i) {
+        int rng_ind = std::floor(uniform(engine) * i);
+        if (rng_ind < num)
+          std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
+      }
+    }
+    inds->resize(num);
+  }
+}
+
+template <typename T>
+void ScoreAssign(const T* anchor_by_gt_overlap_data,
+                 const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max,
+                 const int rpn_batch_size_per_im, const float rpn_fg_fraction,
+                 const float rpn_positive_overlap,
+                 const float rpn_negative_overlap, std::vector<int>* fg_inds,
+                 std::vector<int>* bg_inds, std::vector<int>* tgt_lbl,
+                 std::minstd_rand engine, bool use_random) {
+  float epsilon = 0.00001;
+  int anchor_num = anchor_to_gt_max.dims()[0];
+  int gt_num = gt_to_anchor_max.dims()[0];
+  std::vector<int> target_label(anchor_num, -1);
+  std::vector<int> fg_inds_fake;
+  std::vector<int> bg_inds_fake;
+  const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
+  const T* gt_to_anchor_max_data = gt_to_anchor_max.data<T>();
+  // TODO(buxingyuan): Match with Detectron for now,
+  // but there seems to be a bug in the two-direction assignment,
+  // in which the later direction may overwrite the former one.
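The loop that follows implements Detectron's two foreground rules in one pass: an anchor becomes foreground if it is the best-overlapping anchor for some gt box (tested via the epsilon comparison against the per-gt maximum), or if its own best IoU clears rpn_positive_overlap. Reduced to a predicate:

    // Detectron-style two-rule foreground test for one anchor, given its row
    // of IoUs, its best IoU, and the per-gt-box maxima.
    #include <cmath>
    #include <vector>

    bool IsForeground(const std::vector<float>& iou_row, float anchor_best_iou,
                      const std::vector<float>& gt_best_iou, float pos_overlap) {
      const float eps = 1e-5f;
      for (size_t j = 0; j < iou_row.size(); ++j) {
        if (std::fabs(iou_row[j] - gt_best_iou[j]) < eps) return true;  // rule 1
      }
      return anchor_best_iou >= pos_overlap;  // rule 2
    }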
+ for (int64_t i = 0; i < anchor_num; ++i) { + bool is_anchors_with_max_overlap = false; + for (int64_t j = 0; j < gt_num; ++j) { + T value = anchor_by_gt_overlap_data[i * gt_num + j]; + T diff = std::abs(value - gt_to_anchor_max_data[j]); + if (diff < epsilon) { + is_anchors_with_max_overlap = true; + break; + } + } + bool is_anchor_great_than_thresh = + (anchor_to_gt_max_data[i] >= rpn_positive_overlap); + if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) { + fg_inds_fake.push_back(i); + } + } - auto* loc_index_t = context.Output("LocationIndex"); - auto* score_index_t = context.Output("ScoreIndex"); - auto* tgt_bbox_t = context.Output("TargetBBox"); - auto* tgt_lbl_t = context.Output("TargetLabel"); + // Reservoir Sampling + int fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); + ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); + fg_num = static_cast(fg_inds_fake.size()); + for (int64_t i = 0; i < fg_num; ++i) { + target_label[fg_inds_fake[i]] = 1; + } - auto lod = dist_t->lod().back(); - int64_t batch_num = static_cast(lod.size() - 1); - int64_t anchor_num = dist_t->dims()[1]; - PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]); + int bg_num = rpn_batch_size_per_im - fg_num; + for (int64_t i = 0; i < anchor_num; ++i) { + if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { + bg_inds_fake.push_back(i); + } + } + ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); + bg_num = static_cast(bg_inds_fake.size()); + for (int64_t i = 0; i < bg_num; ++i) { + target_label[bg_inds_fake[i]] = 0; + } - int rpn_batch_size = context.Attr("rpn_batch_size_per_im"); - float pos_threshold = context.Attr("rpn_positive_overlap"); - float neg_threshold = context.Attr("rpn_negative_overlap"); - float fg_fraction = context.Attr("fg_fraction"); + for (int64_t i = 0; i < anchor_num; ++i) { + if (target_label[i] == 1) fg_inds->emplace_back(i); + if (target_label[i] == 0) bg_inds->emplace_back(i); + } + fg_num = fg_inds->size(); + bg_num = bg_inds->size(); + + tgt_lbl->resize(fg_num + bg_num, 0); + std::vector fg_lbl(fg_num, 1); + std::vector bg_lbl(bg_num, 0); + std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data()); + std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num); +} + +template +std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, + const Tensor& anchor_by_gt_overlap, + const int rpn_batch_size_per_im, + const float rpn_positive_overlap, + const float rpn_negative_overlap, + const float rpn_fg_fraction, + std::minstd_rand engine, bool use_random) { + auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); + int anchor_num = anchor_by_gt_overlap.dims()[0]; + int gt_num = anchor_by_gt_overlap.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + std::vector tgt_lbl; + + // Calculate the max IoU between anchors and gt boxes + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + anchor_to_gt_max.mutable_data({anchor_num}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); + gt_to_anchor_max.mutable_data({gt_num}, place); + + auto anchor_by_gt_overlap_et = + framework::EigenMatrix::From(anchor_by_gt_overlap); + auto anchor_to_gt_max_et = + framework::EigenVector::Flatten(anchor_to_gt_max); + auto gt_to_anchor_max_et = + framework::EigenVector::Flatten(gt_to_anchor_max); + auto anchor_to_gt_argmax_et = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + 
anchor_to_gt_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); + anchor_to_gt_argmax_et = + anchor_by_gt_overlap_et.argmax(1).template cast(); + gt_to_anchor_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); + + // Follow the Faster RCNN's implementation + ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, + rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap, + rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine, + use_random); + + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + gt_inds.reserve(fg_num); + for (int i = 0; i < fg_num; ++i) { + gt_inds.emplace_back(argmax[fg_inds[i]]); + } - int fg_num_per_batch = static_cast(rpn_batch_size * fg_fraction); + Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t; + int* loc_index_data = loc_index_t.mutable_data({fg_num}, place); + int* score_index_data = + score_index_t.mutable_data({fg_num + bg_num}, place); + int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); + int* gt_inds_data = gt_inds_t.mutable_data({fg_num}, place); + std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data); + std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); + std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); + std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); + std::vector loc_score_tgtlbl_gt; + loc_score_tgtlbl_gt.emplace_back(loc_index_t); + loc_score_tgtlbl_gt.emplace_back(score_index_t); + loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); + loc_score_tgtlbl_gt.emplace_back(gt_inds_t); + + return loc_score_tgtlbl_gt; +} - int64_t max_num = batch_num * anchor_num; +template +class RpnTargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_boxes = context.Input("GtBoxes"); + auto* is_crowd = context.Input("IsCrowd"); + auto* im_info = context.Input("ImInfo"); + + auto* loc_index = context.Output("LocationIndex"); + auto* score_index = context.Output("ScoreIndex"); + auto* tgt_bbox = context.Output("TargetBBox"); + auto* tgt_lbl = context.Output("TargetLabel"); + + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "RpnTargetAssignOp gt_boxes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "RpnTargetAssignOp is_crowd needs 1 level of LoD"); + int64_t anchor_num = static_cast(anchor->dims()[0]); + int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); + + int rpn_batch_size_per_im = context.Attr("rpn_batch_size_per_im"); + float rpn_straddle_thresh = context.Attr("rpn_straddle_thresh"); + float rpn_positive_overlap = context.Attr("rpn_positive_overlap"); + float rpn_negative_overlap = context.Attr("rpn_negative_overlap"); + float rpn_fg_fraction = context.Attr("rpn_fg_fraction"); + bool use_random = context.Attr("use_random"); + + int64_t max_num = batch_num * rpn_batch_size_per_im; auto place = context.GetPlace(); - tgt_bbox_t->mutable_data({max_num, 4}, place); - auto* loc_index = loc_index_t->mutable_data({max_num}, place); - auto* score_index = score_index_t->mutable_data({max_num}, place); + loc_index->mutable_data({max_num}, place); + score_index->mutable_data({max_num}, place); + tgt_bbox->mutable_data({max_num, 4}, place); + tgt_lbl->mutable_data({max_num, 1}, place); - Tensor tmp_tgt_lbl; - auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data({max_num}, place); auto& dev_ctx = 
context.device_context(); - math::SetConstant iset; - iset(dev_ctx, &tmp_tgt_lbl, static_cast(-1)); std::random_device rnd; std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + int seed = rnd(); engine.seed(seed); - int fg_num = 0; - int bg_num = 0; + framework::LoD lod_loc, loc_score; + std::vector lod0_loc(1, 0); + std::vector lod0_score(1, 0); + + int total_loc_num = 0; + int total_score_num = 0; + auto gt_boxes_lod = gt_boxes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); for (int i = 0; i < batch_num; ++i) { - Tensor dist = dist_t->Slice(lod[i], lod[i + 1]); - Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]); - auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold, - rpn_batch_size, fg_num_per_batch, engine, - tmp_lbl_data + i * anchor_num); - - int cur_fg_num = fg_bg_gt[0].size(); - int cur_bg_num = fg_bg_gt[1].size(); - std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index, - [i, anchor_num](int d) { return d + i * anchor_num; }); - memcpy(score_index, loc_index, cur_fg_num * sizeof(int)); - std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(), - score_index + cur_fg_num, - [i, anchor_num](int d) { return d + i * anchor_num; }); + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + auto* im_info_data = im_info_slice.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + + // Filter straddle anchor + std::vector filter_output = FilterStraddleAnchor( + dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); + Tensor inds_inside = filter_output[0]; + Tensor inside_anchor = filter_output[1]; + + // Filter crowd gt + Tensor ncrowd_gt_boxes = + FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); + auto ncrowd_gt_boxes_et = + framework::EigenTensor::From(ncrowd_gt_boxes); + ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; + + Tensor anchor_by_gt_overlap; + anchor_by_gt_overlap.mutable_data( + {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); + BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); + + auto loc_score_tgtlbl_gt = SampleRpnFgBgGt( + dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im, + rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine, + use_random); + + Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; + Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + + int loc_num = sampled_loc_index.dims()[0]; + int score_num = sampled_score_index.dims()[0]; + // unmap to all anchor + Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + sampled_loc_index_unmap.mutable_data({loc_num}, place); + sampled_score_index_unmap.mutable_data({score_num}, place); + Gather(inds_inside.data(), 1, sampled_loc_index.data(), + loc_num, sampled_loc_index_unmap.data()); + Gather(inds_inside.data(), 1, sampled_score_index.data(), + score_num, sampled_score_index_unmap.data()); // get target bbox deltas - if (cur_fg_num) { - Tensor fg_gt; - T* gt_data = fg_gt.mutable_data({cur_fg_num, 4}, place); - Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num); - T* tgt_data = tgt_bbox.data(); - Gather(anchor_t->data(), 4, - reinterpret_cast(&fg_bg_gt[0][0]), cur_fg_num, - tgt_data); - Gather(gt_bbox.data(), 4, 
reinterpret_cast(&fg_bg_gt[2][0]), - cur_fg_num, gt_data); - BoxToDelta(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox); - } - - loc_index += cur_fg_num; - score_index += cur_fg_num + cur_bg_num; - fg_num += cur_fg_num; - bg_num += cur_bg_num; - } - - int lbl_num = fg_num + bg_num; - PADDLE_ENFORCE_LE(fg_num, max_num); - PADDLE_ENFORCE_LE(lbl_num, max_num); - - tgt_bbox_t->Resize({fg_num, 4}); - loc_index_t->Resize({fg_num}); - score_index_t->Resize({lbl_num}); - auto* lbl_data = tgt_lbl_t->mutable_data({lbl_num, 1}, place); - Gather(tmp_lbl_data, 1, score_index_t->data(), lbl_num, - lbl_data); - } - - private: - void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max, - const int row, const int col, const float pos_threshold, - const float neg_threshold, int64_t* target_label, - std::vector* fg_inds, std::vector* bg_inds) const { - float epsilon = 0.0001; - for (int64_t i = 0; i < row; ++i) { - const T* v = dist_data + i * col; - T max = *std::max_element(v, v + col); - for (int64_t j = 0; j < col; ++j) { - if (std::abs(max - v[j]) < epsilon) { - target_label[j] = 1; - } - } - } - - // Pick the fg/bg - const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); - for (int64_t j = 0; j < col; ++j) { - if (anchor_to_gt_max_data[j] >= pos_threshold) { - target_label[j] = 1; - } else if (anchor_to_gt_max_data[j] < neg_threshold) { - target_label[j] = 0; - } - if (target_label[j] == 1) { - fg_inds->push_back(j); - } else if (target_label[j] == 0) { - bg_inds->push_back(j); - } + Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + auto* sampled_anchor_data = + sampled_anchor.mutable_data({loc_num, 4}, place); + auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); + Gather(anchor->data(), 4, sampled_loc_index_unmap.data(), + loc_num, sampled_anchor_data); + Gather(ncrowd_gt_boxes.data(), 4, sampled_gt_index.data(), + loc_num, sampled_gt_data); + sampled_tgt_bbox.mutable_data({loc_num, 4}, place); + BoxToDelta(loc_num, sampled_anchor, sampled_gt, nullptr, false, + &sampled_tgt_bbox); + + // Add anchor offset + int anchor_offset = i * anchor_num; + auto sampled_loc_index_unmap_et = + framework::EigenTensor::From(sampled_loc_index_unmap); + sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; + auto sampled_score_index_unmap_et = + framework::EigenTensor::From(sampled_score_index_unmap); + sampled_score_index_unmap_et = + sampled_score_index_unmap_et + anchor_offset; + AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); + AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); + AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); + AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); + total_loc_num += loc_num; + + total_score_num += score_num; + lod0_loc.emplace_back(total_loc_num); + lod0_score.emplace_back(total_score_num); } - } - - void ReservoirSampling(const int num, std::minstd_rand engine, - std::vector* inds) const { - std::uniform_real_distribution uniform(0, 1); - size_t len = inds->size(); - if (len > static_cast(num)) { - for (size_t i = num; i < len; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < num) - std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); - } - inds->resize(num); - } - } - // std::vector> RpnTargetAssign( - std::vector> SampleFgBgGt( - const platform::CPUDeviceContext& ctx, const Tensor& dist, - const float pos_threshold, const float neg_threshold, - const int rpn_batch_size, const int fg_num, std::minstd_rand engine, - int64_t* target_label) 
const {
-    auto* dist_data = dist.data<T>();
-    int row = dist.dims()[0];
-    int col = dist.dims()[1];
-
-    std::vector<int> fg_inds;
-    std::vector<int> bg_inds;
-    std::vector<int> gt_inds;
-
-    // Calculate the max IoU between anchors and gt boxes
-    // Map from anchor to gt box that has highest overlap
-    auto place = ctx.GetPlace();
-    Tensor anchor_to_gt_max, anchor_to_gt_argmax;
-    anchor_to_gt_max.mutable_data<T>({col}, place);
-    int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
-
-    auto x = framework::EigenMatrix<T>::From(dist);
-    auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
-    auto x_col_argmax =
-        framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
-    x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
-    x_col_argmax = x.argmax(0).template cast<int>();
-
-    // Follow the Faster RCNN's implementation
-    ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
-                neg_threshold, target_label, &fg_inds, &bg_inds);
-    // Reservoir Sampling
-    ReservoirSampling(fg_num, engine, &fg_inds);
-    int fg_num2 = static_cast<int>(fg_inds.size());
-    int bg_num = rpn_batch_size - fg_num2;
-    ReservoirSampling(bg_num, engine, &bg_inds);
-
-    gt_inds.reserve(fg_num2);
-    for (int i = 0; i < fg_num2; ++i) {
-      gt_inds.emplace_back(argmax[fg_inds[i]]);
-    }
-    std::vector<std::vector<int>> fg_bg_gt;
-    fg_bg_gt.emplace_back(fg_inds);
-    fg_bg_gt.emplace_back(bg_inds);
-    fg_bg_gt.emplace_back(gt_inds);
-
-    return fg_bg_gt;
+    PADDLE_ENFORCE_LE(total_loc_num, max_num);
+    PADDLE_ENFORCE_LE(total_score_num, max_num);
+
+    lod_loc.emplace_back(lod0_loc);
+    loc_score.emplace_back(lod0_score);
+    loc_index->set_lod(lod_loc);
+    score_index->set_lod(loc_score);
+    tgt_bbox->set_lod(lod_loc);
+    tgt_lbl->set_lod(loc_score);
+    loc_index->Resize({total_loc_num});
+    score_index->Resize({total_score_num});
+    tgt_bbox->Resize({total_loc_num, 4});
+    tgt_lbl->Resize({total_score_num, 1});
   }
 };
 
@@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("Anchor",
              "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
-    AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
-    AddInput(
-        "DistMat",
-        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
-        "[K, M]. It is pair-wise distance matrix between the entities "
-        "represented by each row and each column. For example, assumed one "
-        "entity is A with shape [K], another entity is B with shape [M]. The "
-        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
-        "the distance is, the better macthing the pairs are. Please note, "
-        "This tensor can contain LoD information to represent a batch of "
-        "inputs. One instance of this batch can contain different numbers of "
-        "entities.");
+    AddInput("GtBoxes",
+             "(LoDTensor) input ground-truth bbox with shape [K, 4].");
+    AddInput("IsCrowd",
+             "(LoDTensor) input which indicates ground-truth is crowd.");
+    AddInput("ImInfo",
+             "(LoDTensor) input image information with shape [N, 3]. "
+             "N is the batch size, each image information includes height, "
+             "width and scale.");
+    AddAttr<int>("rpn_batch_size_per_im",
+                 "Total number of RPN examples per image.")
+        .SetDefault(256);
+    AddAttr<float>(
+        "rpn_straddle_thresh",
+        "Remove RPN anchors that go outside the image by straddle_thresh "
+        "pixels. "
+        "Set to -1 or a large value, e.g. 100000, to disable pruning "
+        "anchors.");
     AddAttr<float>(
         "rpn_positive_overlap",
         "Minimum overlap required between an anchor and ground-truth "
@@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
         "box for the (anchor, gt box) pair to be a negative examples.")
         .SetDefault(0.3);
     AddAttr<float>(
-        "fg_fraction",
+        "rpn_fg_fraction",
         "Target fraction of RoI minibatch that "
        "is labeled foreground (i.e. class > 0), 0-th class is background.")
         .SetDefault(0.25);
-    AddAttr<int>("rpn_batch_size_per_im",
-                 "Total number of RPN examples per image.")
-        .SetDefault(256);
-    AddAttr<bool>("fix_seed",
-                  "A flag indicating whether to use a fixed seed to generate "
-                  "random mask. NOTE: DO NOT set this flag to true in "
-                  "training. Setting this flag to true is only useful in "
-                  "unittest.")
-        .SetDefault(false);
-    AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
+    AddAttr<bool>("use_random",
+                  "A flag indicating whether to use ReservoirSampling. "
+                  "NOTE: DO NOT set this flag to false in training. "
+                  "Setting this flag to false is only useful in unittest.")
+        .SetDefault(true);
     AddOutput(
         "LocationIndex",
         "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
@@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
         "ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
         " number.");
     AddOutput("TargetBBox",
-              "(Tensor), The target bbox deltas with shape "
+              "(Tensor), The target bbox deltas with shape "
               "[F, 4], F is the sampled foreground number.");
     AddOutput(
         "TargetLabel",
-        "(Tensor), The target labels of each anchor with shape "
+        "(Tensor), The target labels of each anchor with shape "
         "[F + B, 1], F and B are sampled foreground and backgroud number.");
     AddComment(R"DOC(
-This operator can be, for given the IoU between the ground truth bboxes and the
+This operator can be used, for a given set of ground truth bboxes and the
 anchors, to assign classification and regression targets to each prediction.
-The Score index and LocationIndex will be generated according to the DistMat.
+The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU.
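(As a rough illustration of the rule above, here is a minimal NumPy sketch; it is not the operator's code, `label_anchors` is a hypothetical helper, and `np.random.permutation` stands in for the operator's ReservoirSampling when use_random is set:)

import numpy as np

def label_anchors(iou, pos_overlap=0.7, neg_overlap=0.3,
                  batch_size_per_im=256, fg_fraction=0.25):
    # iou: [num_gt, num_anchor] overlaps between gt boxes and anchors
    labels = -np.ones(iou.shape[1], dtype=np.int32)  # -1: ignored by the loss
    best_per_anchor = iou.max(axis=0)
    labels[best_per_anchor >= pos_overlap] = 1       # foreground
    labels[best_per_anchor < neg_overlap] = 0        # background
    labels[iou.argmax(axis=1)] = 1                   # each gt's best anchor stays fg
    fg = np.where(labels == 1)[0]
    fg = np.random.permutation(fg)[:int(batch_size_per_im * fg_fraction)]
    bg = np.where(labels == 0)[0]
    bg = np.random.permutation(bg)[:batch_size_per_im - len(fg)]
    return fg, bg  # LocationIndex comes from fg; ScoreIndex from fg and bg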
The rest anchors would not contibute to the RPN training loss ScoreIndex is composed of foreground anchor indexes(positive labels) and diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 07ac20797ddab54296a45e99915588a40cc6f3c7..13682b78f0eccf049daa315f3a26aafd22e42a41 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -125,7 +125,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { + framework::AsyncIO([var_name_val, s, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -166,7 +166,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, s, this] { + s, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; @@ -290,12 +290,18 @@ void GRPCClient::Proceed() { c->Finish(false); } - delete c; + bool notify = false; { std::lock_guard lk(sync_mutex_); req_count_--; + notify = (req_count_ <= 0 || !c->status_.ok()); + } + + delete c; + + if (notify) { + sync_cond_.notify_all(); } - sync_cond_.notify_all(); } VLOG(3) << "GRPCClient Proceed end"; } diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index 2fab02e32fe18ee04f86a69bb5bae1cbe7c6762c..d2b0eb6ca6de1984dc7cfc2a662c88d5e56e1e05 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -82,8 +82,10 @@ class ProtoEncodeHelper { : base_(buf), p_(buf), limit_(base_ + max_size) {} ~ProtoEncodeHelper() { +#define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - PADDLE_ENFORCE_LE(p_, limit_); + paddle::platform::throw_on_error(p_ <= limit_); +#undef REPLACE_ENFORCE_GLOG } const char* data() const { return base_; } diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3c3f9d17c871ac1cb4df83db17cf489d5b9e0563..3dbbd75b1e945208395c42ace3235db7891936c5 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -56,7 +56,7 @@ class VarHandle { const std::string& name, const platform::DeviceContext* p_ctx = nullptr, const framework::Scope* p_scope = nullptr) - : ok_(kVarHandleDefaultState) { + : status_(kDefaultState) { ep_ = ep; ctx_ = p_ctx; scope_ = p_scope; @@ -68,18 +68,20 @@ class VarHandle { public: bool Wait() { + int ret = kDefaultState; { std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; }); + wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); + ret = status_; } - VLOG(7) << "VarHandle wait:" << ok_; - return ok_ != 0; + VLOG(7) << "VarHandle wait:" << ret; + return ret != kErrorState; } void Finish(bool ok) { { std::unique_lock lk(sync_mutex_); - ok_ = ok; + status_ = ok ? 
kFinishState : kErrorState;
     }
     VLOG(7) << "VarHandle finish:" << ok;
     wait_cond_.notify_all();
@@ -87,8 +89,8 @@ class VarHandle {
   std::string String() const {
     std::ostringstream s;
-    s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_
-      << "]";
+    s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:["
+      << status_ << "]";
     return s.str();
   }
 
@@ -111,9 +113,13 @@ class VarHandle {
  protected:
   std::mutex sync_mutex_;
   std::condition_variable wait_cond_;
-  int ok_;
-
-  static const int kVarHandleDefaultState = -1;
+  enum VarHandleStatus {
+    kDefaultState = -1,
+    kErrorState = 0,
+    kFinishState = 1,
+  };
+  VarHandleStatus status_;
 
  private:
   DISABLE_COPY_AND_ASSIGN(VarHandle);
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index 55e465e3af08c012b8cff7714452ed32b32a5556..8ca79d20ec4f6412b00dbf3990068f81b65e2efd 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
@@ -269,7 +270,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
   blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
             wh_data, D4, static_cast<T>(1), out, D4)
 
-// gates: W_ch, W_ih, W_fh, W_oh
 #define GET_Ct(ct_1, gates, ct)                   \
   /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
   act_cand(D, gates, gates);                      \
@@ -395,11 +395,22 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         }
       }
     } else {
+      // TODO(TJ): ugly workaround, clean me
+      std::function<void(T*, const T*, T*, T*)> compute_ctht;
+      if (platform::jit::MayIUse(platform::jit::avx) &&
+          act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
+          act_cell_str == "tanh" && D == 8) {
+        compute_ctht = math::lstm_compute_ctht<T>;
+      } else {
+        compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
+          COMPUTE_CtHt(gates, ct_1, ct, ht);
+        };
+      }
       for (int i = 0; i < N; ++i) {
         PROCESS_H0C0
         for (int step = tstart; step < seq_len; ++step) {
           GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
+          compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data);
           MOVE_ONE_STEP;
         }
       }
@@ -532,12 +543,23 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
         MOVE_ONE_STEP;
       }
     } else {
+      // TODO(TJ): ugly workaround, clean me
+      std::function<void(T*, const T*, T*, T*)> compute_ctht;
+      if (platform::jit::MayIUse(platform::jit::avx) &&
+          act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
+          act_cell_str == "tanh" && D == 8) {
+        compute_ctht = math::lstm_compute_ctht<T>;
+      } else {
+        compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
+          COMPUTE_CtHt(gates, ct_1, ct, ht);
+        };
+      }
       for (int step = tstart; step < max_seq_len; ++step) {
         const int cur_bs = batch_starts[step + 1] - batch_starts[step];
         GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
         DEFINE_CUR;
         for (int i = 0; i < cur_bs; ++i) {
-          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
+          compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data,
                        cur_h_out_data);
           MOVE_ONE_BATCH;
         }
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 966d78b84130c172c41e8049bf6bb1dc659d7d48..dc008d16971bc762b401ddece56f9ec56f7a47d6 100644
---
a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -59,17 +59,16 @@ static void ParallelExecuteBlocks( framework::ProgramDesc *program, framework::Scope *scope) { std::vector> fs; for (size_t idx : parallel_blkids) { - fs.push_back( - framework::Async([&executor, &prepared, &program, &scope, idx]() { - int run_block = idx; // thread local - try { - VLOG(3) << "running server block: " << run_block - << "pointer: " << prepared[run_block].get(); - executor->RunPreparedContext(prepared[run_block].get(), scope); - } catch (const std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - })); + fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() { + int run_block = idx; // thread local + try { + VLOG(3) << "running server block: " << run_block + << "pointer: " << prepared[run_block].get(); + executor->RunPreparedContext(prepared[run_block].get(), scope); + } catch (const std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + })); } for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d7f0f3c6280db7d121bf8821ec6d578e22a33da6..91101356436c26171eaca2fe01dfd4d937e71717 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -45,6 +45,8 @@ math_library(im2col) if (NOT WIN32) # windows do not support avx functions yet. math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) +# TODO(TJ): ugly workaround, clean me +cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info) endif (NOT WIN32) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..f7c55c215bacdafc99da5fcd0b750a058dfed21c --- /dev/null +++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// TODO(TJ): ugly workaround, clean me
+template <typename T>
+void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
+  // gates: W_ch, W_ih, W_fh, W_oh
+  vec_sigmoid(24, gates + 8, gates + 8);
+  vec_tanh(8, gates, gates);
+  const T *i = gates + 8, *f = gates + 16, *o = gates + 24;
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  for (int d = 0; d < 8; ++d) {
+    // C_t = C_t-1 * fgated + cand_gated * igated
+    ct[d] = ct_1[d] * f[d] + gates[d] * i[d];
+    // H_t = act_cell(C_t) * ogated
+    T tmp = ct[d] * 2;
+    tmp = static_cast<T>(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp));
+    vec_exp(1, &tmp, &tmp);
+    tmp = static_cast<T>(2) / (static_cast<T>(1) + tmp) - static_cast<T>(1);
+    ht[d] = tmp * o[d];
+  }
+}
+
+#ifdef __AVX__
+namespace detail {
+namespace forward {
+namespace avx {
+__m256 Sigmoid(const __m256 a);
+__m256 Tanh(const __m256 a);
+}  // namespace avx
+}  // namespace forward
+}  // namespace detail
+
+template <>
+void lstm_compute_ctht<float>(float* gates, const float* ct_1, float* ct,
+                              float* ht) {
+  namespace act = detail::forward::avx;
+  // gates: W_ch, W_ih, W_fh, W_oh
+  __m256 c, i, f, o;
+  c = _mm256_loadu_ps(gates);
+  i = _mm256_loadu_ps(gates + 8);
+  f = _mm256_loadu_ps(gates + 16);
+  o = _mm256_loadu_ps(gates + 24);
+
+  /* C_t = C_t-1 * fgated + cand_gated * igated*/
+  c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
+  i = _mm256_loadu_ps(ct_1);
+  f = _mm256_mul_ps(i, act::Sigmoid(f));
+  f = _mm256_add_ps(c, f);
+  _mm256_storeu_ps(ct, f);
+
+  /* H_t = act_cell(C_t) * ogated */
+  o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
+  _mm256_storeu_ps(ht, o);
+}
+#endif
+
+template void lstm_compute_ctht<float>(float* gates, const float* ct_1,
+                                       float* ct, float* ht);
+template void lstm_compute_ctht<double>(double* gates, const double* ct_1,
+                                        double* ct, double* ht);
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..244164f08c4bb70833a9bfc884982a4225945bf0
--- /dev/null
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
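As a reference for the kernel above: the cell update that lstm_compute_ctht performs (and that the AVX path vectorizes for D == 8) is the standard LSTM step. A hedged NumPy sketch of the math only, omitting the SIGMOID_THRESHOLD clamping; `lstm_step` is an illustrative name:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(gates, ct_1, D=8):
    # gates holds [cand, igate, fgate, ogate] blocks of width D,
    # matching the W_ch, W_ih, W_fh, W_oh layout in the kernel
    cand = np.tanh(gates[0:D])
    i = sigmoid(gates[D:2 * D])
    f = sigmoid(gates[2 * D:3 * D])
    o = sigmoid(gates[3 * D:4 * D])
    ct = ct_1 * f + cand * i  # C_t = C_t-1 * fgated + cand_gated * igated
    ht = np.tanh(ct) * o      # H_t = act_cell(C_t) * ogated
    return ct, ht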
*/ + +#pragma once +#include + +namespace paddle { +namespace operators { +namespace math { + +// TODO(TJ): ugly workaround, clean me +template +void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht); + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 9560e3a3c15ca63892fbe3552679a22f027f11e2..6a059968b79189458349e466079cc7a663a8e5ff 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" #ifdef __AVX__ #include #endif @@ -476,7 +477,7 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - LOG(FATAL) << "Not support type: " << type; + PADDLE_THROW("Not support type: %s", type); } }; diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index f25d3d3f1ee1f89d46b8e7c88ca68048f5203544..69318a6598c8c69eceab7216df6382537153d34f 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -103,6 +103,58 @@ class MaxSeqPoolGradFunctor { } }; +template +class LastSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, + framework::Tensor* output) { + // Create pointers to input and output data + auto* in_data = input.data(); + auto* out_data = output->data(); + + // Calculate the size of each item in sequence + int64_t item_size = input.numel() / input.dims()[0]; + auto lod = input.lod()[0]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + // Calculate the length of each sequence + int64_t seq_len = static_cast(lod[i + 1] - lod[i]); + // Point to the begin of next sequence + in_data += seq_len * item_size; + // Copy the last item of sequence to output + std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T)); + out_data += item_size; + } + } +}; + +template +class FirstSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, + framework::Tensor* output) { + // Create pointers to input and output data + auto* in_data = input.data(); + auto* out_data = output->data(); + + // Calculate the size of each item in sequence + int64_t item_size = input.numel() / input.dims()[0]; + auto lod = input.lod()[0]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + // Calculate the length of each sequence + int64_t seq_len = static_cast(lod[i + 1] - lod[i]); + // Copy the first item of sequence to output + std::memcpy(out_data, in_data, item_size * sizeof(T)); + // Point to the next sequence + in_data += seq_len * item_size; + out_data += item_size; + } + } +}; + template class SequencePoolFunctor { public: @@ -116,6 +168,16 @@ class SequencePoolFunctor { max_pool(context, input, output, index); return; } + if (pooltype == "LAST") { + math::LastSeqPoolFunctor last_pool; + last_pool(context, input, output); + return; + } + if (pooltype == "FIRST") { + math::FirstSeqPoolFunctor first_pool; + first_pool(context, input, output); + return; + } auto lod = input.lod()[0]; auto& place = *context.eigen_device(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { @@ -133,10 +195,6 @@ class SequencePoolFunctor { } else if (pooltype == 
"SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); - } else if (pooltype == "LAST") { - out_e.device(place) = in_e.chip(h - 1, 0); - } else if (pooltype == "FIRST") { - out_e.device(place) = in_e.chip(0, 0); } else { PADDLE_THROW("unsupported pooling pooltype"); } diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index 058115cb624627d81b31d0903f7d615d19708c77..b2543d3d0d80f0573f2cbc755318c1b5a0982324 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -71,8 +71,7 @@ class MaxOutOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of MaxoutOp" - "should not be null."); + "Input(X) of MaxoutOpshould not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of MaxoutOp should not be null."); auto in_x_dims = ctx->GetInputDim("X"); @@ -90,9 +89,10 @@ class MaxOutOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MaxOutOpGrad must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Input(X@GRAD) should not be null."); + "Output(Grad@X) of MaxOutOpGrad should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 23d9ea88f6701f9f9e5e02948e996878a849ddd6..e0c4c81bdd5b5d0af3bafe632a2fa033efd08050 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -26,10 +26,13 @@ class PReluOp : public framework::OperatorWithKernel { std::string mode = ctx->Attrs().Get("mode"); auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PreluOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Alpha"), + "Input(Alpha) of PreluOp should not be null"); - PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PreluOp should not be null"); if (mode == "all") { PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1, "For mode 'all', size of weight Alpha must be one."); diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index 23e5fc1112d0b1e634d0ab288721cbba57b3ffe5..13df1d4b4bb6c240610f96ccc8f223fc984d63f7 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -42,7 +42,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase { auto *out_tensor = out_var->GetMutable(); auto &mem_tensor = mem_var->Get(); - out_tensor->ShareDataWith(mem_tensor); + framework::TensorCopySync(mem_tensor, dev_place, out_tensor); out_tensor->set_lod(mem_tensor.lod()); } }; @@ -50,8 +50,10 @@ class RNNMemoryHelperOp : public framework::OperatorBase { class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), ""); - 
PADDLE_ENFORCE(ctx->HasOutput("Out"), ""); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of rnn_memory_helper op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output of rnn_memory_helper op should not be null."); ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -107,7 +109,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { } else { auto &out_grad_tensor = out_grad_var->Get(); auto *in_grad_tensor = in_grad_var->GetMutable(); - in_grad_tensor->ShareDataWith(out_grad_tensor); + framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor); in_grad_tensor->set_lod(out_grad_tensor.lod()); } } @@ -133,8 +135,11 @@ class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { auto x_grad_name = framework::GradVarName("X"); - PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), ""); - PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), + "Gradient of Input(X) in rnn_memory_helper_grad of should " + "not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of rnn_memory_helper_grad of should not be null."); ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ x_grad_name); } diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 4bd23d594134f227e86b01fd75b7e202dd76c11b..e55462d6cfe389033a9c24a464fbf5b5d699f34f 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -25,7 +25,7 @@ class SliceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Input"), "Input (Input) of slice op should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -58,7 +58,7 @@ class SliceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace()); @@ -87,13 +87,13 @@ Slice Operator. Produces a slice of the input tensor along multiple axes. Similar to numpy: https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html -Slice uses `axes`, `starts` and `ends` attributes to specify the start and +Slice uses `axes`, `starts` and `ends` attributes to specify the start and end dimension for each axis in the list of axes, it uses this information -to slice the input data tensor. If a negative value is passed for any of -the start or end indices, it represents number of elements before the end +to slice the input data tensor. If a negative value is passed for any of +the start or end indices, it represents number of elements before the end of that dimension. If the value passed to start or end is larger than -the n (the number of elements in this dimension), it represents n. -For slicing to the end of a dimension with unknown size, it is recommended +the n (the number of elements in this dimension), it represents n. +For slicing to the end of a dimension with unknown size, it is recommended to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1]. 
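The normalization of `starts` and `ends` described above can be summarized in a few lines; this is an illustrative sketch (`normalize_interval` is a hypothetical helper, not part of the operator):

def normalize_interval(start, end, dim_size):
    # negative indices count back from the end of the axis
    if start < 0:
        start += dim_size
    if end < 0:
        end += dim_size
    # out-of-range values are clamped to the axis length,
    # which is why INT_MAX slices to the end of a dimension
    start = min(max(start, 0), dim_size)
    end = min(max(end, 0), dim_size)
    return start, end

With the normalized bounds, the result is simply Input[..., start:end, ...] along each listed axis, which reproduces the examples that follow.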
Following examples will explain how slice works: @@ -119,15 +119,54 @@ Following examples will explain how slice works: } }; +class SliceOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("Input"); + auto x_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +class SliceOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); + bind->SetInput("Input", Input("Input")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + bind->SetAttrMap(Attrs()); + bind->SetType("slice_grad"); + return std::unique_ptr(bind); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker, - paddle::framework::EmptyGradOpMaker); + ops::SliceOpGradMaker); +REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad); REGISTER_OP_CPU_KERNEL( slice, ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, ops::SliceKernel); + +REGISTER_OP_CPU_KERNEL( + slice_grad, ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index 8c1767c70b19d1386af9610ef3405eb487a39878..5efecb78d1a4eaffc3a9c62e1e82a9bcb5922748 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -20,3 +20,10 @@ REGISTER_OP_CUDA_KERNEL( ops::SliceKernel, ops::SliceKernel, ops::SliceKernel); + +REGISTER_OP_CUDA_KERNEL( + slice_grad, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel); diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index ba231aee176564b91a642912ce0b32bcdef8cfc1..f38d08d7640794bd9a456a6c4ee1da2e04e96b37 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -14,6 +14,7 @@ limitations under the License. 
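The SliceGradKernel introduced in slice_op.h just below implements the gradient of slice as zero padding: the upstream gradient is written back at the slice offsets inside a zero tensor shaped like the input. A hedged NumPy equivalent (the name `slice_grad` and its argument layout are assumed for illustration):

import numpy as np

def slice_grad(d_out, in_shape, axes, starts):
    d_in = np.zeros(in_shape, dtype=d_out.dtype)  # untouched positions get 0
    offsets = [0] * len(in_shape)
    for axis, start in zip(axes, starts):
        if start < 0:
            start += in_shape[axis]  # negative start counts from the end
        offsets[axis] = max(start, 0)
    window = tuple(
        slice(off, off + size) for off, size in zip(offsets, d_out.shape))
    d_in[window] = d_out  # same effect as the kernel's Eigen pad()
    return d_in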
*/ #pragma once #include +#include #include #include "paddle/fluid/framework/op_registry.h" @@ -84,5 +85,79 @@ class SliceKernel : public framework::OpKernel { out_t.device(place) = in_t.slice(offsets, extents); } }; + +template +class SliceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + size_t rank = ctx.Input(framework::GradVarName("Out")) + ->dims() + .size(); + switch (rank) { + case 1: + SliceCompute<1>(ctx); + break; + case 2: + SliceCompute<2>(ctx); + break; + case 3: + SliceCompute<3>(ctx); + break; + case 4: + SliceCompute<4>(ctx); + break; + case 5: + SliceCompute<5>(ctx); + break; + case 6: + SliceCompute<6>(ctx); + break; + } + } + + private: + template + void SliceCompute(const framework::ExecutionContext& context) const { + auto& place = + *context.template device_context().eigen_device(); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_input = + context.Output(framework::GradVarName("Input")); + d_input->mutable_data(context.GetPlace()); + auto out_dims = d_out->dims(); + auto in_dims = d_input->dims(); + auto axes = context.Attr>("axes"); + auto starts = context.Attr>("starts"); + + auto offsets = Eigen::array(); + auto extents = Eigen::array(); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = out_dims[i]; + } + int start; + for (size_t i = 0; i < axes.size(); ++i) { + start = starts[i]; + if (start < 0) { + start = (start + in_dims[axes[i]]); + } + start = std::max(start, 0); + offsets[axes[i]] = start; + } + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = offsets[i]; + paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; + } + auto d_in_t = + framework::EigenTensor::From( + *d_input); + auto d_out_t = + framework::EigenTensor::From( + *d_out); + d_in_t.device(place) = d_out_t.pad(paddings, 0); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 20fc08e21dadc12be8903476df374abb5caecf61..8bc30fc123163983f4bddc19af489920db93e0c0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -683,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. 
We can only return Scope* diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 719411bf6677c923397748289b95415c47fa299a..8572dc1e8e543b552e3ed5a180ec942faf90a624 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,6 +1,5 @@ cc_library(stringpiece SRCS piece.cc) cc_library(pretty_log SRCS pretty_log.cc) -cc_test(test_pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ad095b92711dccb44f26748bcfa89a0b4123c6e7..05b06d3677ce53752cc169cb93b89b408a81bde4 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -33,6 +33,7 @@ function print_usage() { ${BLUE}single_test${NONE}: run a single unit test ${BLUE}bind_test${NONE}: parallel tests bind to different GPU ${BLUE}doc${NONE}: generate paddle documents + ${BLUE}gen_doc_lib${NONE}: generate paddle documents library ${BLUE}html${NONE}: convert C++ source code into HTML ${BLUE}dockerfile${NONE}: generate paddle release dockerfile ${BLUE}capi${NONE}: generate paddle CAPI package @@ -67,26 +68,44 @@ function cmake_gen() { # Support build for all python versions, currently # including cp27-cp27m and cp27-cp27mu. PYTHON_FLAGS="" - if [ "$1" != "" ]; then - echo "using python abi: $1" - if [ "$1" == "cp27-cp27m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} - export PATH=/opt/python/cp27-cp27m/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python - -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" - elif [ "$1" == "cp27-cp27mu" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} - export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python - -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" - elif [ "$1" == "cp35-cp35m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} - export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} - export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 + SYSTEM=`uname -s` + if [ "$SYSTEM" == "Darwin" ]; then + if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then + echo "using python abi: $1" + if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 + export PATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib" + else + exit 1 + fi + # TODO: qiyang add python3 part here + fi + else + if [ "$1" != "" ]; then + echo "using python abi: $1" + if [ "$1" == 
"cp27-cp27m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} + export PATH=/opt/python/cp27-cp27m/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python + -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" + elif [ "$1" == "cp27-cp27mu" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} + export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python + -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + elif [ "$1" == "cp35-cp35m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" + fi fi fi @@ -200,6 +219,19 @@ EOF make install -j `nproc` } +function build_mac() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + cat < 0), 0-th class is background. rpn_positive_overlap(float): Minimum overlap required between an anchor and ground-truth box for the (anchor, gt box) pair to be a positive @@ -129,45 +137,48 @@ def rpn_target_assign(loc, Examples: .. code-block:: python - loc = layers.data(name='location', shape=[2, 80], + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], append_batch_size=False, dtype='float32') - scores = layers.data(name='scores', shape=[2, 40], + cls_logits = layers.data(name='cls_logits', shape=[100, 1], append_batch_size=False, dtype='float32') anchor_box = layers.data(name='anchor_box', shape=[20, 4], append_batch_size=False, dtype='float32') - gt_box = layers.data(name='gt_box', shape=[10, 4], + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], append_batch_size=False, dtype='float32') loc_pred, score_pred, loc_target, score_target = - fluid.layers.detection_output(loc=location, - scores=scores, + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, anchor_box=anchor_box, - gt_box=gt_box) + gt_boxes=gt_boxes) """ helper = LayerHelper('rpn_target_assign', **locals()) - # Compute overlaps between the prior boxes and the gt boxes overlaps - iou = iou_similarity(x=gt_box, y=anchor_box) # Assign target label to anchors loc_index = helper.create_tmp_variable(dtype='int32') score_index = helper.create_tmp_variable(dtype='int32') - target_label = helper.create_tmp_variable(dtype='int64') + target_label = helper.create_tmp_variable(dtype='int32') target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) helper.append_op( type="rpn_target_assign", - inputs={'Anchor': anchor_box, - 'GtBox': gt_box, - 'DistMat': iou}, + inputs={ + 'Anchor': anchor_box, + 'GtBoxes': gt_boxes, + 'IsCrowd': is_crowd, + 'ImInfo': im_info + }, outputs={ 'LocationIndex': loc_index, 'ScoreIndex': score_index, 'TargetLabel': target_label, - 'TargetBBox': target_bbox, + 'TargetBBox': target_bbox }, attrs={ 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, 'rpn_positive_overlap': rpn_positive_overlap, 
'rpn_negative_overlap': rpn_negative_overlap, - 'fg_fraction': fg_fraction + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random }) loc_index.stop_gradient = True @@ -175,12 +186,12 @@ def rpn_target_assign(loc, target_label.stop_gradient = True target_bbox.stop_gradient = True - scores = nn.reshape(x=scores, shape=(-1, 1)) - loc = nn.reshape(x=loc, shape=(-1, 4)) - predicted_scores = nn.gather(scores, score_index) - predicted_location = nn.gather(loc, loc_index) + cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1)) + bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) + predicted_cls_logits = nn.gather(cls_logits, score_index) + predicted_bbox_pred = nn.gather(bbox_pred, loc_index) - return predicted_scores, predicted_location, target_label, target_bbox + return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox def detection_output(loc, @@ -1258,15 +1269,17 @@ def anchor_generator(input, def generate_proposal_labels(rpn_rois, gt_classes, + is_crowd, gt_boxes, - im_scales, + im_info, batch_size_per_im=256, fg_fraction=0.25, fg_thresh=0.25, bg_thresh_hi=0.5, bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=None): + class_nums=None, + use_random=True): """ ** Generate proposal labels Faster-RCNN ** TODO(buxingyuan): Add Document @@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois, inputs={ 'RpnRois': rpn_rois, 'GtClasses': gt_classes, + 'IsCrowd': is_crowd, 'GtBoxes': gt_boxes, - 'ImScales': im_scales + 'ImInfo': im_info }, outputs={ 'Rois': rois, @@ -1302,7 +1316,8 @@ def generate_proposal_labels(rpn_rois, 'bg_thresh_hi': bg_thresh_hi, 'bg_thresh_lo': bg_thresh_lo, 'bbox_reg_weights': bbox_reg_weights, - 'class_nums': class_nums + 'class_nums': class_nums, + 'use_random': use_random }) rois.stop_gradient = True diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index bd9f8b3c356ca1e43923d42beebaa5ab98158084..44af29d3390e35129d0ee65b31eacad6b28a9d60 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -142,11 +142,6 @@ class ParallelExecutor(object): main = main if main else framework.default_main_program() if scope == None: scope = executor.global_scope() - # FIXME(Yancey1989): it's a temporary approach to determinate the distribute - # train program, call self.bcast_param() at the end of each mini-batch. - self.is_dist = True if "recv" in [ - op.type for op in main.global_block().ops - ] else False if share_vars_from and not isinstance(share_vars_from, ParallelExecutor): @@ -286,21 +281,11 @@ class ParallelExecutor(object): self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - if self.is_dist: - self._bcast_params() - if return_numpy: return executor.as_numpy(arr) return [arr[i] for i in range(len(arr))] - def _bcast_params(self): - """ - Broadcast the parameters to other devices. It is used during - distributed training. 
- """ - self.executor._bcast_params(set(self.persistable_vars)) - @property def device_count(self): return len(self._act_places) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index e2564763d19d180f7c6933429dddf58c77be7bb8..56129641ce5900d82aedf243d2fa1eadfd6b8d86 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -148,51 +148,60 @@ class TestAnchorGenerator(unittest.TestCase): class TestGenerateProposalLabels(unittest.TestCase): def test_generate_proposal_labels(self): - rpn_rois = layers.data( - name='rpn_rois', - shape=[4, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - gt_classes = layers.data( - name='gt_classes', - shape=[6], - dtype='int32', - lod_level=1, - append_batch_size=False) - gt_boxes = layers.data( - name='gt_boxes', - shape=[6, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - im_scales = layers.data( - name='im_scales', - shape=[1], - dtype='float32', - lod_level=1, - append_batch_size=False) - class_nums = 5 - rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( - rpn_rois=rpn_rois, - gt_classes=gt_classes, - gt_boxes=gt_boxes, - im_scales=im_scales, - batch_size_per_im=2, - fg_fraction=0.5, - fg_thresh=0.5, - bg_thresh_hi=0.5, - bg_thresh_lo=0.0, - bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=class_nums) - assert rois.shape[1] == 4 - assert rois.shape[0] == labels_int32.shape[0] - assert rois.shape[0] == bbox_targets.shape[0] - assert rois.shape[0] == bbox_inside_weights.shape[0] - assert rois.shape[0] == bbox_outside_weights.shape[0] - assert bbox_targets.shape[1] == 4 * class_nums - assert bbox_inside_weights.shape[1] == 4 * class_nums - assert bbox_outside_weights.shape[1] == 4 * class_nums + program = Program() + with program_guard(program): + rpn_rois = layers.data( + name='rpn_rois', + shape=[4, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + is_crowd = layers.data( + name='is_crowd', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_boxes = layers.data( + name='gt_boxes', + shape=[6, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + class_nums = 5 + rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + rpn_rois=rpn_rois, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_boxes=gt_boxes, + im_info=im_info, + batch_size_per_im=2, + fg_fraction=0.5, + fg_thresh=0.5, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=class_nums) + assert rois.shape[1] == 4 + assert rois.shape[0] == labels_int32.shape[0] + assert rois.shape[0] == bbox_targets.shape[0] + assert rois.shape[0] == bbox_inside_weights.shape[0] + assert rois.shape[0] == bbox_outside_weights.shape[0] + assert bbox_targets.shape[1] == 4 * class_nums + assert bbox_inside_weights.shape[1] == 4 * class_nums + assert bbox_outside_weights.shape[1] == 4 * class_nums class TestMultiBoxHead(unittest.TestCase): @@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase): def test_rpn_target_assign(self): program = Program() with program_guard(program): - loc_shape = [10, 50, 4] - 
score_shape = [10, 50, 2] + bbox_pred_shape = [10, 50, 4] + cls_logits_shape = [10, 50, 2] anchor_shape = [50, 4] - loc = layers.data( - name='loc', - shape=loc_shape, + bbox_pred = layers.data( + name='bbox_pred', + shape=bbox_pred_shape, append_batch_size=False, dtype='float32') - scores = layers.data( - name='scores', - shape=score_shape, + cls_logits = layers.data( + name='cls_logits', + shape=cls_logits_shape, append_batch_size=False, dtype='float32') anchor_box = layers.data( @@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase): shape=anchor_shape, append_batch_size=False, dtype='float32') - gt_box = layers.data( - name='gt_box', shape=[4], lod_level=1, dtype='float32') - + gt_boxes = layers.data( + name='gt_boxes', shape=[4], lod_level=1, dtype='float32') + is_crowd = layers.data( + name='is_crowd', + shape=[10], + dtype='int32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign( - loc=loc, - scores=scores, + bbox_pred=bbox_pred, + cls_logits=cls_logits, anchor_box=anchor_box, anchor_var=anchor_var, - gt_box=gt_box, + gt_boxes=gt_boxes, + is_crowd=is_crowd, + im_info=im_info, rpn_batch_size_per_im=256, - fg_fraction=0.25, + rpn_straddle_thresh=0.0, + rpn_fg_fraction=0.5, rpn_positive_overlap=0.7, rpn_negative_overlap=0.3) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 9d7c528dbd3d1aa498dd48d83e7eae380098816f..35c4e996c5e4a4aab2e116599beb1acf35dcd9ff 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,9 +28,18 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test + +message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext") if(APPLE) # this op is not support on mac list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) + # TODO: add the unitest back when it fixed + list(REMOVE_ITEM TEST_OPS test_detection_map_op) + list(REMOVE_ITEM TEST_OPS test_desc_clone) + list(REMOVE_ITEM TEST_OPS test_debugger) + list(REMOVE_ITEM TEST_OPS test_program_code) + list(REMOVE_ITEM TEST_OPS test_dist_transformer) + list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) endif() function(py_test_modules TARGET_NAME) @@ -50,6 +59,7 @@ function(py_test_modules TARGET_NAME) endfunction() list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) +list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) @@ -65,11 +75,12 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) + py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) 
endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) -py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) -py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index e3db316698398ff693157d583ad1410d10dcf81d..3ec79f8ef6e6f70f1365eaa32352c284d294a1ea 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -92,7 +92,7 @@ class TrainTaskConfig(object): src_vocab_fpath = data_path + "vocab.bpe.32000" trg_vocab_fpath = data_path + "vocab.bpe.32000" train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de" - val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de" + val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut" pool_size = 2000 sort_type = None local = True @@ -624,11 +624,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, init = True # Validate and save the model for inference. - if TrainTaskConfig.val_file_pattern is not None: - val_avg_cost, val_ppl = test() - print("[%f]" % val_avg_cost) - else: - assert (False) + if batch_id == 0 or batch_id == 4: + if TrainTaskConfig.val_file_pattern is not None: + val_avg_cost, val_ppl = test() + print("[%f]" % val_avg_cost) + else: + assert (False) #import transformer_reader as reader @@ -1701,8 +1702,9 @@ class DistTransformer2x2(TestDistRunnerBase): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, place, args): - + def run_trainer(self, use_cuda, args): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + TrainTaskConfig.use_gpu = use_cuda sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( args.is_dist, not args.sync_mode) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index c0f5da5a1ae43847dff6348ea5f3e3bfd5e89ab9..37cad73019c529f64868b6ad3c6e2fffe59cc0d8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -61,9 +61,10 @@ class TestDistRunnerBase(object): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, place, args): + def run_trainer(self, use_cuda, args): import paddle import paddle.fluid as fluid + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=2) if args.mem_opt: @@ -91,7 +92,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce exe = fluid.ParallelExecutor( - True, + use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, build_strategy=build_stra) @@ -142,9 +143,8 @@ def runtime_main(test_class): if args.role == "pserver" and args.is_dist: model.run_pserver(args) else: - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - model.run_trainer(p, args) + use_cuda = True if 
core.is_compiled_with_cuda() else False + model.run_trainer(use_cuda, args) import paddle.compat as cpt @@ -225,11 +225,12 @@ class TestDistBase(unittest.TestCase): def check_with_place(self, model_file, delta=1e-3, check_error_log=False): # TODO(typhoonzero): should auto adapt GPU count on the machine. required_envs = { - "PATH": os.getenv("PATH"), - "PYTHONPATH": os.getenv("PYTHONPATH"), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_fraction_of_gpu_memory_to_use": "0.15", - "FLAGS_cudnn_deterministic": "1" + "FLAGS_cudnn_deterministic": "1", + "CPU_NUM": "1" } if check_error_log: diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index a8e6ce4cfe18384e405f1602429628914d2c2e00..e55f8707a9a8ac2b0d69c65b15e6593025511999 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle from test_dist_base import TestDistBase @@ -44,6 +45,14 @@ def download_files(): test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de' test_md5 = '9dd74a266dbdb25314183899f269b4a2' paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5) + # cut test data for faster CI + orig_path = os.path.join(paddle.dataset.common.DATA_HOME, + "test_dist_transformer", + "newstest2013.tok.bpe.32000.en-de") + head_path = os.path.join(paddle.dataset.common.DATA_HOME, + "test_dist_transformer", + "newstest2013.tok.bpe.32000.en-de.cut") + os.system("head -n10 %s > %s" % (orig_path, head_path)) class TestDistTransformer2x2Sync(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index b85501ef6b80d1f5004aa0dd08c3123d3bda48a5..a198b25520f97ce23b9c1ebb9cd82fc458222d73 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase): t = self._transpiler_instance(config) - trainer_main = t.get_trainer_program() + trainer_main = t.get_trainer_program(wait_port=False) trainer_startup = fluid.default_startup_program() assert (src.num_blocks == 1) diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py similarity index 77% rename from python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py rename to python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 6dc101b6dad8813893c6a891da0e16f952bb4c2d..2d5cd3b24bff52d82353ccf3fd2ecb69166c66c6 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -20,10 +20,10 @@ import paddle.fluid as fluid from op_test import OpTest -def generate_proposal_labels_in_python( - rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, - class_nums): +def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes, + im_info, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, + bbox_reg_weights, class_nums): rois = [] 
labels_int32 = [] bbox_targets = [] @@ -31,13 +31,13 @@ def generate_proposal_labels_in_python( bbox_outside_weights = [] lod = [] assert len(rpn_rois) == len( - im_scales), 'batch size of rpn_rois and ground_truth is not matched' + im_info), 'batch size of rpn_rois and ground_truth is not matched' - for im_i in range(len(im_scales)): + for im_i in range(len(im_info)): frcn_blobs = _sample_rois( - rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i], - batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, - bg_thresh_lo, bbox_reg_weights, class_nums) + rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i], + im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh, + bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums) lod.append(frcn_blobs['rois'].shape[0]) @@ -50,13 +50,14 @@ def generate_proposal_labels_in_python( return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod -def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, - bbox_reg_weights, class_nums): +def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, + batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, + bg_thresh_lo, bbox_reg_weights, class_nums): rois_per_image = int(batch_size_per_im) fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) # Roidb + im_scale = im_info[2] inv_im_scale = 1. / im_scale rpn_rois = rpn_rois * inv_im_scale @@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ overlapped_boxes_ind] + crowd_ind = np.where(is_crowd)[0] + gt_overlaps[crowd_ind] = -1 + max_overlaps = gt_overlaps.max(axis=1) max_classes = gt_overlaps.argmax(axis=1) @@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, fg_inds = np.where(max_overlaps >= fg_thresh)[0] fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) # Sample foreground if there are too many - if fg_inds.shape[0] > fg_rois_per_this_image: - fg_inds = np.random.choice( - fg_inds, size=fg_rois_per_this_image, replace=False) + # if fg_inds.shape[0] > fg_rois_per_this_image: + # fg_inds = np.random.choice( + # fg_inds, size=fg_rois_per_this_image, replace=False) + fg_inds = fg_inds[:fg_rois_per_this_image] # Background bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= @@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.shape[0]) # Sample background if there are too many - if bg_inds.shape[0] > bg_rois_per_this_image: - bg_inds = np.random.choice( - bg_inds, size=bg_rois_per_this_image, replace=False) + # if bg_inds.shape[0] > bg_rois_per_this_image: + # bg_inds = np.random.choice( + # bg_inds, size=bg_rois_per_this_image, replace=False) + bg_inds = bg_inds[:bg_rois_per_this_image] keep_inds = np.append(fg_inds, bg_inds) sampled_labels = max_classes[keep_inds] @@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest): self.inputs = { 'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod), 'GtClasses': (self.gt_classes[0], self.gts_lod), + 'IsCrowd': (self.is_crowd[0], self.gts_lod), 'GtBoxes': (self.gt_boxes[0], self.gts_lod), - 'ImScales': self.im_scales[0] + 'ImInfo': self.im_info } self.attrs = { 'batch_size_per_im': self.batch_size_per_im, @@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest): 'bg_thresh_hi': 
@@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest):
         self.inputs = {
             'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod),
             'GtClasses': (self.gt_classes[0], self.gts_lod),
+            'IsCrowd': (self.is_crowd[0], self.gts_lod),
             'GtBoxes': (self.gt_boxes[0], self.gts_lod),
-            'ImScales': self.im_scales[0]
+            'ImInfo': self.im_info
         }
         self.attrs = {
             'batch_size_per_im': self.batch_size_per_im,
@@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest):
             'bg_thresh_hi': self.bg_thresh_hi,
             'bg_thresh_lo': self.bg_thresh_lo,
             'bbox_reg_weights': self.bbox_reg_weights,
-            'class_nums': self.class_nums
+            'class_nums': self.class_nums,
+            'use_random': False
         }
         self.outputs = {
-            'Rois': (self.rois[0], [self.lod]),
-            'LabelsInt32': (self.labels_int32[0], [self.lod]),
-            'BboxTargets': (self.bbox_targets[0], [self.lod]),
-            'BboxInsideWeights': (self.bbox_inside_weights[0], [self.lod]),
-            'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]),
+            'Rois': (self.rois, [self.lod]),
+            'LabelsInt32': (self.labels_int32, [self.lod]),
+            'BboxTargets': (self.bbox_targets, [self.lod]),
+            'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]),
+            'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]),
         }

     def test_check_output(self):
@@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest):
         self.set_data()

     def init_test_params(self):
-        self.batch_size_per_im = 10
-        self.fg_fraction = 1.0
+        self.batch_size_per_im = 512
+        self.fg_fraction = 0.25
         self.fg_thresh = 0.5
         self.bg_thresh_hi = 0.5
         self.bg_thresh_lo = 0.0
@@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest):

     def init_test_input(self):
         np.random.seed(0)
-        image_nums = 1
         gt_nums = 6  # Keep the same as batch_size_per_im for the unittest
-        proposal_nums = self.batch_size_per_im - gt_nums
-        images_shape = []
-        self.im_scales = []
-        for i in range(image_nums):
-            images_shape.append(np.random.randint(200, size=2))
-            self.im_scales.append(np.ones((1)).astype(np.float32))
+        proposal_nums = 2000
+        images_shape = [[64, 64]]
+        self.im_info = np.ones((len(images_shape), 3)).astype(np.float32)
+        for i in range(len(images_shape)):
+            self.im_info[i, 0] = images_shape[i][0]
+            self.im_info[i, 1] = images_shape[i][1]
+            self.im_info[i, 2] = 0.8  # im_scale

         self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape,
                                                                proposal_nums)
@@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest):
             images_shape, self.class_nums, gt_nums)
         self.gt_classes = [gt['gt_classes'] for gt in ground_truth]
         self.gt_boxes = [gt['boxes'] for gt in ground_truth]
+        self.is_crowd = [gt['is_crowd'] for gt in ground_truth]

     def init_test_output(self):
         self.rois, self.labels_int32, self.bbox_targets, \
         self.bbox_inside_weights, self.bbox_outside_weights, \
         self.lod = generate_proposal_labels_in_python(
-            self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales,
+            self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes,
+            self.im_info,
             self.batch_size_per_im, self.fg_fraction,
             self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo,
             self.bbox_reg_weights, self.class_nums
         )
+        self.rois = np.vstack(self.rois)
+        self.labels_int32 = np.hstack(self.labels_int32)
+        self.labels_int32 = self.labels_int32[:, np.newaxis]
+        self.bbox_targets = np.vstack(self.bbox_targets)
+        self.bbox_inside_weights = np.vstack(self.bbox_inside_weights)
+        self.bbox_outside_weights = np.vstack(self.bbox_outside_weights)


 def _generate_proposals(images_shape, proposal_nums):
@@ -280,7 +295,7 @@ def _generate_proposals(images_shape, proposal_nums):
     for i, image_shape in enumerate(images_shape):
         proposals = _generate_boxes(image_shape, proposal_nums)
         rpn_rois.append(proposals)
-        num_proposals += len(proposals)
+        num_proposals = len(proposals)
         rpn_rois_lod.append(num_proposals)
     return rpn_rois, [rpn_rois_lod]
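The switch from `num_proposals += len(proposals)` to `num_proposals = len(proposals)` changes `rpn_rois_lod` from cumulative offsets to per-image lengths, matching the length-based LoD tuples fed to `self.inputs` above. A toy illustration under that assumption (shapes invented for the example):

    .. code-block:: python

        import numpy as np

        rois_img0 = np.zeros((3, 4), dtype=np.float32)
        rois_img1 = np.zeros((5, 4), dtype=np.float32)

        rpn_rois = np.vstack([rois_img0, rois_img1])
        # Length-based LoD: one count per image, not offsets like [0, 3, 8].
        rpn_rois_lod = [[3, 5]]
        assert sum(rpn_rois_lod[0]) == rpn_rois.shape[0]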
@@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums):
         gt_classes = np.random.randint(
             low=1, high=class_nums, size=gt_nums).astype(np.int32)
         gt_boxes = _generate_boxes(image_shape, gt_nums)
-        ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes))
+        is_crowd = np.zeros((gt_nums), dtype=np.int32)
+        is_crowd[0] = 1
+        ground_truth.append(
+            dict(
+                gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd))
         num_gts += len(gt_classes)
         gts_lod.append(num_gts)
     return ground_truth, [gts_lod]
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
similarity index 88%
rename from python/paddle/fluid/tests/unittests/test_generate_proposals.py
rename to python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
index 3fbd2ce95a4f22b91cd4955f914e12f422b0ee83..86e27fe29ed945ec77fbbcdbd1c7cc6ecfba0fd5 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances):

     # anchor_loc: width, height, center_x, center_y
     anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
-    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0]
-    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1]
-    anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2
-    anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2
+    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1
+    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1
+    anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
+    anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]

     # predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height
     pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
@@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances):
                 i, 0] + anchor_loc[i, 2]
             pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[
                 i, 1] + anchor_loc[i, 3]
-            pred_bbox[i, 2] = math.exp(variances[i, 2] *
-                                       bbox_deltas[i, 2]) * anchor_loc[i, 0]
-            pred_bbox[i, 3] = math.exp(variances[i, 3] *
-                                       bbox_deltas[i, 3]) * anchor_loc[i, 1]
+            pred_bbox[i, 2] = math.exp(
+                min(variances[i, 2] * bbox_deltas[i, 2],
+                    math.log(1000 / 16.0))) * anchor_loc[i, 0]
+            pred_bbox[i, 3] = math.exp(
+                min(variances[i, 3] * bbox_deltas[i, 3],
+                    math.log(1000 / 16.0))) * anchor_loc[i, 1]
     else:
         for i in range(bbox_deltas.shape[0]):
             pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
                 i, 2]
             pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
                 i, 3]
-            pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0]
-            pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1]
+            pred_bbox[i, 2] = math.exp(
+                min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i, 0]
+            pred_bbox[i, 3] = math.exp(
+                min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i, 1]

     proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
     proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
-    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2
-    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2
+    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1
+    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1

     return proposals
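The `min(..., math.log(1000 / 16.0))` guards cap the width/height deltas before `math.exp`, the same clip that Detectron-style box decoders use, so a wild regression output cannot blow up into an enormous box. Condensed to one dimension (inputs invented for the example):

    .. code-block:: python

        import math

        BBOX_XFORM_CLIP = math.log(1000.0 / 16.0)

        def decode_w(anchor_w, delta_w):
            # Clamp the delta so exp() stays bounded for extreme predictions.
            return math.exp(min(delta_w, BBOX_XFORM_CLIP)) * anchor_w

        print(decode_w(16.0, 2.0))   # ~118.2
        print(decode_w(16.0, 50.0))  # clamped: 1000.0 rather than exp(50) * 16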
""" # Scale min_size to match image scale - min_size *= im_info[2] + im_scale = im_info[2] + min_size = max(min_size, 1.0) ws = boxes[:, 2] - boxes[:, 0] + 1 hs = boxes[:, 3] - boxes[:, 1] + 1 + ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1 + hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1 x_ctr = boxes[:, 0] + ws / 2. y_ctr = boxes[:, 1] + hs / 2. - keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) & - (y_ctr < im_info[0]))[0] + keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) & + (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0] return keep @@ -204,7 +213,7 @@ def iou(box_a, box_b): xb = min(xmax_a, xmax_b) yb = min(ymax_a, ymax_b) - inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0) iou_ratio = inter_area / (area_a + area_b - inter_area) diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py index bd548009b3ada9512e4b5f7d7b61b67b0717a39b..f63dbcd3d7f6bfce3ccc1c42ae41afe42bfad003 100644 --- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py @@ -19,48 +19,58 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest from test_anchor_generator_op import anchor_generator_in_python -from test_generate_proposal_labels import _generate_groundtruth -from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta - - -def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im, - rpn_positive_overlap, rpn_negative_overlap, fg_fraction): - iou = np.transpose(gt_anchor_iou) - anchor_to_gt_max = iou.max(axis=1) - anchor_to_gt_argmax = iou.argmax(axis=1) - - gt_to_anchor_argmax = iou.argmax(axis=0) - gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])] - anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0] - - tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1 - tgt_lbl[anchors_with_max_overlap] = 1 - tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1 - - num_fg = int(fg_fraction * rpn_batch_size_per_im) - fg_inds = np.where(tgt_lbl == 1)[0] - if len(fg_inds) > num_fg: +from test_generate_proposal_labels_op import _generate_groundtruth +from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta + + +def rpn_target_assign(anchor_by_gt_overlap, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) + anchor_to_gt_max = anchor_by_gt_overlap[np.arange( + anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax] + + gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) + gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange( + anchor_by_gt_overlap.shape[1])] + anchors_with_max_overlap = np.where( + anchor_by_gt_overlap == gt_to_anchor_max)[0] + + labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1 + labels[anchors_with_max_overlap] = 1 + labels[anchor_to_gt_max >= rpn_positive_overlap] = 1 + + num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg and use_random: disable_inds = np.random.choice( fg_inds, size=(len(fg_inds) - num_fg), replace=False) - tgt_lbl[disable_inds] = -1 - fg_inds = np.where(tgt_lbl == 1)[0] + else: + disable_inds = fg_inds[num_fg:] + labels[disable_inds] = -1 + fg_inds = 
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index bd548009b3ada9512e4b5f7d7b61b67b0717a39b..f63dbcd3d7f6bfce3ccc1c42ae41afe42bfad003 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -19,48 +19,58 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 from test_anchor_generator_op import anchor_generator_in_python
-from test_generate_proposal_labels import _generate_groundtruth
-from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
-
-
-def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
-                      rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
-    iou = np.transpose(gt_anchor_iou)
-    anchor_to_gt_max = iou.max(axis=1)
-    anchor_to_gt_argmax = iou.argmax(axis=1)
-
-    gt_to_anchor_argmax = iou.argmax(axis=0)
-    gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
-    anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
-
-    tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1
-    tgt_lbl[anchors_with_max_overlap] = 1
-    tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1
-
-    num_fg = int(fg_fraction * rpn_batch_size_per_im)
-    fg_inds = np.where(tgt_lbl == 1)[0]
-    if len(fg_inds) > num_fg:
+from test_generate_proposal_labels_op import _generate_groundtruth
+from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta
+
+
+def rpn_target_assign(anchor_by_gt_overlap,
+                      rpn_batch_size_per_im,
+                      rpn_positive_overlap,
+                      rpn_negative_overlap,
+                      rpn_fg_fraction,
+                      use_random=True):
+    anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
+    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
+        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
+
+    gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
+    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
+        anchor_by_gt_overlap.shape[1])]
+    anchors_with_max_overlap = np.where(
+        anchor_by_gt_overlap == gt_to_anchor_max)[0]
+
+    labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
+    labels[anchors_with_max_overlap] = 1
+    labels[anchor_to_gt_max >= rpn_positive_overlap] = 1
+
+    num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
+    fg_inds = np.where(labels == 1)[0]
+    if len(fg_inds) > num_fg and use_random:
         disable_inds = np.random.choice(
             fg_inds, size=(len(fg_inds) - num_fg), replace=False)
-    tgt_lbl[disable_inds] = -1
-    fg_inds = np.where(tgt_lbl == 1)[0]
+    else:
+        disable_inds = fg_inds[num_fg:]
+    labels[disable_inds] = -1
+    fg_inds = np.where(labels == 1)[0]

-    num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
+    num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
     bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
-    tgt_lbl[bg_inds] = 0
-    if len(bg_inds) > num_bg:
+    if len(bg_inds) > num_bg and use_random:
         enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
-        tgt_lbl[enable_inds] = 0
-        bg_inds = np.where(tgt_lbl == 0)[0]
-        tgt_lbl[bg_inds] = 0
+    else:
+        enable_inds = bg_inds[:num_bg]
+    labels[enable_inds] = 0
+    fg_inds = np.where(labels == 1)[0]
+    bg_inds = np.where(labels == 0)[0]

     loc_index = fg_inds
     score_index = np.hstack((fg_inds, bg_inds))
-    tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
+    labels = labels[score_index]
+    assert not np.any(labels == -1), "sampled labels must not contain -1"

     gt_inds = anchor_to_gt_argmax[fg_inds]

-    return loc_index, score_index, tgt_lbl, gt_inds
+    return loc_index, score_index, labels, gt_inds


 def get_anchor(n, c, h, w):
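With `use_random=False` the reference above is fully deterministic, which makes the labeling rule easy to check on a toy overlap matrix: the best anchor for each gt plus any anchor with IoU >= `rpn_positive_overlap` becomes foreground, and anchors below `rpn_negative_overlap` fill the background quota. Assuming `rpn_target_assign` as defined above:

    .. code-block:: python

        import numpy as np

        # 4 anchors (rows) x 2 gt boxes (columns).
        overlaps = np.array([[0.90, 0.10],
                             [0.20, 0.80],
                             [0.10, 0.10],
                             [0.05, 0.00]], dtype=np.float32)

        loc_idx, score_idx, labels, gt_inds = rpn_target_assign(
            overlaps, rpn_batch_size_per_im=4, rpn_positive_overlap=0.7,
            rpn_negative_overlap=0.3, rpn_fg_fraction=0.5, use_random=False)
        print(loc_idx)  # [0 1]      anchors matched to gt 0 and gt 1
        print(labels)   # [1 1 0 0]  two foreground, two background samples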
@@ -75,85 +85,129 @@
     return anchors


-def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
-             rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
-
-    loc_indexes = []
-    score_indexes = []
-    tmp_tgt_labels = []
-    tgt_bboxes = []
-    anchor_num = anchor.shape[0]
-
+def rpn_target_assign_in_python(all_anchors,
+                                gt_boxes,
+                                is_crowd,
+                                im_info,
+                                lod,
+                                rpn_straddle_thresh,
+                                rpn_batch_size_per_im,
+                                rpn_positive_overlap,
+                                rpn_negative_overlap,
+                                rpn_fg_fraction,
+                                use_random=True):
+    anchor_num = all_anchors.shape[0]
     batch_size = len(lod) - 1
     for i in range(batch_size):
+        im_height = im_info[i][0]
+        im_width = im_info[i][1]
+        im_scale = im_info[i][2]
+        if rpn_straddle_thresh >= 0:
+            # Only keep anchors inside the image by a margin of straddle_thresh
+            inds_inside = np.where(
+                (all_anchors[:, 0] >= -rpn_straddle_thresh) &
+                (all_anchors[:, 1] >= -rpn_straddle_thresh) &
+                (all_anchors[:, 2] < im_width + rpn_straddle_thresh) &
+                (all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
+            # keep only inside anchors
+            inside_anchors = all_anchors[inds_inside, :]
+        else:
+            inds_inside = np.arange(all_anchors.shape[0])
+            inside_anchors = all_anchors
+
         b, e = lod[i], lod[i + 1]
-        iou_slice = iou[b:e, :]
-        bboxes_slice = gt_boxes[b:e, :]
+        gt_boxes_slice = gt_boxes[b:e, :] * im_scale
+        is_crowd_slice = is_crowd[b:e]

-        loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
-            iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
-            rpn_negative_overlap, fg_fraction)
+        not_crowd_inds = np.where(is_crowd_slice == 0)[0]
+        gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
+        iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)

-        fg_bboxes = bboxes_slice[gt_inds]
-        fg_anchors = anchor[loc_idx]
-        box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
+        loc_inds, score_inds, labels, gt_inds = rpn_target_assign(
+            iou, rpn_batch_size_per_im, rpn_positive_overlap,
+            rpn_negative_overlap, rpn_fg_fraction, use_random)
+        # unmap sampled indices back to the full anchor array
+        loc_inds = inds_inside[loc_inds]
+        score_inds = inds_inside[score_inds]
+
+        sampled_gt = gt_boxes_slice[gt_inds]
+        sampled_anchor = all_anchors[loc_inds]
+        box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])

         if i == 0:
-            loc_indexes = loc_idx
-            score_indexes = score_idx
-            tmp_tgt_labels = tgt_lbl
+            loc_indexes = loc_inds
+            score_indexes = score_inds
+            tgt_labels = labels
             tgt_bboxes = box_deltas
         else:
             loc_indexes = np.concatenate(
-                [loc_indexes, loc_idx + i * anchor_num])
+                [loc_indexes, loc_inds + i * anchor_num])
             score_indexes = np.concatenate(
-                [score_indexes, score_idx + i * anchor_num])
-            tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
+                [score_indexes, score_inds + i * anchor_num])
+            tgt_labels = np.concatenate([tgt_labels, labels])
             tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
-    tgt_labels = tmp_tgt_labels[score_indexes]

     return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
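`rpn_straddle_thresh` drops anchors that stick out of the image by more than the given margin; sampling then runs on the inside subset, and `inds_inside` maps the sampled indices back to the full anchor array. A standalone sketch of that filter (anchors invented for the example):

    .. code-block:: python

        import numpy as np

        def inside_anchor_inds(all_anchors, im_height, im_width,
                               straddle_thresh=0.0):
            # Anchors may poke out of the image by at most straddle_thresh px.
            return np.where(
                (all_anchors[:, 0] >= -straddle_thresh) &
                (all_anchors[:, 1] >= -straddle_thresh) &
                (all_anchors[:, 2] < im_width + straddle_thresh) &
                (all_anchors[:, 3] < im_height + straddle_thresh))[0]

        anchors = np.array([[-8, -8, 8, 8], [4, 4, 20, 20]], dtype=np.float32)
        print(inside_anchor_inds(anchors, 64, 64))  # [1]; anchor 0 straddles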
class TestRpnTargetAssignOp(OpTest):
     def setUp(self):
         n, c, h, w = 2, 4, 14, 14
-        anchor = get_anchor(n, c, h, w)
+        all_anchors = get_anchor(n, c, h, w)
         gt_num = 10
-        anchor = anchor.reshape(-1, 4)
-        anchor_num = anchor.shape[0]
-
-        im_shapes = [[64, 64], [64, 64]]
-        gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
-        bbox = np.vstack([v['boxes'] for v in gt_box])
-
-        iou = _bbox_overlaps(bbox, anchor)
-
-        anchor = anchor.astype('float32')
-        bbox = bbox.astype('float32')
-        iou = iou.astype('float32')
-
-        loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
-            anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
+        all_anchors = all_anchors.reshape(-1, 4)
+        anchor_num = all_anchors.shape[0]
+
+        images_shape = [[64, 64], [64, 64]]
+        groundtruth, lod = _generate_groundtruth(images_shape, 3, 4)
+        lod = [0, 4, 8]  # flat per-image offsets into gt_boxes
+
+        im_info = np.ones((len(images_shape), 3)).astype(np.float32)
+        for i in range(len(images_shape)):
+            im_info[i, 0] = images_shape[i][0]
+            im_info[i, 1] = images_shape[i][1]
+            im_info[i, 2] = 0.8  # im_scale
+        gt_boxes = np.vstack([v['boxes'] for v in groundtruth])
+        is_crowd = np.hstack([v['is_crowd'] for v in groundtruth])
+
+        all_anchors = all_anchors.astype('float32')
+        gt_boxes = gt_boxes.astype('float32')
+
+        rpn_straddle_thresh = 0.0
+        rpn_batch_size_per_im = 256
+        rpn_positive_overlap = 0.7
+        rpn_negative_overlap = 0.3
+        rpn_fg_fraction = 0.5
+        use_random = False
+
+        loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python(
+            all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh,
+            rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap,
+            rpn_fg_fraction, use_random)
+        labels = labels[:, np.newaxis]

         self.op_type = "rpn_target_assign"
         self.inputs = {
-            'Anchor': anchor,
-            'GtBox': (bbox, [[4, 4]]),
-            'DistMat': (iou, [[4, 4]]),
+            'Anchor': all_anchors,
+            'GtBoxes': (gt_boxes, [[4, 4]]),
+            'IsCrowd': (is_crowd, [[4, 4]]),
+            'ImInfo': (im_info, [[1, 1]])
         }
         self.attrs = {
-            'rpn_batch_size_per_im': 25600,
-            'rpn_positive_overlap': 0.95,
-            'rpn_negative_overlap': 0.03,
-            'fg_fraction': 0.25,
-            'fix_seed': True
+            'rpn_batch_size_per_im': rpn_batch_size_per_im,
+            'rpn_straddle_thresh': rpn_straddle_thresh,
+            'rpn_positive_overlap': rpn_positive_overlap,
+            'rpn_negative_overlap': rpn_negative_overlap,
+            'rpn_fg_fraction': rpn_fg_fraction,
+            'use_random': use_random
         }
         self.outputs = {
             'LocationIndex': loc_index.astype('int32'),
             'ScoreIndex': score_index.astype('int32'),
             'TargetBBox': tgt_bbox.astype('float32'),
-            'TargetLabel': tgt_lbl.astype('int64'),
+            'TargetLabel': labels.astype('int32')
         }

     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index 134df38eea6655857db04dfdc19dd7f7897946f4..4e6ed3a74b344da068bbfb60707838a1b4fc40fd 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -41,6 +41,9 @@ class TestSliceOp(OpTest):
     def test_check_output(self):
         self.check_output()

+    def test_check_grad_normal(self):
+        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
+

 class TestCase1(TestSliceOp):
     def config(self):
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
index 5e98266a761c7e01bd6668e85e6adeb54103ca80..f33c05ed2f48c2498b98fc486d6ff7471088d77e 100644
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -16,3 +16,4 @@ from __future__ import print_function

 from .program_utils import *
 from .ufind import *
+from .checkport import *
diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bad4b427a2d53bd14c7a1f870ce74a883158d04
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/checkport.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import time
+import socket
+from contextlib import closing
+
+
+def wait_server_ready(endpoints):
+    """
+    Wait until parameter servers are ready, using connect_ex to detect
+    whether each port is ready to accept connections.
+
+    Args:
+        endpoints (list): endpoint string list, e.g.:
+            ["127.0.0.1:8080", "127.0.0.1:8081"]
+
+    Examples:
+        .. code-block:: python
+
+            wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
+    """
+    while True:
+        all_ok = True
+        for ep in endpoints:
+            ip_port = ep.split(":")
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as sock:
+                sock.settimeout(2)
+                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
+                if result != 0:
+                    all_ok = False
+        if not all_ok:
+            sys.stderr.write("pserver not ready, wait 3 seconds to retry...\n")
+            sys.stderr.flush()
+            time.sleep(3)
+        else:
+            break
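The trainer-side effect of this new helper: `get_trainer_program(wait_port=True)` calls `wait_server_ready` on `self.pserver_endpoints`, so a trainer blocks until every pserver port accepts a TCP connection instead of failing on its first send. A sketch of the intended trainer flow (the endpoint strings are placeholders):

    .. code-block:: python

        import paddle.fluid as fluid

        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id=0,
            pservers="127.0.0.1:6174,127.0.0.1:6175",
            trainers=2)
        # Blocks until both pserver ports are connectable; pass
        # wait_port=False to restore the old non-blocking behavior.
        trainer_prog = t.get_trainer_program(wait_port=True)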
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index d4d218d547a394a56c040ade2a9ba703b691b86b..53c9cbe23dd82af866658fe46d1d631b0a3b26f3 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -381,7 +381,7 @@ class DistributeTranspiler(object):
                                                   pserver_endpoints)
         self._split_table_grad_and_add_send_vars(program, pserver_endpoints)

-    def get_trainer_program(self):
+    def get_trainer_program(self, wait_port=True):
         """
         Get transpiled trainer side program.
@@ -393,6 +393,9 @@ class DistributeTranspiler(object):
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         self.origin_program.__str__()

+        if wait_port:
+            wait_server_ready(self.pserver_endpoints)
+
         return self.origin_program

     def _get_trainer_startup_program(self, recv_vars, eplist):
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index adad2428f7fdc554cf4efd652f52b5c5de0ab527..49ba2cfd55bc881ed753fcefbd41f5b8fd4ebaf7 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -65,8 +65,43 @@ class InferenceTranspiler(object):
         if use_mkldnn:
             self._fuse_conv_bias_mkldnn(program)
             self._fuse_conv_relu_mkldnn(program)
+            self._fuse_conv_eltwise_mkldnn(program)
+            self._fuse_conv_relu_mkldnn(
+                program)  # ResNet residual block merging
             self._fuse_bn_relu_mkldnn(program)

+    def _fuse_conv_eltwise_mkldnn(self, program):
+        '''
+        Transpile the program by fusing elementwise_add into conv for the
+        MKLDNN program. An elementwise_add that follows a convolution OP can
+        be fused by adding the 'fuse_eltwise' attribute to the convolution OP
+        and replacing its output Tensor with the second parameter of
+        elementwise_add.
+        The result of the fuse is:
+            - before:
+                - conv->elementwise_add->any_other_op
+            - after:
+                - conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.type in ['conv2d']:
+                next_op = self.block.ops[i + 1]
+                if next_op.type == 'elementwise_add':
+                    self._fuse_conv_eltwise(current_op, next_op)
+                    self.block._remove_op(i + 1)  # Remove elementwise_add
+            i = i + 1
+        self._adjust_input()
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
     def _fuse_conv_relu_mkldnn(self, program):
         '''
         Transpile the program by fused relu activation for MKLDNN program.
@@ -88,9 +123,9 @@ class InferenceTranspiler(object):
             if current_op.type in ['conv2d']:
                 next_op = self.block.ops[i + 1]
                 if next_op.type == 'relu':
                     # modify conv OP to include relu
                     current_op.set_attr("fuse_relu", True)
-                    # remove conv OP
+                    # remove relu OP
                     self.block._remove_op(i + 1)
             i = i + 1
@@ -409,6 +444,20 @@ class InferenceTranspiler(object):
                 outputs={"Output": out_var},
                 attrs=attrs)

+    def _fuse_conv_eltwise(self, conv_op, eltwise_op):
+        '''
+        fuse the conv op with elementwise_add
+
+        :param conv_op: convolution operator
+        :type conv_op: Operator
+        :param eltwise_op: operator adding data from skip connection
+        :type eltwise_op: Operator
+        '''
+        conv_op.set_attr("fuse_eltwise", True)
+        self.input_map[conv_op.output("Output")[0]] = eltwise_op.input("Y")[0]
+        self.input_map[eltwise_op.output("Out")[0]] = eltwise_op.input("Y")[0]
+
     def _adjust_input(self):
         for i in range(len(self.block.ops)):
             current_op = self.block.ops[i]
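To make `_fuse_conv_eltwise_mkldnn` concrete, here is the rewrite it performs, mocked on plain dicts; the real pass manipulates Program/Operator objects, and the names below are illustrative only:

    .. code-block:: python

        # before: conv2d -> elementwise_add(X=conv_out, Y=skip) -> relu
        ops = [
            {"type": "conv2d", "out": "conv_out", "attrs": {}},
            {"type": "elementwise_add", "x": "conv_out", "y": "skip",
             "out": "sum_out"},
            {"type": "relu", "x": "sum_out", "out": "final"},
        ]

        conv, eltwise = ops[0], ops[1]
        conv["attrs"]["fuse_eltwise"] = True  # conv adds the skip tensor itself
        # Rename conv's output and the eltwise output to the skip-connection
        # input, mirroring the two input_map entries in _fuse_conv_eltwise.
        rename = {conv["out"]: eltwise["y"], eltwise["out"]: eltwise["y"]}
        ops.remove(eltwise)
        for op in ops:
            for key in ("x", "out"):
                if key in op and op[key] in rename:
                    op[key] = rename[op[key]]
        # after: conv2d(fuse_eltwise=True, out="skip") -> relu(x="skip")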