diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e7ffe72b5fb846fb55ab8dc4809d87a40cfe06c..6bb0e5f51f4bee20905016579a99715859ab37c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") +message(STATUS "AR tools: ${CMAKE_AR}") + if(WIN32) set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_STATIC_LIBRARY_PREFIX lib) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 11a5b1b4554e7899c3ee7092a9295234743750d7..34c6cbd73ddd67860ef4e74ad7ce98b9b954d9ad 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -110,7 +110,7 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op") +"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 68c6c8fd67db9f4428d612e86305fa0ba5f98a50..66fc323e6b96b065e4d44e2145dd277769ba91f8 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -68,7 +68,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee')) paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29')) paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd')) +paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '424e898365195e3ccbc2e7dc8b63605e')) paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a')) paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6')) paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d')) @@ -91,7 +91,7 @@ paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497')) -paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d')) +paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '320c6973b02ea179fa89fecc80796464')) paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab')) paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb')) paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9')) @@ -330,7 +330,8 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8')) +paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) @@ -367,7 +368,7 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7')) paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47')) paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa')) -paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) +paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) @@ -392,9 +393,9 @@ paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold' paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d')) paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645')) -paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7')) -paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196')) -paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8')) +paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67')) +paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f')) +paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b')) paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e')) paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634')) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 2cfc76e47f41862731fb2de5d1d03287acd4d9d7..932d0b4538eb2ec5df97d0bde806c33f825b6f68 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" @@ -49,6 +50,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("sequential_execution_pass"); } + // Add op fusion. + if (strategy.sync_batch_norm_) { + AppendPass("sync_batch_norm_pass"); + } + // Add op fusion. if (strategy.fuse_relu_depthwise_conv_) { AppendPass("fuse_relu_depthwise_conv_pass"); @@ -227,6 +233,7 @@ std::unique_ptr BuildStrategy::Apply( } // namespace framework } // namespace paddle +USE_PASS(sync_batch_norm_pass); USE_PASS(fuse_relu_depthwise_conv_pass); USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index d755a2505aead37538bef2b01a193dba87dc1567..122411641dacde57ef3851f05bc92d86c1f83866 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -77,6 +77,8 @@ struct BuildStrategy { bool fuse_relu_depthwise_conv_{false}; + bool sync_batch_norm_{false}; + bool memory_optimize_{true}; // TODO(dzhwinter): // make enable_inplace, memory_optimize_ diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index c89a33fc959247afb74dab49056fc3fca8b9bd89..533d3269be350de35954e575965fe7a089941058 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) { auto type = node.GetType(); // only these types holds bulk of gpu memory if (!(type == proto::VarType::LOD_TENSOR || - type == proto::VarType::SELECTED_ROWS || type == proto::VarType::LOD_TENSOR_ARRAY)) { return false; } diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index 9bccb1a32bf63b30351ef4428594691b0eef0b6a..f2f4c53eea2150b68f15d2a655809d94611b2034 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include #include "paddle/fluid/framework/op_desc.h" @@ -55,11 +57,11 @@ class GradOpDescMakerBase { std::back_inserter(ret_val), [this](const std::string& fwd_var_name) -> std::string { auto g_name = GradVarName(fwd_var_name); - if (no_grad_set_.count(g_name)) { - return kEmptyVarName; - } else { + if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) { (*this->grad_to_var_)[g_name] = fwd_var_name; return g_name; + } else { + return kEmptyVarName; } }); if (!drop_empty_grad) { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index ca6b0229e906c0f8bfbf9ee6781013cb4ef7bbce..3808dd5fbaeaa3671e50bb31ea532251a1e9f6fc 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -46,6 +46,8 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(lock_free_optimize_pass base) +pass_library(cpu_quantize_pass inference) +pass_library(cpu_quantize_squash_pass inference) pass_library(fc_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) @@ -66,6 +68,7 @@ pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) +pass_library(sync_batch_norm_pass base) # There may be many transpose-flatten structures in a model, and the output of # these structures will be used as inputs to the concat Op. This pattern will @@ -100,6 +103,11 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) +cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) +cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) +if(NOT WIN32) + cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) +endif() if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..edfaf47f018a61d72aa3764185f2c185722b553f --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc @@ -0,0 +1,239 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cpu_quantize_pass.h" +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace { + +void UnlinkNodes(ir::Node* a, ir::Node* b) { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); +} + +} // namespace + +enum { U8_MAX = 255, S8_MAX = 127 }; + +using EigenVectorArrayMap = Eigen::Map>; +using string::PrettyLogDetail; + +void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, + std::string input_name, double scale_to_one, + bool is_unsigned, + std::string scale_attr_name) const { + unsigned max = is_unsigned ? U8_MAX : S8_MAX; + float scale = scale_to_one * max; + + // Create quantize output variable + VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); + auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); + + // create a quantize op node + OpDesc q_desc; + q_desc.SetType("quantize"); + q_desc.SetInput("Input", std::vector({input->Name()})); + q_desc.SetOutput("Output", + std::vector({quantize_out_node->Name()})); + q_desc.SetAttr("Scale", scale); + q_desc.SetAttr("is_negative_input", !is_unsigned); + auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. + + // update op's input + op->Op()->SetInput(input_name, + std::vector({quantize_out_node->Name()})); + + // link quantize op + UnlinkNodes(input, op); + IR_NODE_LINK_TO(input, quantize_op); + IR_NODE_LINK_TO(quantize_op, quantize_out_node); + IR_NODE_LINK_TO(quantize_out_node, op); + + if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); +} + +void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, + std::string output_name, + double scale_to_one, bool is_unsigned, + std::string scale_attr_name) const { + unsigned max = is_unsigned ? U8_MAX : S8_MAX; + float scale = scale_to_one * max; + + // Create dequantize input variable + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + + // create a dequantize op node for output. + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + deq_desc.SetInput("Input", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetOutput("Output", std::vector({output->Name()})); + deq_desc.SetAttr("Scale", scale); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. + + // update op's output + op->Op()->SetOutput(output_name, + std::vector({dequantize_in_node->Name()})); + + // link dequantize op + UnlinkNodes(op, output); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, output); + + if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); +} + +void CPUQuantizePass::QuantizeConv(Graph* graph, + bool with_residual_data) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::ConvResidual conv_pattern{pattern, name_scope_}; + conv_pattern(with_residual_data); + + int quantize_conv_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize conv2d op"; + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + auto* conv_op_desc = conv_op->Op(); + + // skip if should not be quantized + if (!conv_op_desc->HasAttr("use_quantizer") || + !boost::get(conv_op_desc->GetAttr("use_quantizer"))) + return; + + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + auto input_scale = scales[conv_input->Name()].second.data()[0]; + bool is_input_unsigned = scales[conv_input->Name()].first; + QuantizeInput(g, conv_op, conv_input, "Input", input_scale, + is_input_unsigned, "Scale_in"); + + auto filter_scale_tensor = scales[conv_filter->Name()].second; + EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data(), + filter_scale_tensor.numel(), 1}; + eigen_tensor *= static_cast(S8_MAX); + std::vector filter_scale{ + filter_scale_tensor.data(), + filter_scale_tensor.data() + filter_scale_tensor.numel()}; + + conv_op->Op()->SetAttr("Scale_weights", filter_scale); + + if (with_residual_data) { + GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data, + conv_pattern); + auto residual_scale = + scales[conv_residual_data->Name()].second.data()[0]; + bool is_residual_unsigned = scales[conv_residual_data->Name()].first; + + QuantizeInput(g, conv_op, conv_residual_data, "ResidualData", + residual_scale, is_residual_unsigned, "Scale_in_eltwise"); + } + + auto output_scale = scales[conv_output->Name()].second.data()[0]; + bool is_output_unsigned = scales[conv_output->Name()].first; + DequantizeOutput(g, conv_op, conv_output, "Output", output_scale, + is_output_unsigned, "Scale_out"); + + ++quantize_conv_count; + }; + + gpd(graph, handler); + AddStatis(quantize_conv_count); + + std::stringstream msg_ss; + msg_ss << "--- quantized " << quantize_conv_count << " conv2d ops"; + if (with_residual_data) msg_ss << " with residual connection"; + PrettyLogDetail(msg_ss.str().c_str()); +} + +void CPUQuantizePass::QuantizePool(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::Pool pool_pattern{pattern, name_scope_}; + pool_pattern(); + + int quantize_pool_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize pool2d op"; + GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern); + auto* pool_op_desc = pool_op->Op(); + + // skip if should not be quantized + if (!pool_op_desc->HasAttr("use_quantizer") || + !boost::get(pool_op_desc->GetAttr("use_quantizer"))) + return; + + GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern); + GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + auto input_scale = scales[pool_input->Name()].second.data()[0]; + bool is_input_unsigned = scales[pool_input->Name()].first; + QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned); + + auto output_scale = scales[pool_output->Name()].second.data()[0]; + bool is_output_unsigned = scales[pool_output->Name()].first; + DequantizeOutput(g, pool_op, pool_output, "Out", output_scale, + is_output_unsigned); + + ++quantize_pool_count; + }; + + gpd(graph, handler); + AddStatis(quantize_pool_count); + + PrettyLogDetail("--- quantized %d pool2d ops", quantize_pool_count); +} + +std::unique_ptr CPUQuantizePass::ApplyImpl( + std::unique_ptr graph) const { + VLOG(3) << "Quantizing the graph."; + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + PADDLE_ENFORCE(param_scope()); + + QuantizeConv(graph.get(), true /* with_residual_data */); + QuantizeConv(graph.get()); + QuantizePool(graph.get()); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass) + .RequirePassAttr("quant_var_scales"); diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.h b/paddle/fluid/framework/ir/cpu_quantize_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..9873bb04e138a745ac6aa44cf5791651ad897444 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_pass.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Map variable name to tensor of scaling factors scaling it to MAX=1.0. + * bool denotes whether quantization of the variable should be done to unsigned + * type. + */ +using VarQuantScale = + std::unordered_map>; + +/* + * Quantize all supported operators. + */ +class CPUQuantizePass : public FusePassBase { + public: + virtual ~CPUQuantizePass() {} + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + void QuantizeConv(Graph* graph, bool with_residual_data = false) const; + + void QuantizePool(Graph* graph) const; + + void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, + double scale_to_one, bool is_unsigned, + std::string scale_attr_name = "") const; + + void DequantizeOutput(Graph* g, Node* op, Node* output, + std::string output_name, double scale_to_one, + bool is_unsigned, + std::string scale_attr_name = "") const; + + const std::string name_scope_{"quantize"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..89601be7d1c0f5c9d3c3dcefa4327be7c20a7d65 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cpu_quantize_pass.h" +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn, + bool use_quantizer = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + if (inputs.size() > 3) { + op->SetInput("ResidualData", {inputs[3]}); + op->SetAttr("fuse_residual_connection", true); + } else { + op->SetInput("ResidualData", {}); + op->SetAttr("fuse_residual_connection", false); + } + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + op->SetAttr("Scale_in", 1.0f); + op->SetAttr("Scale_out", 1.0f); + op->SetAttr("Scale_weights", std::vector{1.0f}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + } else if (type == "dropout") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + } else if (type == "fc") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("W", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Out", {outputs[0]}); + } +} + +static const std::initializer_list variable_names{ + "a", "w1", "c", "d", "w2", "e", "f", "g", + "h", "w3", "b1", "i", "j", "w4", "b2"}; +// (a,w1)->Conv1->c and c->Pool1->d +// +// (d,w2)->Conv2->e and e->Pool2->f +// +// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j +// +// (d,w4, b2)->Conv4->i +ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) { + ProgramDesc prog; + for (auto& v : variable_names) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("w") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn, + use_quantizer); + SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer); + + SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn, + use_quantizer); + SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer); + + SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn); + SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn, + use_quantizer); + + SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn, + use_quantizer); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); +} + +void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, + int quant_count, int dequant_count, int added_nodes_count, + float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + + // Init scope, as it is used in pass + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + exe.CreateVariables(prog, 0, true, &scope); + + auto* scales = new VarQuantScale(); + + for (auto& v : variable_names) { + InitTensorHolder(&scope, place, v.c_str()); + LoDTensor tensor; + tensor.Resize({1}); + auto* ptr = tensor.mutable_data(place); + ptr[0] = 2.0; + + (*scales)[v] = std::make_pair(false, std::move(tensor)); + } + + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_pass"); + pass->Set("quant_var_scales", scales); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int conv2d_nodes_count = 0; + int pool2d_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "conv2d") { + conv2d_nodes_count++; + auto op_name = boost::get(op->GetAttr("name")); + EXPECT_EQ(boost::get(op->GetAttr("Scale_in")), scale) + << "Scale_in for node '" + op_name + "'."; + EXPECT_EQ(boost::get(op->GetAttr("Scale_out")), scale) + << "Scale_out for node '" + op_name + "'."; + EXPECT_EQ( + boost::get>(op->GetAttr("Scale_weights"))[0], + scale) + << "Scale_weights for node '" + op_name + "'."; + } else if (op->Type() == "pool2d") { + pool2d_nodes_count++; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + EXPECT_EQ(conv2d_nodes_count, conv_count); + EXPECT_EQ(pool2d_nodes_count, pool_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, quantize) { + bool use_mkldnn = true; + bool use_quantizer = true; + // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and + // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d + // + // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and + // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f + // + // d->Dropout1->g and g->Fc1->h and + // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j + // + // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i + // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT + int added_nodes = 7 + 7 + 6 + 6; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes, + 2.0f * 127); +} + +TEST(CpuQuantizePass, do_not_quantize) { + bool use_mkldnn = true; + bool use_quantizer = false; + int added_nodes = 0; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes, + 1.0f); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_pass); diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..de62a69de4f25912c5f56973de0dca5343bbe906 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file eint8_outcept in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either eint8_outpress or +// implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void CPUQuantizeSquashPass::FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const { + GraphPatternDetector gpd; + patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "deqant_any"}; + deq_any_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern); + + if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end()) + (*nodes_keep_counter)[dequant_out] = 1; + else + (*nodes_keep_counter)[dequant_out] += 1; + + found_count++; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +void CPUQuantizeSquashPass::Squash( + Graph* graph, + std::unordered_map* nodes_keep_counter) const { + GraphPatternDetector gpd; + patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"}; + squash_pattern(); + + int found_squash_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "squash requantize-quantize ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern); + + auto* next_op_desc = next_op->Op(); + float dequant_scale = boost::get(dequant_op->Op()->GetAttr("Scale")); + float quant_scale = boost::get(quant_op->Op()->GetAttr("Scale")); + PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) != + nodes_keep_counter->end()); + + // check if dequantize op should be kept or removed, decrease the counter + bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; + + if (dequant_scale == quant_scale) { + // squash dequantize-quantize to nothing + auto quant_out_var_name = quant_out->Name(); + auto next_op_inputs = next_op_desc->InputNames(); + for (const auto& name : next_op_inputs) { + auto var_name = next_op_desc->Input(name)[0]; + if (var_name.compare(quant_out_var_name) == 0) { + next_op_desc->SetInput( + name, std::vector({dequant_in->Name()})); + break; + } + } + + if (keep_dequant) + GraphSafeRemoveNodes(graph, {quant_op, quant_out}); + else + GraphSafeRemoveNodes(graph, + {dequant_op, quant_op, dequant_out, quant_out}); + + IR_NODE_LINK_TO(dequant_in, next_op); + + found_squash_count++; + } else { + // squash dequantize-quantize to requantize op + OpDesc desc; + desc.SetType("requantize"); + desc.SetInput("Input", std::vector({dequant_in->Name()})); + desc.SetOutput("Output", std::vector({quant_out->Name()})); + desc.SetAttr("Scale_in", dequant_scale); + desc.SetAttr("Scale_out", quant_scale); + + auto requant_op = g->CreateOpNode(&desc); + + if (keep_dequant) + GraphSafeRemoveNodes(graph, {quant_op}); + else + GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out}); + + IR_NODE_LINK_TO(dequant_in, requant_op); + IR_NODE_LINK_TO(requant_op, quant_out); + + found_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_squash_count); + PrettyLogDetail("--- squashed %d dequantize-quantize pairs", + found_squash_count); +} + +std::unique_ptr CPUQuantizeSquashPass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("cpu_quantize_squash_pass", graph.get()); + + std::unordered_map nodes_keep_counter; + FindNodesToKeep(graph.get(), &nodes_keep_counter); + Squash(graph.get(), &nodes_keep_counter); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_squash_pass, + paddle::framework::ir::CPUQuantizeSquashPass); diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b823a2cef35b2f9994df9c9473246db3d69843e7 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Squash dequantize->quantize pair pattern into requantize op + */ +class CPUQuantizeSquashPass : public FusePassBase { + public: + virtual ~CPUQuantizeSquashPass() {} + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + /* + * For each dequantize's output find the number of operators it is an input to + */ + void FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + /* + * Squash dequantize-quantize ops pairs into requantize or nothing + */ + void Squash(Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + const std::string name_scope_{"squash"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a3eb53f79955b37f5f9af6a09b2f9c8e934aa3e --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h" +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn, + float scale = 0) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Output", {outputs[0]}); + } else if (type == "quantize") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("Scale", scale); + } else if (type == "dequantize") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("Scale", scale); + } +} + +// (a,w1,b1)->Conv1->d +// d->Dequant->e +// e->Quant->f +// (f,w2,b2)->Conv2->i +ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) { + ProgramDesc prog; + for (auto& v : std::initializer_list( + {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("w") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn); + SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1); + SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2); + SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn); + return prog; +} + +static const std::initializer_list variable_names{ + "a", "b", "c", "d", "e", "f", "g", "h"}; +// a->Conv1->b +// b->Dequant->c +// +// c->Quant1->d and d->Conv2->e +// +// c->Conv3->f +// +// c->Quant2->g and g->Conv4->h +// +ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2, + float scale3) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn); + SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1); + + SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2); + SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn); + + SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn); + + SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3); + SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); +} + +void MainTest(const ProgramDesc& prog, int removed_nodes_num) { + std::unique_ptr graph(new ir::Graph(prog)); + + // Init scope, as it is used in pass + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + exe.CreateVariables(prog, 0, true, &scope); + + for (auto& v : variable_names) { + InitTensorHolder(&scope, place, v.c_str()); + } + + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num); +} + +TEST(CpuQuantizeSquashPass, equal_scales) { + auto scale = 1.2345f; + auto use_mkldnn = true; + // Remove 4 nodes: Dequant, Quant, e, f + auto remove_nodes = 4; + MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes); +} + +TEST(CpuQuantizeSquashPass, inequal_scales) { + auto scale1 = 1.2345f; + auto scale2 = 21.0f; + auto use_mkldnn = true; + // Remove 3 nodes: Dequant, Quant, e + // Insert 1 node: requantize + auto remove_nodes = 2; + MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes); +} + +TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) { + // Delete both quantize ops, + // bypass dequantize in both branches, + // insert requantize on one branch + auto scale = 1.2345f; + auto scale2 = 21.0f; + auto use_mkldnn = true; + // Remove 3 nodes: Quant1, Quant2, g + // Insert 1 node: requantize + auto remove_nodes = 2; + MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_squash_pass); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index c0c34d186b00814fe6c6fd42beb78133233a1357..b653e5a521eeb81d1ac3cb5cca1dc86025837ecd 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph, ValidateByNodeRole(&subgraphs); if (subgraphs.empty()) return; - PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); + PrettyLogEndl(Style::detail(), "--- detected %d subgraphs", + subgraphs.size()); int id = 0; for (auto &g : subgraphs) { VLOG(3) << "optimizing #" << id++ << " subgraph"; @@ -1074,9 +1075,53 @@ PDNode *patterns::Conv::operator()() { ->AsOutput() ->assert_is_op_output("conv2d", "Output"); - conv_op->LinksFrom({input_var, filter_var}); - conv_op->LinksTo({output_var}); + conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::ConvResidual::operator()(bool with_residual_data) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + + if (!with_residual_data) + conv_op->assert_op_attr("fuse_residual_connection", false); + + auto input_var = pattern->NewNode(conv_input_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + + auto filter_var = pattern->NewNode(conv_filter_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Filter"); + + auto output_var = pattern->NewNode(conv_output_repr()) + ->AsOutput() + ->assert_is_op_output("conv2d", "Output"); + + std::vector links_from{input_var, filter_var}; + + if (with_residual_data) { + auto res_conn_var = pattern->NewNode(conv_residual_data_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "ResidualData"); + links_from.push_back(res_conn_var); + } + + conv_op->LinksFrom(links_from).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::Pool::operator()() { + auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d"); + + auto input_var = pattern->NewNode(pool_input_repr()) + ->AsInput() + ->assert_is_op_input("pool2d", "X"); + + auto output_var = pattern->NewNode(pool_output_repr()) + ->AsOutput() + ->assert_is_op_output("pool2d", "Out"); + pool_op->LinksFrom({input_var}).LinksTo({output_var}); return output_var; } @@ -1301,6 +1346,51 @@ PDNode *patterns::ConvAffineChannel::operator()( return ac_out_var; } +PDNode *patterns::DequantQuantAny::operator()() { + auto *dequant_in = pattern->NewNode(dequant_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize", "Input"); + + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize", "Output"); + + auto *quant_op = pattern->NewNode(quant_op_repr()) + ->assert_is_op("quantize") + ->AsIntermediate(); + + auto *quant_out = pattern->NewNode(quant_out_repr()) + ->AsOutput() + ->assert_is_op_output("quantize"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out}); + quant_op->LinksFrom({dequant_out}).LinksTo({quant_out}); + next_op->LinksFrom({quant_out}); + + return quant_out; +} + +PDNode *patterns::DequantAny::operator()() { + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize", "Output"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksTo({dequant_out}); + next_op->LinksFrom({dequant_out}); + + return dequant_out; +} + // a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a // b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b // ... diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index c8be586f546dc604375401b13a801841efbf08d2..fc30b5b21c580afdede64421bb4a1f4174bbad03 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -18,8 +18,11 @@ #include #endif +#include #include #include +#include +#include #include #include #include "paddle/fluid/framework/ir/graph.h" @@ -656,6 +659,35 @@ struct Conv : public PatternBase { PATTERN_DECL_NODE(conv_output); }; +// Convolution op with residual data +struct ConvResidual : public PatternBase { + ConvResidual(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_residual") {} + + PDNode* operator()(bool with_residual_data); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_input); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_residual_data); + PATTERN_DECL_NODE(conv_output); +}; + +// Pool op +// Forward pass for pooling. +// pool_input is the input. +// pool_output is a result of the operator. +struct Pool : public PatternBase { + Pool(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "pooling") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(pool_op); + PATTERN_DECL_NODE(pool_input); + PATTERN_DECL_NODE(pool_output); +}; + // ElementwiseAdd used in residual connections. // y_var is used and convolution output. // The operator is removed, when residual @@ -766,6 +798,34 @@ struct ConvAffineChannel : public PatternBase { PATTERN_DECL_NODE(ac_out); // Out }; +// Dequantize + Quantize + anyOP +// This pattern is used for squashing the dequantize-quantize pairs. +struct DequantQuantAny : public PatternBase { + DequantQuantAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dequant_quant_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(dequant_in); + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); + PATTERN_DECL_NODE(quant_op); + PATTERN_DECL_NODE(quant_out); + PATTERN_DECL_NODE(next_op); +}; + +// Dequantize + anyOP +// This quantize is used for getting number of ops the Dequantize's +// output is an input to. +struct DequantAny : public PatternBase { + DequantAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dequant_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); + PATTERN_DECL_NODE(next_op); +}; + struct TransposeFlattenConcat : public PatternBase { TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "transpose_flatten_concat") {} diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b37003991505140b0d531a4ea2b481c6d4b09d75 --- /dev/null +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h" +#include +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr SyncBatchNormPass::ApplyImpl( + std::unique_ptr graph) const { + VLOG(3) << "Use synchronous batch norm"; + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + auto* op = n->Op(); + if (op->Type() == "batch_norm") { + op->SetType("sync_batch_norm"); + } + if (op->Type() == "batch_norm_grad") { + op->SetType("sync_batch_norm_grad"); + } + } + } + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass); diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.h b/paddle/fluid/framework/ir/sync_batch_norm_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..51cce3dca69330071f7d12efef08e2006e8bd7ac --- /dev/null +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SyncBatchNormPass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c94c1746a6590df5a43c099b9c4c3678ca6e393 --- /dev/null +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("name", name); + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); +} + +// (a, conv_w)->conv2d->b +// (b, bn_scale, bn_bias, mean, var)->batch_norm +// ->(c, mean, var, save_mean, save_inv_var) +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector({"a", "conv_w", "b", "bn_scale", + "bn_bias", "mean", "var", "c", + "save_mean", "save_inv_var"})) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" || + v == "var") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "conv", std::vector({"a", "conv_w"}), + std::vector({"b"})); + SetOp(&prog, "batch_norm", "bn", + std::vector({"b", "bn_scale", "bn_bias", "mean", "var"}), + std::vector( + {"c", "mean", "var", "save_mean", "save_inv_var"})); + return prog; +} + +TEST(IsTestPass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass"); + + graph = pass->Apply(std::move(graph)); + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "bn") { + ASSERT_EQ(op->Type(), "sync_batch_norm"); + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(sync_batch_norm_pass); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index acc71396b441149988f654843482a9e292977de3..56f108cea2e5d7dadbea2e2cbec39dbe7f4ba094 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -14,8 +14,10 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include +#include #include #include +#include #include #include "paddle/fluid/framework/ir/graph_helper.h" @@ -251,6 +253,20 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->places_, nccl_id, build_strategy.num_trainers_, build_strategy.trainer_id_)); + + std::unique_ptr dev_nccl_ctxs; + dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_)); + // Initialize device context's nccl comm + // Note, more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto &nccl_ctx = dev_nccl_ctxs->at(dev_id); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } #else PADDLE_THROW("Not compiled with CUDA"); #endif diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index ec8dedd605235a2d197e6a313bd589d5b9520cdf..0d116a6495477ca69c10c130e63247a4f6c03b23 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -2,4 +2,5 @@ if(WITH_PYTHON) cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) +cc_library(imperative_profiler SRCS profiler.cc) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 5530823b90f6580692456253b0eb9d0af4e3240b..3c7ddf08308ad40ed6106c7930bb409328d3048a 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -214,10 +214,8 @@ framework::LoDTensor& VarBase::GradValue() { } std::map> OpBase::ApplyGrad() { - if (grad_op_descs_.empty() && backward_id_ <= 0) { - VLOG(3) << "op with no grad: " << Type(); - return {}; - } + PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0, + "%s has no backward implementation", Type()); VLOG(3) << "apply op grad: " << Type(); std::vector tmp_grad_outputs; @@ -239,7 +237,7 @@ std::map> OpBase::ApplyGrad() { VLOG(3) << "apply grad op " << grad_op_desc->Type(); // Allocate tmp grad output variable - for (auto it : grad_output_variable_map) { + for (const auto& it : grad_output_variable_map) { auto& outputs = tmp_grad_outputs[k][it.first]; outputs.reserve(it.second.size()); for (size_t i = 0; i < it.second.size(); ++i) { @@ -273,9 +271,9 @@ std::map> OpBase::ApplyGrad() { // Add tmp grad outputs to original grad vars for (size_t k = 0; k < grad_output_vars_.size(); ++k) { - for (auto it : grad_output_vars_[k]) { + for (const auto& it : grad_output_vars_[k]) { auto& outputs = tmp_grad_outputs[k][it.first]; - auto& origin_outputs = it.second; + const auto& origin_outputs = it.second; PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 618a5b7a03295ce679dc6a88e0eac57069e78b8b..27cb1c84f568867506afc0ca5ce6eb1b28447efe 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -294,17 +294,23 @@ class PYBIND11_HIDDEN OpBase { void InvokeBackwardHooks(); - void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) { - if (inp_var->PreOp() && !inp_var->IsStopGradient()) { - VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " - << inp_name; - pre_ops_[inp_name].push_back(inp_var->PreOp()); - pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx()); - } else { - VLOG(3) << "no pre op in slot " << inp_name - << " input var stop_gradient: " << inp_var->IsStopGradient(); - pre_ops_[inp_name].push_back(nullptr); - // pre_ops_out_idx_[inp_name].push_back(-1); + void TrackPreOp(const std::string& inp_name, + const std::vector& inputs) { + auto& pre_ops_list = pre_ops_[inp_name]; + pre_ops_list.reserve(inputs.size()); + auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name]; + for (VarBase* inp_var : inputs) { + if (inp_var->PreOp() && !inp_var->IsStopGradient()) { + VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " + << inp_name; + pre_ops_list.emplace_back(inp_var->PreOp()); + pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx()); + } else { + VLOG(3) << "no pre op in slot " << inp_name + << " input var stop_gradient: " << inp_var->IsStopGradient(); + pre_ops_list.emplace_back(nullptr); + // pre_ops_out_idx_list.push_back(-1); + } } } diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..34570b3a60ec83fdeb1577789271942125b16eb1 --- /dev/null +++ b/paddle/fluid/imperative/profiler.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/profiler.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif +#include +#include +#include // NOLINT +#include // NOLINT + +DEFINE_string( + tracer_profile_fname, "xxgperf", + "Profiler filename for imperative tracer, which generated by gperftools." + "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); + +namespace paddle { +namespace imperative { + +static std::once_flag gTracerProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gTracerProfilerStarted = false; +#endif + +void StartProfile() { + if (!FLAGS_tracer_profile_fname.empty()) { + std::call_once(gTracerProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_tracer_profile_fname.c_str()); + gTracerProfilerStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif + }); + } +} + +void StopProfile() { +#ifdef WITH_GPERFTOOLS + ProfilerFlush(); +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/profiler.h b/paddle/fluid/imperative/profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..d52aeed4e81755cfa285616d7b0a7e79061c6af8 --- /dev/null +++ b/paddle/fluid/imperative/profiler.h @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace imperative { + +extern void StartProfile(); + +extern void StopProfile(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7ee92b4d8c46d8814400dbc02847d701005f3d5b..8ce05a2b52477c6e6e05be20999e30f1fa961f08 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -23,34 +23,21 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#ifdef WITH_GPERFTOOLS -#include "gperftools/profiler.h" -#endif - -DEFINE_string( - tracer_profile_fname, "", - "Profiler filename for imperative tracer, which generated by gperftools." - "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); - namespace paddle { namespace imperative { -static std::once_flag gTracerProfileOnce; -#ifdef WITH_GPERFTOOLS -static bool gTracerProfilerStarted = false; -#endif - void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, std::vector* grad_op_descs, std::unordered_map* grad_to_var) { PADDLE_ENFORCE(grad_op_descs->empty()); - std::vector> descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + const framework::OpInfo& op_info = + framework::OpInfoMap::Instance().Get(op_desc.Type()); + if (!op_info.grad_op_maker_) return; + std::vector> descs = + op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); for (auto& desc : descs) { grad_op_descs->emplace_back(desc.release()); } @@ -145,31 +132,13 @@ framework::VariableNameMap CreateOutputVarNameMap( return result; } -Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { - if (!FLAGS_tracer_profile_fname.empty()) { - std::call_once(gTracerProfileOnce, [] { -#ifdef WITH_GPERFTOOLS - ProfilerStart(FLAGS_tracer_profile_fname.c_str()); - gTracerProfilerStarted = true; -#else - LOG(WARNING) << "Paddle is not compiled with gperftools. " - "FLAGS_tracer_profile_fname will be ignored"; -#endif - }); - } -} +Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient) { -#ifdef WITH_GPERFTOOLS - if (gTracerProfilerStarted) { - ProfilerFlush(); - } -#endif - framework::VariableValueMap invars_map; framework::VariableValueMap outvars_map; @@ -184,7 +153,6 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, inp->Name()); invars.emplace_back(inp->var_); - op->TrackPreOp(inp, it.first); if (!stop_gradient) { current_vars_map[inp->Name()] = inp; } @@ -192,6 +160,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, << " inited: " << inp->var_->IsInitialized() << " stop_grad: " << inp->IsStopGradient(); } + op->TrackPreOp(it.first, it.second); } op->output_vars_ = outputs; @@ -319,9 +288,7 @@ std::vector Tracer::PyTrace(OpBase* op, std::vector ret_vars = PyLayer::Apply(op->forward_id_, inputs); - for (VarBase* inp : inputs) { - op->TrackPreOp(inp, PyLayer::kFwdInp); - } + op->TrackPreOp(PyLayer::kFwdInp, inputs); std::vector& outputs = op->output_vars_[PyLayer::kFwdOut]; outputs.reserve(ret_vars.size()); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 762640d6d1ce12dff511fc7149e872efa834036c..d27ef8fe3c33f0b293671a4fdac9e574cb92c806 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -91,5 +91,5 @@ if(WITH_TESTING) add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) - endif() + endif() endif() diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 89e934ae27b9319d4e1d2d51586d5f8fa7dccfce..321deccf86718aad013c106b5a783161f96cbcb9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -38,7 +39,10 @@ namespace paddle { namespace inference { namespace analysis { + using framework::ir::Graph; +using VarQuantScale = + std::unordered_map>; /* * The argument definition of both Pass and PassManagers. @@ -127,6 +131,8 @@ struct Argument { // Pass a set of op types to enable its mkldnn kernel DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, std::unordered_set); + // Scales for variables to be quantized + DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); // Passed from config. DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 1cdb4881fbc1e2c0249430f7148bf56261bd6c41..8fd86b2cc56c4af50e735be2d660ec3db23e1547 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include +#include #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -55,14 +56,14 @@ void IRPassManager::CreatePasses(Argument *argument, ".dot"; pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass_num++; - } - if (pass_name == "mkldnn_placement_pass") { + } else if (pass_name == "mkldnn_placement_pass") { pass->Set("mkldnn_enabled_op_types", new std::unordered_set( argument->mkldnn_enabled_op_types())); - } - - if (pass_name == "tensorrt_subgraph_pass") { + } else if (pass_name == "cpu_quantize_pass") { + pass->Set("quant_var_scales", + new VarQuantScale(argument->quant_var_scales())); + } else if (pass_name == "tensorrt_subgraph_pass") { pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 77411112220dcb722d4d3482bc844720981a2da2..92526f4e74a217aa2cdfd43f258846ada54b9374 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -219,7 +219,14 @@ void AnalysisConfig::Update() { } if (enable_memory_optim_) { - pass_builder()->AppendAnalysisPass("memory_optimize_pass"); + auto analysis_passes = pass_builder()->AnalysisPasses(); + auto memory_opti_pass_name = "memory_optimize_pass"; + bool already_exists = + std::find(analysis_passes.begin(), analysis_passes.end(), + memory_opti_pass_name) != analysis_passes.end(); + if (!already_exists) { + pass_builder()->AppendAnalysisPass(memory_opti_pass_name); + } } if (ir_debug_) { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a3f2a69aef52b6f55aa09e6dee2c22c048626c0d..651c5e6e75834c27313abd79a33bedb62ecd2632 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -44,10 +44,10 @@ if (WITH_DISTRIBUTE) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) endif() -register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) +register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) -# warpctc_op needs cudnn 7 above if (WITH_GPU) + # warpctc_op needs cudnn 7 above if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() @@ -58,6 +58,10 @@ if (WITH_GPU) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() + if (NOT WIN32) + op_library(sync_batch_norm_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + endif() else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index feac4125381bd897dac89943af44850012e4761d..c0ad959309a7036639c4bc15621a2bd0296526f5 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" +#include #include +#include #include "paddle/fluid/framework/data_layout.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -22,147 +24,150 @@ limitations under the License. */ namespace paddle { namespace operators { -class BatchNormOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), ""); - PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); - PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); - PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); - PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); - PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); - PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), ""); - PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), ""); - PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); - PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); - - // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python - PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], - "Mean and MeanOut should share the same memory"); - PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], - ctx->Outputs("VarianceOut")[0], - "Variance and VarianceOut should share the same memory"); - - const auto x_dims = ctx->GetInputDim("X"); - const DataLayout data_layout = framework::StringToDataLayout( - ctx->Attrs().Get("data_layout")); - - PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, - "Input X must have 2 to 5 dimensions."); - - const int64_t C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C); - - ctx->SetOutputDim("Y", x_dims); - ctx->SetOutputDim("MeanOut", {C}); - ctx->SetOutputDim("VarianceOut", {C}); - ctx->SetOutputDim("SavedMean", {C}); - ctx->SetOutputDim("SavedVariance", {C}); - ctx->ShareLoD("X", "Y"); +void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of ConvOp should not be null."); + bool is_test = ctx->Attrs().Get("is_test"); + if (!is_test) { + PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), + "Output(MeanOut) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), + "Output(VarianceOut) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), + "Output(SavedMean) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), + "Output(SavedVariance) of ConvOp should not be null."); } - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = ctx.Input("X")->type(); - // By default, the type of the scale, bias, mean, - // and var tensors should both be float. (For float or float16 input tensor) - // or double (For double input tensor). - auto bn_param_type = framework::proto::VarType::FP32; - if (input_data_type == framework::proto::VarType::FP64) { - bn_param_type = framework::proto::VarType::FP64; - } - PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(), - "Scale input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(), - "Bias input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(), - "Mean input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(), - "Variance input should be of float type"); - - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::LibraryType library = framework::LibraryType::kPlain; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], + "Mean and MeanOut should share the same memory"); + PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0], + "Variance and VarianceOut should share the same memory"); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C); + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); +} + +framework::OpKernelType BatchNormOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = ctx.Input("X")->type(); + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). + auto bn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + bn_param_type = framework::proto::VarType::FP64; + } + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(), + "Scale input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(), + "Bias input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(), + "Mean input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(), + "Variance input should be of float type"); + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library = framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; - } -#endif - - return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library); + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } -}; +#endif -class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); - AddAttr("momentum", "").SetDefault(0.9); - AddAttr("epsilon", "") - .SetDefault(1e-5) - .AddCustomChecker([](const float &epsilon) { - PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, - "'epsilon' should be between 0.0 and 0.001."); - }); - AddAttr("data_layout", "").SetDefault("NCHW"); - AddInput("X", "The input tensor"); - AddInput("Scale", - "Scale is a 1-dimensional tensor of size C " - "that is applied to the output"); - AddInput("Bias", - "Bias is a 1-dimensional tensor of size C " - "that is applied to the output"); - AddInput("Mean", - "The global mean (for training) or " - "estimated mean (for testing)"); - AddInput("Variance", - "The global variance (for training) " - "or estimated Variance (for testing)"); - AddOutput("Y", "result after normalization"); - AddOutput("MeanOut", - "Share memory with Mean. " - "Store the global mean when training"); - AddOutput("VarianceOut", - "Share memory with Variance. " - "Store the global Variance when training"); - AddOutput("SavedMean", - "Mean of the current mini batch, " - "will apply to output when training") - .AsIntermediate(); - AddOutput("SavedVariance", - "Variance of the current mini batch, " - "will apply to output when training") - .AsIntermediate(); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddAttr("fuse_with_relu", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddAttr("use_global_stats", - "(bool, default false) Whether to use global mean and " - "variance. In inference or test mode, set use_global_stats " - "to true or is_test true. the behavior is equivalent. " - "In train mode, when setting use_global_stats True, the " - "global mean and variance are also used during train time, " - "the BN acts as scaling and shiffting.") - .SetDefault(false); - AddComment(R"DOC( + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); +} + +void BatchNormOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Mean", + "The global mean (for training) or " + "estimated mean (for testing)"); + AddInput("Variance", + "The global variance (for training) " + "or estimated Variance (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. " + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_with_relu", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("use_global_stats", + "(bool, default false) Whether to use global mean and " + "variance. In inference or test mode, set use_global_stats " + "to true or is_test true. the behavior is equivalent. " + "In train mode, when setting use_global_stats True, the " + "global mean and variance are also used during train time, " + "the BN acts as scaling and shiffting.") + .SetDefault(false); + AddComment(R"DOC( Batch Normalization. Batch Norm has been implemented as discussed in the paper: @@ -173,17 +178,7 @@ The required data format for this layer is one of the following: 2. NCHW `[batch, in_channels, in_height, in_width]` )DOC"); - } -}; - -class BatchNormOpInferVarType - : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map GetInputOutputWithSameType() - const override { - return std::unordered_map{{"X", /*->*/ "Y"}}; - } -}; +} template class BatchNormKernel @@ -336,82 +331,75 @@ class BatchNormKernel } }; -class BatchNormGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - // check input - PADDLE_ENFORCE(ctx->HasInput("X")); - PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), - "Input(Y@GRAD) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("SavedMean"), - "Input(SavedMean) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), - "Input(SavedVariance) should not be null"); - - // check output - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); - if (ctx->HasOutput(framework::GradVarName("Scale"))) { - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), - "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " - "null at same time"); - } - const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); - if (use_global_stats) { - PADDLE_ENFORCE(!ctx->Attrs().Get("use_mkldnn"), - "Using global stats during training is not supported " - "in gradient op kernel of batch_norm_mkldnn_op now."); - } +void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedMean"), + "Input(SavedMean) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), + "Input(SavedVariance) should not be null"); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), + "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " + "null at same time"); + } + const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); + if (use_global_stats) { + PADDLE_ENFORCE(!ctx->Attrs().Get("use_mkldnn"), + "Using global stats during training is not supported " + "in gradient op kernel of batch_norm_mkldnn_op now."); + } - const auto x_dims = ctx->GetInputDim("X"); - const DataLayout data_layout = framework::StringToDataLayout( - ctx->Attrs().Get("data_layout")); - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - if (ctx->HasOutput(framework::GradVarName("Scale"))) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); - } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); } +} - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - const auto *var = ctx.InputVar(framework::GradVarName("Y")); - if (var == nullptr) { - PADDLE_THROW("can't find Y@GRAD"); - } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); - } else if (var->IsType()) { - t = &var->Get(); - } - if (t == nullptr) { - PADDLE_THROW("can't find Y@GRAD"); - } +framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::LibraryType library = framework::LibraryType::kPlain; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library = framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; - } + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } #endif - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace(), layout, library); - } -}; + return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(), + layout, library); +} template class BatchNormGradKernel @@ -572,37 +560,31 @@ class BatchNormGradKernel } }; -class BatchNormGradMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto *op = new framework::OpDesc(); - op->SetType("batch_norm_grad"); - op->SetInput("X", Input("X")); - op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); - - op->SetInput("Scale", Input("Scale")); - op->SetInput("Bias", Input("Bias")); - op->SetInput("SavedMean", Output("SavedMean")); - op->SetInput("SavedVariance", Output("SavedVariance")); - - // used when setting use_global_stats True during training - if (boost::get(GetAttr("use_global_stats"))) { - op->SetInput("Mean", Output("MeanOut")); - op->SetInput("Variance", Output("VarianceOut")); - } +std::unique_ptr BatchNormGradMaker::Apply() const { + auto *op = new framework::OpDesc(); + op->SetType(GradOpType()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); + op->SetInput("SavedMean", Output("SavedMean")); + op->SetInput("SavedVariance", Output("SavedVariance")); + + // used when setting use_global_stats True during training + if (boost::get(GetAttr("use_global_stats"))) { + op->SetInput("Mean", Output("MeanOut")); + op->SetInput("Variance", Output("VarianceOut")); + } - op->SetAttrMap(Attrs()); + op->SetAttrMap(Attrs()); - op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); - op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); - return std::unique_ptr(op); - } -}; + return std::unique_ptr(op); +} class BatchNormInplaceInToOut : public framework::InplaceInToOut { public: @@ -642,10 +624,10 @@ class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { namespace ops = paddle::operators; REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, - ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, - ops::BatchNormInplaceInToOut); -REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, - ops::BatchNormGradInplaceInToOut); + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker) +// ops::BatchNormInplaceInToOut); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp) +// ops::BatchNormGradInplaceInToOut); REGISTER_OP_CPU_KERNEL( batch_norm, ops::BatchNormKernel, diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 1c45746a92ad057a97d9f65aa256df616fc37f3d..36d297ec5523b9e8a136c536165bdb4d3a380c25 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -33,26 +33,6 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; -void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, - int *N, int *C, int *H, int *W, int *D) { - *N = dims[0]; - if (dims.size() == 2) { - *C = dims[1]; - *H = 1; - *W = 1; - *D = 1; - } else { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) - : 1; - } -} - template class BatchNormKernel : public framework::OpKernel { @@ -196,22 +176,6 @@ class BatchNormKernel } }; -template -static __global__ void KeBNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *variance, - const double epsilon, const int C, - const int HxW, const int num, T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = static_cast(static_cast>(dy[i]) * - scale[c] * inv_var); - } -} - template static __global__ void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, @@ -248,6 +212,22 @@ static __global__ void KeBNBackwardScaleBias( } } +template +static __global__ void KeBNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *variance, + const double epsilon, const int C, + const int HxW, const int num, T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = static_cast(static_cast>(dy[i]) * + scale[c] * inv_var); + } +} + template class BatchNormGradKernel : public framework::OpKernel { @@ -383,7 +363,7 @@ class BatchNormGradKernel KeBNBackwardScaleBias<<< grid2, block, 0, dev_ctx.stream()>>>( d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, C, H * W, num, d_scale->data>(), + epsilon, N, C, H * W * D, d_scale->data>(), d_bias->data>()); } } else { @@ -394,10 +374,10 @@ class BatchNormGradKernel running_var_data, epsilon, C, H * W, num, d_x->data()); } if (d_scale && d_bias) { - KeBNBackwardScaleBias<<< + KeBNBackwardScaleBias<<< grid2, block, 0, dev_ctx.stream()>>>( d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, C, H * W, num, d_scale->data>(), + epsilon, N, C, H * W * D, d_scale->data>(), d_bias->data>()); } } diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 5e3d630d6889e445c5e84fa836d2d81bb7266779..6e89d73eb236ee7844c7de3c273e0b0f275a3e33 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -35,17 +38,84 @@ template using ConstEigenVectorArrayMap = Eigen::Map>; +class BatchNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class BatchNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +class BatchNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override; + + virtual std::string GradOpType() const { + return this->ForwardOpType() + "_grad"; + } +}; + +class BatchNormOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Y"}}; + } +}; + template class BatchNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override; }; template class BatchNormGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override; + void Compute(const framework::ExecutionContext &ctx) const override; }; +inline void ExtractNCWHD(const framework::DDim &dims, + const DataLayout &data_layout, int *N, int *C, int *H, + int *W, int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index ca6bc4df0fe2c6cddaf548d3e708e777172a0841..c6121d00dae4007f2fcaf57b0945d3f34233781d 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" +#include #include #include @@ -194,6 +195,12 @@ void Conv2DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. " + "Only used on CPU.") + .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); AddAttr("fuse_residual_connection", diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 7e744e68e9737f9338d4b787aa28fd1834b145da..a617b9fb1d948340d25853252be79fdd08fe0438 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -248,10 +248,14 @@ class CrossEntropyOp2 : public CrossEntropyOpBase { PADDLE_ENFORCE(ctx->HasOutput("XShape"), "Output(XShape) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("MatchX"), + "Output(MatchX) should be not null."); auto x_dims = ctx->GetInputDim("X"); auto x_dims_vec = framework::vectorize(x_dims); x_dims_vec.push_back(0); ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec)); + x_dims[x_dims.size() - 1] = 1; + ctx->SetOutputDim("MatchX", x_dims); ctx->ShareLoD("X", /*->*/ "XShape"); } @@ -264,6 +268,10 @@ class CrossEntropyOp2 : public CrossEntropyOpBase { class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase { public: using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("MatchX"), "Input(MatchX) must exist"); + CrossEntropyGradientOpBase::InferShape(ctx); + } protected: virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const { @@ -295,6 +303,8 @@ class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker { "with 'X' except that the last dimension size is 1. It " "represents the cross entropy loss."); AddOutput("XShape", "Temporaily variable to save shape and LoD of X."); + AddOutput("MatchX", + "X value that matches label, used for gradient computation."); AddAttr("ignore_index", "(int, default -100), Specifies a target value that is" "ignored and does not contribute to the input gradient." @@ -327,7 +337,7 @@ class CrossEntropyGradOpDescMaker2 : public framework::SingleGradOpDescMaker { std::unique_ptr op(new framework::OpDesc()); op->SetType("cross_entropy_grad2"); op->SetInput("Label", Input("Label")); - op->SetInput("Y", Output("Y")); + op->SetInput("MatchX", Output("MatchX")); op->SetInput("XShape", Output("XShape")); op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 05609e4bc20b1c75872be38e057de221a0188b88..7eb663773ed072760c47a2914377b5306ceeb7af 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -138,15 +138,48 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { } }; +template +struct HardLabelCrossEntropyForwardFunctor { + HardLabelCrossEntropyForwardFunctor(const T* x, T* y, T* match_x, + const int64_t* label, + int64_t ignore_index, + int64_t feature_size) + : x_(x), + y_(y), + match_x_(match_x), + label_(label), + ignore_index_(ignore_index), + feature_size_(feature_size) {} + + HOSTDEVICE void operator()(int64_t idx) const { + auto label = label_[idx]; + if (label != ignore_index_) { + auto match_x = x_[idx * feature_size_ + label]; + y_[idx] = -math::TolerableValue()(real_log(match_x)); + match_x_[idx] = match_x; + } else { + y_[idx] = 0; + match_x_[idx] = 0; // any value is ok + } + } + + const T* x_; + T* y_; + T* match_x_; + const int64_t* label_; + int64_t ignore_index_; + int64_t feature_size_; +}; + template struct HardLabelCrossEntropyBackwardFunctor { - HardLabelCrossEntropyBackwardFunctor(T* dx, const T* y, const T* dy, + HardLabelCrossEntropyBackwardFunctor(T* dx, const T* dy, const T* match_x, const int64_t* label, int64_t ignore_index, int64_t feature_size) : dx_(dx), - y_(y), dy_(dy), + match_x_(match_x), label_(label), ignore_index_(ignore_index), feature_size_(feature_size) {} @@ -156,15 +189,15 @@ struct HardLabelCrossEntropyBackwardFunctor { auto col_idx = idx % feature_size_; auto label = label_[row_idx]; if (label == col_idx && label != ignore_index_) { - dx_[idx] = -dy_[row_idx] * real_exp(y_[row_idx]); + dx_[idx] = -dy_[row_idx] / match_x_[row_idx]; } else { dx_[idx] = 0; } } T* dx_; - const T* y_; const T* dy_; + const T* match_x_; const int64_t* label_; int64_t ignore_index_; int64_t feature_size_; @@ -174,20 +207,26 @@ template class CrossEntropyOpKernel2 : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x_original = ctx.Input("X"); - int rank = x_original->dims().size(); - - auto x = framework::ReshapeToMatrix(*x_original, rank - 1); - auto label = - framework::ReshapeToMatrix(*ctx.Input("Label"), rank - 1); + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); auto* y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); + auto* match_x = ctx.Output("MatchX"); + + auto& x_dims = x->dims(); + auto feature_size = x_dims[x_dims.size() - 1]; + auto batch_size = framework::product(x->dims()) / feature_size; + + auto* p_x = x->data(); + auto* p_label = label->data(); + auto* p_y = y->mutable_data(ctx.GetPlace()); + auto* p_match_x = match_x->mutable_data(ctx.GetPlace()); auto ignore_index = ctx.Attr("ignore_index"); - math::CrossEntropyFunctor()( - ctx.template device_context(), y, &x, &label, false, - ignore_index); + platform::ForRange for_range( + ctx.template device_context(), batch_size); + for_range(HardLabelCrossEntropyForwardFunctor( + p_x, p_y, p_match_x, p_label, ignore_index, feature_size)); } }; @@ -196,13 +235,13 @@ class CrossEntropyGradientOpKernel2 : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* dx = ctx.Output(framework::GradVarName("X")); - auto* y = ctx.Input("Y"); auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* match_x = ctx.Input("MatchX"); auto* label = ctx.Input("Label"); auto* p_dx = dx->mutable_data(ctx.GetPlace()); - auto* p_y = y->data(); auto* p_dy = dy->data(); + auto* p_match_x = match_x->data(); auto* p_label = label->data(); int64_t ignore_index = ctx.Attr("ignore_index"); @@ -214,7 +253,7 @@ class CrossEntropyGradientOpKernel2 : public framework::OpKernel { ctx.template device_context(), batch_size * feature_size); for_range(HardLabelCrossEntropyBackwardFunctor( - p_dx, p_y, p_dy, p_label, ignore_index, feature_size)); + p_dx, p_dy, p_match_x, p_label, ignore_index, feature_size)); } }; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c87837e69424335ac926bf05664e5f79940390b5..94a2016aa53212c3ae5af6d86cccb117855cc3b4 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -33,6 +33,7 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) +detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) if(WITH_GPU) diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 0a51d50e06176e713922837861f2102c9ee8a899..de3612677440596387f313e1ff59184cb3fdb7ae 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -60,14 +60,15 @@ class BoxCoderOp : public framework::OperatorWithKernel { } else if (code_type == BoxCodeType::kDecodeCenterSize) { PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, "The rank of Input TargetBox must be 3"); - if (axis == 0) { - PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); - } else if (axis == 1) { - PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); - } else { - PADDLE_THROW("axis must be 0 or 1."); + PADDLE_ENFORCE(axis == 0 || axis == 1, "axis must be 0 or 1"); + if (ctx->IsRuntime()) { + if (axis == 0) { + PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); + } else if (axis == 1) { + PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); + } + PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); } - PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); ctx->ShareDim("TargetBox", /*->*/ "OutputBox"); } diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0d7e25d944cf2321799da4c73de9f74d9fd287d --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolo_box_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class YoloBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ImgSize"), + "Input(ImgSize) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Boxes"), + "Output(Boxes) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Scores"), + "Output(Scores) of YoloBoxOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_imgsize = ctx->GetInputDim("ImgSize"); + auto anchors = ctx->Attrs().Get>("anchors"); + int anchor_num = anchors.size() / 2; + auto class_num = ctx->Attrs().Get("class_num"); + + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, + "Input(ImgSize) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + dim_imgsize[0], dim_x[0], + "Input(ImgSize) dim[0] and Input(X) dim[0] should be same."); + PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2."); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater than 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater than 0."); + + int box_num = dim_x[2] * dim_x[3] * anchor_num; + std::vector dim_boxes({dim_x[0], box_num, 4}); + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes)); + + std::vector dim_scores({dim_x[0], box_num, class_num}); + ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of YoloBox operator is a 4-D tensor with " + "shape of [N, C, H, W]. The second dimension(C) stores " + "box locations, confidence score and classification one-hot " + "keys of each anchor box. Generally, X should be the output " + "of YOLOv3 network."); + AddInput("ImgSize", + "The image size tensor of YoloBox operator, " + "This is a 2-D tensor with shape of [N, 2]. This tensor holds " + "height and width of each input image used for resizing output " + "box in input image scale."); + AddOutput("Boxes", + "The output tensor of detection boxes of YoloBox operator, " + "This is a 3-D tensor with shape of [N, M, 4], N is the " + "batch num, M is output box number, and the 3rd dimension " + "stores [xmin, ymin, xmax, ymax] coordinates of boxes."); + AddOutput("Scores", + "The output tensor of detection boxes scores of YoloBox " + "operator, This is a 3-D tensor with shape of " + "[N, M, :attr:`class_num`], N is the batch num, M is " + "output box number."); + + AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair.") + .SetDefault(std::vector{}); + AddAttr("downsample_ratio", + "The downsample ratio from network input to YoloBox operator " + "input, so 32, 16, 8 should be set for the first, second, " + "and thrid YoloBox operators.") + .SetDefault(32); + AddAttr("conf_thresh", + "The confidence scores threshold of detection boxes. " + "Boxes with confidence scores under threshold should " + "be ignored.") + .SetDefault(0.01); + AddComment(R"DOC( + This operator generates YOLO detection boxes from output of YOLOv3 network. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, H and W specify the grid size, each grid point predict + given number boxes, this given number, which following will be represented as S, + is specified by the number of anchors. In the second dimension(the channel + dimension), C should be equal to S * (5 + class_num), class_num is the object + category number of source dataset(such as 80 in coco dataset), so the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor + box. + + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + predictions should be as follows: + + $$ + b_x = \\sigma(t_x) + c_x + $$ + $$ + b_y = \\sigma(t_y) + c_y + $$ + $$ + b_w = p_w e^{t_w} + $$ + $$ + b_h = p_h e^{t_h} + $$ + + in the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. + + The logistic regression value of the 5th channel of each anchor prediction boxes + represents the confidence score of each prediction box, and the logistic + regression value of the last :attr:`class_num` channels of each anchor prediction + boxes represents the classifcation scores. Boxes with confidence scores less than + :attr:`conf_thresh` should be ignored, and box final scores is the product of + confidence scores and classification scores. + + $$ + score_{pred} = score_{conf} * score_{class} + $$ + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, + ops::YoloBoxKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a882958e66a79507e053a96b15be8cbbcc83164 --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -0,0 +1,120 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolo_box_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, + T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + T box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + T conf = sigmoid(input[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + GetYoloBox(box, input, anchors, l, k, j, h, input_size, box_idx, + grid_num, img_height, img_width); + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, box, box_idx, img_height, img_width); + + int label_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template +class YoloBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* img_size = ctx.Input("ImgSize"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float conf_thresh = ctx.Attr("conf_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = sizeof(int) * anchors.size(); + auto anchors_ptr = allocator.Allocate(sizeof(int) * anchors.size()); + int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + const auto cplace = platform::CPUPlace(); + memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, + dev_ctx.stream()); + + const T* input_data = input->data(); + const int* imgsize_data = img_size->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(dev_ctx, boxes, static_cast(0)); + set_zero(dev_ctx, scores, static_cast(0)); + + int grid_dim = (n * box_num + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 8 : grid_dim; + + KeYoloBoxFw<<>>( + input_data, imgsize_data, boxes_data, scores_data, conf_thresh, + anchors_data, n, h, w, an_num, class_num, box_num, input_size); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, + ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8b7c7df0f3cf754f59c994dbe5b1cc2ac5fb773b --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +HOSTDEVICE inline T sigmoid(T x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +template +HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride, + int img_height, int img_width) { + box[0] = (i + sigmoid(x[index])) * img_width / grid_size; + box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; + box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + input_size; + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * + img_height / input_size; +} + +HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, + const int img_height, + const int img_width) { + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; + + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : static_cast(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? boxes[box_idx + 3] + : static_cast(img_height - 1); +} + +template +HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const T conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); + } +} + +template +class YoloBoxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* imgsize = ctx.Input("ImgSize"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float conf_thresh = ctx.Attr("conf_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + Tensor anchors_; + auto anchors_data = + anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); + std::copy(anchors.begin(), anchors.end(), anchors_data); + + const T* input_data = input->data(); + const int* imgsize_data = imgsize->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); + memset(boxes_data, 0, boxes->numel() * sizeof(T)); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); + memset(scores_data, 0, scores->numel() * sizeof(T)); + + T box[4]; + for (int i = 0; i < n; i++) { + int img_height = imgsize_data[2 * i]; + int img_width = imgsize_data[2 * i + 1]; + + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); + T conf = sigmoid(input_data[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); + GetYoloBox(box, input_data, anchors_data, l, k, j, h, input_size, + box_idx, stride, img_height, img_width); + box_idx = (i * box_num + j * stride + k * w + l) * 4; + CalcDetectionBox(boxes_data, box, box_idx, img_height, + img_width); + + int label_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); + int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + CalcLabelScore(scores_data, input_data, label_idx, score_idx, + class_num, conf, stride); + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index ab01bdf7ca8c5a369bd8838b1acc734364666992..6c37da17f4011d38efcdc5406331f1be173dd0dd 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -10,6 +10,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/yolov3_loss_op.h" +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -72,6 +73,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); + if (ctx->HasInput("GTScore")) { + auto dim_gtscore = ctx->GetInputDim("GTScore"); + PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2, + "Input(GTScore) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ( + dim_gtscore[0], dim_gtbox[0], + "Input(GTBox) and Input(GTScore) dim[0] should be same"); + PADDLE_ENFORCE_EQ( + dim_gtscore[1], dim_gtbox[1], + "Input(GTBox) and Input(GTScore) dim[1] should be same"); + } + std::vector dim_out({dim_x[0]}); ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); @@ -112,6 +125,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "This is a 2-D tensor with shape of [N, max_box_num], " "and each element should be an integer to indicate the " "box class id."); + AddInput("GTScore", + "The score of GTLabel, This is a 2-D tensor in same shape " + "GTLabel, and score values should in range (0, 1). This " + "input is for GTLabel score can be not 1.0 in image mixup " + "augmentation.") + .AsDispensable(); AddOutput("Loss", "The output yolov3 loss tensor, " "This is a 1-D tensor with shape of [N]"); @@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss.") .SetDefault(0.7); + AddAttr("use_label_smooth", + "Whether to use label smooth. Default True.") + .SetDefault(true); AddComment(R"DOC( This operator generates yolov3 loss based on given predict result and ground truth boxes. @@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { loss = (loss_{xy} + loss_{wh}) * weight_{box} + loss_{conf} + loss_{class} $$ + + While :attr:`use_label_smooth` is set to be :attr:`True`, the classification + target will be smoothed when calculating classification loss, target of + positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of + negetive samples will be smoothed to :math:`1.0 / class\_num`. + + While :attr:`GTScore` is given, which means the mixup score of ground truth + boxes, all losses incured by a ground truth box will be multiplied by its + mixup score. )DOC"); } }; @@ -240,6 +271,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("X", Input("X")); op->SetInput("GTBox", Input("GTBox")); op->SetInput("GTLabel", Input("GTLabel")); + op->SetInput("GTScore", Input("GTScore")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); op->SetInput("ObjectnessMask", Output("ObjectnessMask")); op->SetInput("GTMatchMask", Output("GTMatchMask")); @@ -249,6 +281,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("GTBox"), {}); op->SetOutput(framework::GradVarName("GTLabel"), {}); + op->SetOutput(framework::GradVarName("GTScore"), {}); return std::unique_ptr(op); } }; diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h index 8407d4e6e8f87a2e8d073c4fbda5691abe1bba68..a004b022b75174012d10ba38e5ec161830c62640 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.h +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) { } template -static T L2Loss(T x, T y) { - return 0.5 * (y - x) * (y - x); +static T L1Loss(T x, T y) { + return std::abs(y - x); } template @@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) { } template -static T L2LossGrad(T x, T y) { - return x - y; +static T L1LossGrad(T x, T y) { + return x > y ? 1.0 : -1.0; } static int GetMaskIndex(std::vector mask, int val) { @@ -121,47 +121,49 @@ template static void CalcBoxLocationLoss(T* loss, const T* input, Box gt, std::vector anchors, int an_idx, int box_idx, int gi, int gj, int grid_size, - int input_size, int stride) { + int input_size, int stride, T score) { T tx = gt.x * grid_size - gi; T ty = gt.y * grid_size - gj; T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); - T scale = (2.0 - gt.w * gt.h); + T scale = (2.0 - gt.w * gt.h) * score; loss[0] += SigmoidCrossEntropy(input[box_idx], tx) * scale; loss[0] += SigmoidCrossEntropy(input[box_idx + stride], ty) * scale; - loss[0] += L2Loss(input[box_idx + 2 * stride], tw) * scale; - loss[0] += L2Loss(input[box_idx + 3 * stride], th) * scale; + loss[0] += L1Loss(input[box_idx + 2 * stride], tw) * scale; + loss[0] += L1Loss(input[box_idx + 3 * stride], th) * scale; } template static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input, Box gt, std::vector anchors, int an_idx, int box_idx, int gi, int gj, - int grid_size, int input_size, int stride) { + int grid_size, int input_size, int stride, + T score) { T tx = gt.x * grid_size - gi; T ty = gt.y * grid_size - gj; T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); - T scale = (2.0 - gt.w * gt.h); + T scale = (2.0 - gt.w * gt.h) * score; input_grad[box_idx] = SigmoidCrossEntropyGrad(input[box_idx], tx) * scale * loss; input_grad[box_idx + stride] = SigmoidCrossEntropyGrad(input[box_idx + stride], ty) * scale * loss; input_grad[box_idx + 2 * stride] = - L2LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; + L1LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; input_grad[box_idx + 3 * stride] = - L2LossGrad(input[box_idx + 3 * stride], th) * scale * loss; + L1LossGrad(input[box_idx + 3 * stride], th) * scale * loss; } template static inline void CalcLabelLoss(T* loss, const T* input, const int index, const int label, const int class_num, - const int stride) { + const int stride, const T pos, const T neg, + T score) { for (int i = 0; i < class_num; i++) { T pred = input[index + i * stride]; - loss[0] += SigmoidCrossEntropy(pred, (i == label) ? 1.0 : 0.0); + loss[0] += SigmoidCrossEntropy(pred, (i == label) ? pos : neg) * score; } } @@ -169,11 +171,13 @@ template static inline void CalcLabelLossGrad(T* input_grad, const T loss, const T* input, const int index, const int label, const int class_num, - const int stride) { + const int stride, const T pos, const T neg, + T score) { for (int i = 0; i < class_num; i++) { T pred = input[index + i * stride]; input_grad[index + i * stride] = - SigmoidCrossEntropyGrad(pred, (i == label) ? 1.0 : 0.0) * loss; + SigmoidCrossEntropyGrad(pred, (i == label) ? pos : neg) * score * + loss; } } @@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness, for (int l = 0; l < w; l++) { T obj = objness[k * w + l]; if (obj > 1e-5) { - // positive sample: obj = 1 - loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0); + // positive sample: obj = mixup score + loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0) * obj; } else if (obj > -0.5) { // negetive sample: obj = 0 loss[i] += SigmoidCrossEntropy(input[k * w + l], 0.0); @@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss, T obj = objness[k * w + l]; if (obj > 1e-5) { input_grad[k * w + l] = - SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * loss[i]; + SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * obj * + loss[i]; } else if (obj > -0.5) { input_grad[k * w + l] = SigmoidCrossEntropyGrad(input[k * w + l], 0.0) * loss[i]; @@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* gt_box = ctx.Input("GTBox"); auto* gt_label = ctx.Input("GTLabel"); + auto* gt_score = ctx.Input("GTScore"); auto* loss = ctx.Output("Loss"); auto* objness_mask = ctx.Output("ObjectnessMask"); auto* gt_match_mask = ctx.Output("GTMatchMask"); @@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel { int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); + bool use_label_smooth = ctx.Attr("use_label_smooth"); const int n = input->dims()[0]; const int h = input->dims()[2]; @@ -272,6 +279,13 @@ class Yolov3LossKernel : public framework::OpKernel { const int stride = h * w; const int an_stride = (class_num + 5) * stride; + T label_pos = 1.0; + T label_neg = 0.0; + if (use_label_smooth) { + label_pos = 1.0 - 1.0 / static_cast(class_num); + label_neg = 1.0 / static_cast(class_num); + } + const T* input_data = input->data(); const T* gt_box_data = gt_box->data(); const int* gt_label_data = gt_label->data(); @@ -283,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel { int* gt_match_mask_data = gt_match_mask->mutable_data({n, b}, ctx.GetPlace()); + const T* gt_score_data; + if (!gt_score) { + Tensor gtscore; + gtscore.mutable_data({n, b}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), >score, + static_cast(1.0)); + gt_score = >score; + gt_score_data = gtscore.data(); + } else { + gt_score_data = gt_score->data(); + } + // calc valid gt box mask, avoid calc duplicately in following code Tensor gt_valid_mask; bool* gt_valid_mask_data = @@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel { int mask_idx = GetMaskIndex(anchor_mask, best_n); gt_match_mask_data[i * b + t] = mask_idx; if (mask_idx >= 0) { + T score = gt_score_data[i * b + t]; int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 0); CalcBoxLocationLoss(loss_data + i, input_data, gt, anchors, best_n, - box_idx, gi, gj, h, input_size, stride); + box_idx, gi, gj, h, input_size, stride, score); int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; - obj_mask_data[obj_idx] = 1.0; + obj_mask_data[obj_idx] = score; int label = gt_label_data[i * b + t]; int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 5); CalcLabelLoss(loss_data + i, input_data, label_idx, label, - class_num, stride); + class_num, stride, label_pos, label_neg, score); } } } @@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* gt_box = ctx.Input("GTBox"); auto* gt_label = ctx.Input("GTLabel"); + auto* gt_score = ctx.Input("GTScore"); auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); auto* objness_mask = ctx.Input("ObjectnessMask"); @@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { auto anchor_mask = ctx.Attr>("anchor_mask"); int class_num = ctx.Attr("class_num"); int downsample_ratio = ctx.Attr("downsample_ratio"); + bool use_label_smooth = ctx.Attr("use_label_smooth"); const int n = input_grad->dims()[0]; const int c = input_grad->dims()[1]; @@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel { const int stride = h * w; const int an_stride = (class_num + 5) * stride; + T label_pos = 1.0; + T label_neg = 0.0; + if (use_label_smooth) { + label_pos = 1.0 - 1.0 / static_cast(class_num); + label_neg = 1.0 / static_cast(class_num); + } + const T* input_data = input->data(); const T* gt_box_data = gt_box->data(); const int* gt_label_data = gt_label->data(); @@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel { input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); + const T* gt_score_data; + if (!gt_score) { + Tensor gtscore; + gtscore.mutable_data({n, b}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), >score, + static_cast(1.0)); + gt_score = >score; + gt_score_data = gtscore.data(); + } else { + gt_score_data = gt_score->data(); + } + for (int i = 0; i < n; i++) { for (int t = 0; t < b; t++) { int mask_idx = gt_match_mask_data[i * b + t]; if (mask_idx >= 0) { + T score = gt_score_data[i * b + t]; Box gt = GetGtBox(gt_box_data, i, b, t); int gi = static_cast(gt.x * w); int gj = static_cast(gt.y * h); int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 0); - CalcBoxLocationLossGrad( - input_grad_data, loss_grad_data[i], input_data, gt, anchors, - anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride); + CalcBoxLocationLossGrad(input_grad_data, loss_grad_data[i], + input_data, gt, anchors, + anchor_mask[mask_idx], box_idx, gi, gj, h, + input_size, stride, score); int label = gt_label_data[i * b + t]; int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 5); CalcLabelLossGrad(input_grad_data, loss_grad_data[i], input_data, - label_idx, label, class_num, stride); + label_idx, label, class_num, stride, label_pos, + label_neg, score); } } } diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 70186e5efa29b1324ff7f3954720276156fddaf1..d51d51b4953073e9a350806f041bb3112fad239c 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -81,6 +81,30 @@ struct FindRangeAbsMaxFunctor { template struct FindRangeAbsMaxFunctor; +template +struct FindMovingAverageAbsMaxFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& in_accum, + const framework::Tensor& in_state, const T* cur_scale, + const float rate, framework::Tensor* out_state, + framework::Tensor* out_accum, framework::Tensor* out_scale) { + T accum = in_accum.data()[0]; + T state = in_state.data()[0]; + T scale = cur_scale[0]; + + state = rate * state + 1; + accum = rate * accum + scale; + scale = accum / state; + + out_state->mutable_data(ctx.GetPlace())[0] = state; + out_accum->mutable_data(ctx.GetPlace())[0] = accum; + out_scale->mutable_data(ctx.GetPlace())[0] = scale; + } +}; + +template struct FindMovingAverageAbsMaxFunctor; + class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantizeAbsMaxOp(const std::string& type, @@ -255,6 +279,78 @@ $$Out = round(X/scale * range)$$ } }; +class FakeQuantizeMovingAverageAbsMaxOp : public framework::OperatorWithKernel { + public: + FakeQuantizeMovingAverageAbsMaxOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of FakeQuantizeMovingAverageAbsMaxOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeQuantizeMovingAverageAbsMaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutScale"), + "Output(OutScale) of FakeQuantizeMovingAverageAbsMaxOp " + "should not be null"); + if (ctx->HasOutput("OutState")) { + ctx->SetOutputDim("OutState", {1}); + } + if (ctx->HasOutput("OutAccum")) { + ctx->SetOutputDim("OutAccum", {1}); + } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class FakeQuantizeMovingAverageAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddInput("InScale", "Last scale."); + AddInput("InAccum", "Last accum.").AsDispensable(); + AddInput("InState", "Last state.").AsDispensable(); + AddOutput("Out", "(Tensor) Output of quantized low level tensor."); + AddOutput("OutScale", " Current scale"); + AddOutput("OutState", "(Tensor) state buffer.").AsDispensable(); + AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable(); + AddAttr("moving_rate", "(float, default 0.9) moving rate.") + .SetDefault(0.9); + AddAttr("bit_length", "(int, default 8), quantization bit number.") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddComment(R"DOC( +FakeQuantize operator is used in static quantization. + +$$scale = (0.9*max(abs(x))+accum)/(0.9*state+1)$$ +$$range = 2^{bit_length - 1} - 1$$ +$$Out = round(X/scale * range)$$ + +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -273,6 +369,12 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); +REGISTER_OPERATOR(fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxOp, + ops::FakeQuantizeMovingAverageAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxKernel); REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxOp, ops::FakeChannelWiseQuantizeAbsMaxOpMaker, diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 5da16a7c7314c62034bff67bcc8d099e2799c3de..3707f6772eac0d568c170d60c17d431e254d0b6b 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -147,6 +147,41 @@ struct FindRangeAbsMaxFunctor { template struct FindRangeAbsMaxFunctor; +template +struct FindMovingAverageAbsMaxFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& in_accum, + const framework::Tensor& in_state, const T* cur_scale, + const float rate, framework::Tensor* out_state, + framework::Tensor* out_accum, framework::Tensor* out_scale) { + const auto gpu_place = boost::get(ctx.GetPlace()); + + T accum; + memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), + sizeof(T), 0); + T state; + memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), + sizeof(T), 0); + T scale; + memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), + 0); + + state = rate * state + 1; + accum = rate * accum + scale; + scale = accum / state; + + memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), + platform::CPUPlace(), &accum, sizeof(T), 0); + memory::Copy(gpu_place, out_state->mutable_data(gpu_place), + platform::CPUPlace(), &state, sizeof(T), 0); + memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), + platform::CPUPlace(), &scale, sizeof(T), 0); + } +}; + +template struct FindMovingAverageAbsMaxFunctor; + template struct ClipAndFakeQuantFunctor { void operator()(const platform::CUDADeviceContext& ctx, @@ -178,3 +213,6 @@ REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL( + fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 8b47600e7d99ad9e4e40ae162582d4c8461224ad..ec667e89e7699d87db9423f17014a2761ce62763 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -42,12 +42,20 @@ struct FindRangeAbsMaxFunctor { framework::Tensor* scales_arr, framework::Tensor* out_scale); }; +template +struct FindMovingAverageAbsMaxFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum, + const framework::Tensor& in_state, + const framework::Tensor& cur_scale, + framework::Tensor* out_state, framework::Tensor* out_accum, + framework::Tensor* out_scale); +}; + template class FakeQuantizeAbsMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); - auto* out = context.Output("Out"); auto* out_scale = context.Output("OutScale"); T* out_s = out_scale->mutable_data(context.GetPlace()); @@ -138,5 +146,54 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { } }; +template +class FakeQuantizeMovingAverageAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* in_scale = context.Input("InScale"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + bool is_test = context.Attr("is_test"); + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + auto& dev_ctx = context.template device_context(); + + // testing + if (is_test) { + ClipAndFakeQuantFunctor()(dev_ctx, *in, *in_scale, + bin_cnt, out); + return; + } + + // training + auto* in_accum = context.Input("InAccum"); + auto* in_state = context.Input("InState"); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cur_scale = allocator.Allocate(1 * sizeof(T)); + T* cur_scale_data = static_cast(cur_scale->ptr()); + + FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), + cur_scale_data); + + auto* out_state = context.Output("OutState"); + auto* out_accum = context.Output("OutAccum"); + auto* out_scale = context.Output("OutScale"); + out_state->mutable_data(context.GetPlace()); + out_accum->mutable_data(context.GetPlace()); + out_scale->mutable_data(context.GetPlace()); + float moving_rate = context.Attr("moving_rate"); + + FindMovingAverageAbsMaxFunctor()( + dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state, + out_accum, out_scale); + + ClipAndFakeQuantFunctor()(dev_ctx, *in, *out_scale, + bin_cnt, out); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index eb4617a9359353820fc41b9ad1c8db5327fdacde..242f5390b806756283686dae2e2c32b93c2bd71e 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -55,17 +55,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { "The input tensor Input's rank of FCOp should be larger than " "in_num_col_dims."); - auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); - PADDLE_ENFORCE_EQ( - in_mat_dims[1], w_dims[0], - "Fully Connected input and weigth size do not match. %s, %s"); - std::vector output_dims; - output_dims.reserve(static_cast(in_num_col_dims + 1)); - for (int i = 0; i < in_num_col_dims; ++i) { - output_dims.push_back(in_dims[i]); - } - output_dims.push_back(w_dims[1]); + FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims); ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->ShareLoD("Input", "Out"); @@ -128,6 +119,9 @@ void FCOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") + .SetDefault(true); AddComment(R"DOC( Fully Connected Operator. @@ -142,13 +136,20 @@ class FCOpKernel : public framework::OpKernel { void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto input = ctx.Input("Input"); + auto input = ctx.Input("Input"); auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); - auto output = ctx.Output("Out"); + auto output = ctx.Output("Out"); + int in_num_col_dims = ctx.Attr("in_num_col_dims"); auto w_dims = w->dims(); + + std::vector output_dims; + FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims); + output->Resize(framework::make_ddim(output_dims)); + output->set_lod(input->lod()); + auto out_dims = output->dims(); - int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; + int M = framework::product(out_dims) / w_dims[1]; const T* input_data = input->data(); const T* w_data = w->data(); diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index e1b780fc0c401fbf34a9db03aa31137cbc016939..b82a63cd830b569c4541bbaffb5affb75394773a 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -48,5 +48,21 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; +inline void FCOutputSize(const framework::DDim& in_dims, + const framework::DDim& w_dims, + std::vector& out_dims, // NOLINT + int in_num_col_dims) { + auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); + PADDLE_ENFORCE_EQ( + in_mat_dims[1], w_dims[0], + "Fully Connected input and weigth size do not match. %s, %s"); + + out_dims.reserve(static_cast(in_num_col_dims + 1)); + for (int i = 0; i < in_num_col_dims; ++i) { + out_dims.push_back(in_dims[i]); + } + out_dims.push_back(w_dims[1]); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index a0026427e2514735711f7eba26fcf861cb498d5e..ecb89184990b6f6295a104b2adc96875b5d15fd6 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -88,7 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, "") + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") .SetDefault(true); AddComment(R"DOC( FusedEmbeddingSeqPool Operator. diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index f6395fb32feac175976cb96e1c0bee7347cb3ea8..82222d0a7e739e15a779541c14576ce2de24a3d2 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -54,7 +54,8 @@ $$Out = scale * X$$ )DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, "") + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") .SetDefault(true); } }; diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 14ca3e8073b9512732876e512a30968b15884495..8d96ae7e4215c2488564322e1dda46a81b46a665 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -592,6 +592,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::SetDstMemoryHandler(ctx, output, handler, &dst_memory_p); } else { + need_s8_to_u8 = fuse_relu; platform::SetDstMemoryHandler(ctx, output, handler, &dst_memory_p); } diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 3a926a716f54a094eba11d63c3b29de27dff274b..69c0486eb63475d759b6869f55d14ef1bec08b59 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -123,7 +123,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto input = ctx.Input("Input"); + auto input = ctx.Input("Input"); auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); @@ -151,7 +151,13 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); const T* w_data = w->data(); - auto output = ctx.Output("Out"); + auto output = ctx.Output("Out"); + int in_num_col_dims = ctx.Attr("in_num_col_dims"); + std::vector output_dims; + FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims); + output->Resize(framework::make_ddim(output_dims)); + output->set_lod(input->lod()); + T* output_data = output->mutable_data(ctx.GetPlace()); auto dst_memory = mem.dst(output_data); @@ -204,19 +210,21 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* w_grad = ctx.Output(framework::GradVarName("W")); + const Tensor* input = ctx.Input("Input"); + const T* input_data = input->data(); + + const Tensor* w = ctx.Input("W"); + const T* w_data = w->data(); + if (input_grad) { + input_grad->Resize(input->dims()); input_grad_data = input_grad->mutable_data(ctx.GetPlace()); } if (w_grad) { + w_grad->Resize(w->dims()); w_grad_data = w_grad->mutable_data(ctx.GetPlace()); } - const Tensor* input = ctx.Input("Input"); - const T* input_data = input->data(); - - const Tensor* w = ctx.Input("W"); - const T* w_data = w->data(); - const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index e41bfb80dfc0452955f7978f74ccfea184886b69..4debc7ca5ec90d6cc781d10e817e9ed8650f12aa 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -73,6 +73,29 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { } }; +template +class TransposeINT8MKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + std::vector axis = ctx.Attr>("axis"); + std::vector axis_int8 = {0, 2, 3, 1}; + if (axis.size() != 1) { + PADDLE_ENFORCE_EQ(axis.size(), axis_int8.size()); + for (size_t i = 0; i < axis.size(); i++) { + PADDLE_ENFORCE_EQ(axis[i], axis_int8[i], + "Current INT8 MKLDNN Transpose kernel only surpport " + "axis with [0, 2, 3, 1] due to MKL-DNN kernel " + "implementation."); + } + } + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->ShareDataWith(*input); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(input->format()); + } +}; + template class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: @@ -140,7 +163,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, - ops::TransposeMKLDNNOpKernel); + ops::TransposeMKLDNNOpKernel, + ops::TransposeINT8MKLDNNOpKernel, + ops::TransposeINT8MKLDNNOpKernel); + REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 09255f60e6953734680cc9b008504fabc5589cf0..6262ef0c2d3802bca574ba1312e7cf4a720403ef 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include // for sqrt in CPU and CUDA #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" @@ -311,17 +312,17 @@ struct SparseAdamFunctor { T beta1_pow = *beta1_pow_; T beta2_pow = *beta2_pow_; lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - size_t row_count = numel / row_numel_; + int64_t row_count = static_cast(numel / row_numel_); - for (size_t i = 0U, j = 0U; i != row_count; ++i) { + for (int64_t i = 0, j = 0; i != row_count; ++i) { if (i == *(rows_ + j)) { - for (size_t k = 0U; k != row_numel_; ++k) { + for (int64_t k = 0; k != row_numel_; ++k) { T g = grad_[j * row_numel_ + k]; adam_update(i * row_numel_ + k, g); } ++j; } else { - for (size_t k = 0U; k != row_numel_; ++k) { + for (int64_t k = 0; k != row_numel_; ++k) { T mom1 = moment1_[i * row_numel_ + k]; T mom2 = moment2_[i * row_numel_ + k]; T p = param_[i * row_numel_ + k]; @@ -427,43 +428,23 @@ class AdamOpKernel : public framework::OpKernel { } } - framework::SelectedRows cpu_grad_merge; + framework::SelectedRows tmp_grad_merge; const framework::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor - framework::SelectedRows* grad_merge_var; scatter::MergeAdd merge_func; - if (platform::is_cpu_place(ctx.GetPlace())) { - grad_merge_var = &cpu_grad_merge; - } else { - // FIXME(qiao): GPU also need to fix this - grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); - } merge_func(ctx.template device_context(), grad, - grad_merge_var, true); - grad_merge_ptr = grad_merge_var; + &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; } auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = nullptr; -// When compiled without CUDA, the CUDAData() interface should not be -// provided. -#if defined(PADDLE_WITH_CUDA) - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = grad_merge.rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = grad_merge.rows().data(); -#if defined(PADDLE_WITH_CUDA) - } -#endif + const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (platform::is_cpu_place(ctx.GetPlace())) { @@ -488,7 +469,7 @@ class AdamOpKernel : public framework::OpKernel { } } #ifndef _WIN32 - else if (FLAGS_inner_op_parallelism > 1 && + else if (FLAGS_inner_op_parallelism > 1 && // NOLINT min_row_size_to_use_multithread > 0 && param.dims()[0] > min_row_size_to_use_multithread) { VLOG(3) << "use multi thread, inner_op_parallelism=" @@ -516,11 +497,11 @@ class AdamOpKernel : public framework::OpKernel { for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) { int64_t start = i * line_in_each_thread; int64_t end = (i + 1) * line_in_each_thread; - if (start >= param_row_count) { + if (start >= static_cast(param_row_count)) { break; } - if (end > param_row_count) { - end = param_row_count; + if (end > static_cast(param_row_count)) { + end = static_cast(param_row_count); } fs.push_back( framework::Async([&functor, &row_id_to_grad_row_offset, @@ -545,8 +526,8 @@ class AdamOpKernel : public framework::OpKernel { } for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } -#endif // !_WIN32 - else { +#endif // !_WIN32 + else { // NOLINT functor(param.numel()); } } else if (platform::is_gpu_place(ctx.GetPlace())) { diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 3ed1bff5ff4993e9c858dea8d56a8cb6124aca89..29a2ae6755aa609e4a6ee43bbf11fe02ebfa654e 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -69,6 +70,7 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); @@ -351,23 +353,14 @@ class MomentumOpKernel : public framework::OpKernel { VLOG(3) << "Grad SelectedRows contains no data!"; return; } - auto* merged_grad = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + + framework::SelectedRows tmp_merged_grad; + framework::SelectedRows* merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = nullptr; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = merged_grad->rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = merged_grad->rows().data(); -#ifdef PADDLE_WITH_CUDA - } -#endif + const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); int64_t row_numel = merged_grad->value().numel() / merged_grad->rows().size(); platform::ForRange for_range( diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 389c84d2464090ff9bd9e8b471cd0103c86a347a..4550052b2d614ccbbb09f4a2b9e747708b2a2baa 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -216,24 +216,14 @@ class RmspropOpKernel : public framework::OpKernel { } } else if (grad_var->IsType()) { auto &grad = grad_var->Get(); - auto *merged_grad = const_cast(ctx.scope()) - .Var() - ->GetMutable(); - + framework::SelectedRows tmp_merged_grad; + framework::SelectedRows *merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(dev_ctx, grad, merged_grad); platform::ForRange for_range(dev_ctx, limit); - const int64_t *rows; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = merged_grad->rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = merged_grad->rows().data(); -#ifdef PADDLE_WITH_CUDA - } -#endif + const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace()); + auto &merged_tensor = merged_grad->value(); int64_t row_count = merged_grad->rows().size(); int64_t row_numel = merged_tensor.numel() / row_count; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 0a0ece162cc63696974383d8ed49fdd10204c331..7963c27a0153105b9ab21c7165b5e4daad8346ea 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" #endif @@ -212,6 +213,12 @@ void Pool2dOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. " + "Only used on CPU.") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 37f69426b62fedf8cbeca68105fb86fb4ea72eab..2b429380fbfc007f5936bff96e0924d93abc81f5 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -219,14 +219,6 @@ class ReshapeKernel { std::vector(shape_data, shape_data + shape_tensor->numel()); out_dims = ReshapeOp::ValidateShape(shape, in->dims()); } - if (!in->lod().empty()) { - PADDLE_ENFORCE_EQ( - out_dims[0], in->dims()[0], - "Reshape operator cannot reshape an input sequence batch " - "into an output sequence batch that has a different " - "number of time steps. Please consider using " - "sequence_reshape op."); - } out->mutable_data(ctx.GetPlace(), in->type()); framework::TensorCopy( diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index f357c9c08d042b69259f229955922f2f11b52c63..f5d6060bc365ad5d252127d1e6806ae7f06f93f6 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -59,7 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("pad_value", "(int) The enumerate sequence padding value.") .SetDefault(0); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, "") + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") .SetDefault(true); AddComment(R"DOC( Sequence Enumerate Operator. diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index 5efecb78d1a4eaffc3a9c62e1e82a9bcb5922748..1af57b89a3506a7211cca9233ef624b0a83e77a0 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/slice_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void Padding(const paddle::platform::float16* d_out, + const int* out_dims, const int* in_dims, + const int* offsets, int64_t n, + paddle::platform::float16* d_in) { + int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x; + if (out_idx < n) { + int coords[D] = {0}; + for (int i = D - 1; i >= 0; --i) { + coords[i] = out_idx % out_dims[i]; + out_idx /= out_dims[i]; + coords[i] += offsets[i]; + } + + int64_t in_idx = 0; + for (int i = 0; i < D - 1; ++i) { + in_idx += coords[i] * in_dims[i + 1]; + } + in_idx += coords[D - 1]; + + d_in[in_idx] = d_out[out_idx]; + } +} + +template <> +class SliceGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_in = ctx.Output(framework::GradVarName("Input")); + d_in->mutable_data(ctx.GetPlace()); + + auto out_dims = d_out->dims(); + auto in_dims = d_in->dims(); + int rank = out_dims.size(); + std::vector offsets(rank, 0); + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + + for (size_t i = 0; i < starts.size(); ++i) { + if (starts[i] < 0) { + starts[i] += in_dims[axes[i]]; + } + offsets[axes[i]] = std::max(starts[i], 0); + } + + math::SetConstant + set_zero; + auto& dev_ctx = + ctx.template device_context(); + set_zero(dev_ctx, d_in, static_cast(0)); + + int64_t numel = d_out->numel(); + dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1); + dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1); + auto stream = ctx.cuda_device_context().stream(); + + auto out_shape = framework::vectorize2int(out_dims); + thrust::device_vector out_dims_vec(out_shape.begin(), out_shape.end()); + auto in_shape = framework::vectorize2int(in_dims); + thrust::device_vector in_dims_vec(in_shape.begin(), in_shape.end()); + thrust::device_vector offsets_vec(offsets.begin(), offsets.end()); + const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data()); + const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data()); + const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data()); + + switch (rank) { + case 1: + Padding<1><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 2: + Padding<2><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 3: + Padding<3><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 4: + Padding<4><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 5: + Padding<5><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + case 6: + Padding<6><<>>( + d_out->data(), out_dims_ptr, in_dims_ptr, + offsets_ptr, numel, d_in->data()); + break; + } + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( slice, ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, - ops::SliceKernel); + ops::SliceKernel, + ops::SliceKernel); REGISTER_OP_CUDA_KERNEL( slice_grad, ops::SliceGradKernel, ops::SliceGradKernel, ops::SliceGradKernel, - ops::SliceGradKernel); + ops::SliceGradKernel, + ops::SliceGradKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 52b8dcc681b1f97d5ba03697257509cae1e6b484..89aaac4cbe6399af08b3d340896df7a07e1be543 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss"))->data(); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - logit_grad->ShareDataWith(*context.Input("Softmax")); + framework::TensorCopy(*context.Input("Softmax"), context.GetPlace(), + context.device_context(), logit_grad); T* logit_grad_data = logit_grad->data(); const int batch_size = logit_grad->dims()[0]; diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index e389c6a65e1e8220685294931c4d08e6fd928b7f..ecfb4e89566f3d72b3c262946c370bf34ce7515a 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -94,6 +94,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase { } }; +// TODO(paddle-dev): Should use OpKernel. class SqueezeOp : public framework::OperatorBase { public: using OperatorBase::OperatorBase; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d6cf27fd779eeddc94c1839e46892a99f61bd1bf --- /dev/null +++ b/paddle/fluid/operators/sync_batch_norm_op.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/batch_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker); +REGISTER_OPERATOR(sync_batch_norm_grad, ops::BatchNormGradOp); diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5984bfaaaf96f7a412176bb9868dc44488acf3f --- /dev/null +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -0,0 +1,452 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/nccl_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; + +template +__global__ void KeLocalStats(const T *x, int N, int M, int C, T *mean_var) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int k = blockIdx.x; k < C; k += gridDim.x) { + T x_sum = 0; + T x2_sum = 0; + for (int i = threadIdx.x; i < N * M; i += BlockDim) { + int id = layout == framework::DataLayout::kNCHW + ? (i / M) * C * M + k * M + i % M + : i * C + k; + T x_in = x[id]; + x_sum += x_in; + x2_sum += x_in * x_in; + } + __syncthreads(); + T out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + mean_var[k] = out / (N * M); + } + out = BlockReduce(temp_storage).Reduce(x2_sum, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + mean_var[k + C] = out / (N * M); + } + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + mean_var[2 * C] = static_cast(1.0); + } +} + +template +__global__ void KeSyncAndMovingStats(T *means, T *variances, T *num_dev, + const int C, const T momentum, + const double epsilon, T *sv_mean_data, + T *sv_inv_var_data, T *moving_means, + T *moving_variances) { + // sync stats across multi-devices + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < C; i += stride) { + T mean = means[i] / (*num_dev); + T var = variances[i] / (*num_dev); + var = var - mean * mean; + + // sync stats + sv_mean_data[i] = mean; + sv_inv_var_data[i] = 1.0 / sqrt(var + epsilon); + variances[i] = var; + + // moving stats + moving_means[i] = moving_means[i] * momentum + mean * (1. - momentum); + moving_variances[i] = + moving_variances[i] * momentum + var * (1. - momentum); + } +} + +template +static __global__ void KeNormAffine(const T *x, const T *scale, const T *bias, + const T *mean, const T *variance, + const double epsilon, const int C, + const int M, const int num, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C; + y[i] = (x[i] - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]; + } +} + +template +class SyncBatchNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + double epsilon = static_cast(ctx.Attr("epsilon")); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string layout_str = ctx.Attr("data_layout"); + const DataLayout layout = framework::StringToDataLayout(layout_str); + const bool use_global_stats = ctx.Attr("use_global_stats"); + PADDLE_ENFORCE( + !use_global_stats, + "sync_batch_norm doesn't support to set use_global_stats True. ", + "Please use batch_norm in this case."); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + int x_numel = x->numel(); + + const T *x_d = x->data(); + const T *s_d = ctx.Input("Scale")->data(); + const T *b_d = ctx.Input("Bias")->data(); + + auto *y = ctx.Output("Y"); + T *y_d = y->mutable_data(ctx.GetPlace()); + + const T *mean_data = nullptr; + const T *var_data = nullptr; + + auto &dev_ctx = ctx.cuda_device_context(); + auto stream = dev_ctx.stream(); + auto *comm = dev_ctx.nccl_comm(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + + paddle::memory::AllocationPtr alloc_ptr{nullptr}; + + if (is_test) { + const auto *est_mean = ctx.Input("Mean"); + const auto *est_var = ctx.Input("Variance"); + mean_data = est_mean->data(); + var_data = est_var->data(); + } else { + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + // x, x^2, 1, here 1 is used to calc device num + // device num also can be got from platform::DeviceContextPool + const int bytes = (C * 2 + 1) * sizeof(T); + alloc_ptr = allocator.Allocate(bytes); + + T *stats = reinterpret_cast(alloc_ptr->ptr()); + const int threads = 256; + int grid = std::min(C, (max_threads + threads - 1) / threads); + if (layout == framework::DataLayout::kNCHW) { + KeLocalStats< + T, threads, + framework::DataLayout::kNCHW><<>>( + x_d, N, H * W * D, C, stats); + } else { + KeLocalStats< + T, threads, + framework::DataLayout::kNHWC><<>>( + x_d, N, H * W * D, C, stats); + } + + Tensor c_g_st; + T *c_g_st_d = c_g_st.mutable_data({2 * C + 1}, platform::CPUPlace()); + auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0); + + int dtype = platform::ToNCCLDataType(x->type()); + // In-place operation + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, + comm, stream)); + + // moving mean/variance + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + T *est_mean_data = mean_out->mutable_data(ctx.GetPlace()); + T *est_var_data = variance_out->mutable_data(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_inv_variance = ctx.Output("SavedVariance"); + T *sv_mean_data = saved_mean->mutable_data(ctx.GetPlace()); + T *sv_inv_var_data = saved_inv_variance->mutable_data(ctx.GetPlace()); + + // Note, Input('Mean')/Input('Variance') share variable with + // Output('MeanOut')/Output('VarianceOut') + KeSyncAndMovingStats<<<(C + block - 1) / block, block, 0, stream>>>( + stats, stats + C, stats + 2 * C, C, momentum, epsilon, sv_mean_data, + sv_inv_var_data, est_mean_data, est_var_data); + + mean_data = sv_mean_data; + var_data = stats + C; + } + + int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; + if (layout == framework::DataLayout::kNCHW) { + KeNormAffine<<>>( + x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, + y_d); + } else { + KeNormAffine<<>>( + x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, + y_d); + } + } +}; + +template +__global__ void KeBackwardLocalStats(const T *dy, const T *x, const T *means, + int N, int M, int C, T *sum_dy_prod) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int k = blockIdx.x; k < C; k += gridDim.x) { + T sum1 = 0; + T sum2 = 0; + T mean = means[k]; + for (int i = threadIdx.x; i < N * M; i += blockDim.x) { + int id = layout == framework::DataLayout::kNCHW + ? (i / M) * C * M + k * M + i % M + : i * C + k; + T g = dy[id]; + sum1 += g; + sum2 += g * (x[id] - mean); + } + + __syncthreads(); + T out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + sum_dy_prod[k] = out; + } + out = BlockReduce(temp_storage).Reduce(sum2, cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + sum_dy_prod[k + C] = out; + } + } + if (blockIdx.x == 0 && threadIdx.x == 0) { + sum_dy_prod[2 * C] = static_cast(1.0); + } +} + +template +static __global__ void KeBNBackwardScaleBias(const T *dy, const T *x, + const T *mean, + const T *inv_variance, + const double epsilon, const int N, + const int C, const int HxW, + T *dscale, T *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T ds_sum = static_cast(0); + T db_sum = static_cast(0); + + T inv_var_i = inv_variance[i]; + T mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int id = layout == framework::DataLayout::kNCHW + ? ((j / HxW) * C + i) * HxW + (j % HxW) + : j * outer_size + i; + ds_sum += dy[id] * (x[id] - mean_i); + db_sum += dy[id]; + } + __syncthreads(); + double os = BlockReduce(temp_storage) + .Reduce(static_cast(ds_sum), cub::Sum()); + __syncthreads(); + double ob = BlockReduce(temp_storage) + .Reduce(static_cast(db_sum), cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + dscale[i] = static_cast(os * inv_var_i); + dbias[i] = static_cast(ob); + } + __syncthreads(); + } +} + +template +static __global__ void KeBNBackwardData(const T *dy, const T *x, const T *beta, + const T *mean, const T *inv_variance, + const T *g_sum_dy, + const T *g_sum_dy_prod, + const T *num_dev, const double epsilon, + const int C, const int HxW, + const int num, T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + T scale = static_cast(C) / num; + T dev_num = num_dev[0]; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + T inv_var = inv_variance[c]; + T s_d = beta[c]; + T gvar = -1.0 * (g_sum_dy_prod[c] / dev_num) * s_d * inv_var * + (inv_var * inv_var); + T gmean = -1.0 * (g_sum_dy[c] / dev_num) * s_d * inv_var; + + dx[i] = + dy[i] * s_d * inv_var + gmean * scale + gvar * scale * (x[i] - mean[c]); + } +} + +// Deriving the Gradient for the Backward Pass of Batch Normalization +// https://kevinzakka.github.io/2016/09/14/batch_normalization/ +template +class SyncBatchNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const std::string layout_str = ctx.Attr("data_layout"); + + const DataLayout layout = framework::StringToDataLayout(layout_str); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + } + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(scale->dims()[0], C); + + std::vector dims; + std::vector strides; + if (layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + + const T *x_d = x->data(); + const T *dy_d = d_y->data(); + + auto &dev_ctx = ctx.cuda_device_context(); + auto stream = dev_ctx.stream(); + auto *comm = dev_ctx.nccl_comm(); + + const T *saved_mean = ctx.Input("SavedMean")->data(); + const T *saved_inv_var = ctx.Input("SavedVariance")->data(); + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + const int bytes = (C * 2 + 1) * sizeof(T); + auto alloc_ptr = allocator.Allocate(bytes); + T *stats = reinterpret_cast(alloc_ptr->ptr()); + + const int threads = 256; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + int grid = std::min(C, (max_threads + threads - 1) / threads); + int x_numel = x->numel(); + int fsize = H * W * D; + + if (layout == framework::DataLayout::kNCHW) { + KeBackwardLocalStats< + T, threads, + framework::DataLayout::kNCHW><<>>( + dy_d, x_d, saved_mean, N, fsize, C, stats); + } else { + KeBackwardLocalStats< + T, threads, + framework::DataLayout::kNHWC><<>>( + dy_d, x_d, saved_mean, N, fsize, C, stats); + } + int dtype = platform::ToNCCLDataType(x->type()); + // In-place operation + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, + comm, stream)); + + const int block = 512; + int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; + if (layout == framework::DataLayout::kNCHW) { + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, threads, + framework::DataLayout::kNCHW><<>>( + dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, + d_scale->data(), d_bias->data()); + } + if (d_x) { + KeBNBackwardData< + T, framework::DataLayout::kNCHW><<>>( + dy_d, x_d, scale->data(), saved_mean, saved_inv_var, stats, + stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(), + d_x->data()); + } + } else { + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, threads, + framework::DataLayout::kNHWC><<>>( + dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, + d_scale->data(), d_bias->data()); + } + if (d_x) { + KeBNBackwardData< + T, framework::DataLayout::kNHWC><<>>( + dy_d, x_d, scale->data(), saved_mean, saved_inv_var, stats, + stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(), + d_x->data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + sync_batch_norm, ops::SyncBatchNormKernel, + ops::SyncBatchNormKernel); +REGISTER_OP_CUDA_KERNEL( + sync_batch_norm_grad, + ops::SyncBatchNormGradKernel, + ops::SyncBatchNormGradKernel); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 920b43b2b1990af58b73888bf7a652d57c20563c..d54a3e8670e892f4e0d9ebb60ab26714ac8c0c68 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -57,7 +57,6 @@ DeviceContextPool::DeviceContextPool( for (auto& p : places) { set.insert(p); } - for (auto& p : set) { if (platform::is_cpu_place(p)) { #ifdef PADDLE_WITH_MKLDNN @@ -317,6 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() { eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +#if !defined(_WIN32) + PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_)); +#endif } Place CUDADeviceContext::GetPlace() const { return place_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d376f90ad5754d70f3b9f30957eb2e2f584f8da9..1eb8d9691a1e591117e49c2cbe1ab691cbab4a5b 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -265,6 +265,14 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; +#if !defined(_WIN32) + /*! \brief Return nccl communicators. */ + ncclComm_t nccl_comm() const { return nccl_comm_; } + + /*! \brief Set nccl communicators. */ + void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; } +#endif + template void RecordEvent(cudaEvent_t ev, Callback callback) { callback(); @@ -289,6 +297,15 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr cublas_handle_; std::unique_ptr cublas_tensor_core_handle_; +#if !defined(_WIN32) + // NCCL communicator (single process version) for NCCL collective operations. + // NCCL collective operations provides fast collectives over multiple GPUs + // both within and across nodes. + // But, this collectives is used for collectives over multiple GPUs within + // nodes. + ncclComm_t nccl_comm_{nullptr}; +#endif + int compute_capability_; int runtime_version_; int driver_version_; diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 4dcf7e79043af008cb2067d90d12d629c5c2d0d9..d53a4029e1bad9eded693d2d9bd8e01e13bb73e7 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include // for strdup #include +#include +#include #include #include @@ -140,6 +142,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); platform::DeviceTemporaryAllocator::Init(); + #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 6ae21ee8294bedc388f837aad3e20a2b9aca98a2..0428c40f985d78f0262eb0a73984bc59ab43aac2 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -16,9 +16,11 @@ #pragma once #include +#include #include #include // NOLINT #include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" @@ -78,6 +80,8 @@ struct NCCLContext { cudaStream_t stream() const { return ctx_->stream(); } + ncclComm_t comm() const { return comm_; } + int device_id() const { return boost::get(ctx_->GetPlace()).device; } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 4ac5b83c56b114f4e3e4c78710716adc636ebe1d..f1385f57184eceec49b791cf6c89641b098f036a 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor) + tracer analysis_predictor imperative_profiler) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 6bbda69297a48ce27ce23282c4e08d49ee3cce6c..29c8e6a12940eb5652a0a97bea27c6fed5612001 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -42,6 +42,7 @@ void BindTracer(pybind11::module* m) { framework::AttributeMap attrs_map, const platform::CPUPlace expected_place, const bool stop_gradient = false) { + pybind11::gil_scoped_release release; return self.Trace(op, inputs, outputs, attrs_map, expected_place, stop_gradient); }) @@ -52,6 +53,7 @@ void BindTracer(pybind11::module* m) { framework::AttributeMap attrs_map, const platform::CUDAPlace expected_place, const bool stop_gradient = false) { + pybind11::gil_scoped_release release; return self.Trace(op, inputs, outputs, attrs_map, expected_place, stop_gradient); }) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 552a5e0c3289b022041c6ea4f26694ed24aa858d..7ea3e1109638beedb7490bc64c9ac08c6ac09713 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,6 +36,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/operators/activation_op.h" @@ -156,6 +157,11 @@ PYBIND11_MODULE(core, m) { m.def("print_mem_usage", []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); + m.def("start_imperative_gperf_profiler", + []() { imperative::StartProfile(); }); + + m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); }); + py::class_(m, "VarBase", R"DOC()DOC") .def( py::init 2 and len(x_shape) > 2: for i, dim_x in enumerate(x_shape[:-2]): @@ -6366,6 +6400,8 @@ def squeeze(input, axes, name=None): x = layers.data(name='x', shape=[5, 1, 10]) y = layers.sequeeze(input=x, axes=[1]) """ + assert not _in_imperative_mode(), ( + "squeeze layer is not supported in imperative mode yet.") helper = LayerHelper("squeeze", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -9103,6 +9139,10 @@ def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) y = helper.kwargs.get('y', None) + if _in_imperative_mode(): + x = base.to_variable(x) + y = base.to_variable(y) + assert x is not None, 'x cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type) axis = helper.kwargs.get('axis', -1) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 6218db73459a2bb55d72545c738f88dbd8cce0f7..7d1b869cf5991dc5ef960ff4d72289979aae158a 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -476,11 +476,29 @@ class TestYoloDetection(unittest.TestCase): x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') - loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], - [0, 1], 10, 0.7, 32) + gtscore = layers.data(name='gtscore', shape=[10], dtype='float32') + loss = layers.yolov3_loss( + x, + gtbox, + gtlabel, [10, 13, 30, 13], [0, 1], + 10, + 0.7, + 32, + gtscore=gtscore, + use_label_smooth=False) self.assertIsNotNone(loss) + def test_yolo_box(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') + img_size = layers.data(name='img_size', shape=[2], dtype='int32') + boxes, scores = layers.yolo_box(x, img_size, [10, 13, 30, 13], 10, + 0.01, 32) + self.assertIsNotNone(boxes) + self.assertIsNotNone(scores) + class TestBoxClip(unittest.TestCase): def test_box_clip(self): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..a8127bcc781378fa5ef4a189a0b14d079a793946 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py @@ -0,0 +1,78 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +from mkldnn_op_test import format_reorder + + +class TestTransposeOp(OpTest): + def setUp(self): + self.init_op_type() + self.initTestCase() + self.initInputData() + self.use_mkldnn = True + self.axis = (0, 2, 3, 1) + + self.inputs = { + 'X': format_reorder(self.input_data, self.shape) + } #transform data format to 'NHWC' for INT8 transpose specially. + + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': self.use_mkldnn, + } + + self.outputs = { + 'XShape': np.random.random(self.shape).astype('int8'), + 'Out': self.inputs['X'].transpose(self.axis) + } + + def init_op_type(self): + self.op_type = "transpose2" + + def test_check_output(self): + self.check_output(no_check_set=['XShape']) + + def initTestCase(self): + self.shape = (2, 3, 4, 5) + + def initInputData(self): + self.input_data = ( + np.random.randint(0, 100, self.shape) - 50).astype('int8') + + +class TestINT8Case(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 4, 6, 8) + + def initInputData(self): + self.input_data = ( + np.random.randint(0, 100, self.shape) - 50).astype('int8') + + +class TestUINT8Case(TestTransposeOp): + def initTestCase(self): + self.shape = (1, 3, 5, 7) + + def initDataType(self): + self.input_data = (np.random.randint(0, 100, + self.shape)).astype('uint8') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py index c29d422361b2fcddc6440c37deab18fc886d7083..55029c18d6966ea1d139a1987ff90d46c8e81270 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py @@ -24,11 +24,13 @@ class CrossEntropy2OpTestBase(OpTest): def calc_output(self, logits, label, ignore_index): ret = np.zeros(shape=label.shape, dtype=logits.dtype) + match_x = np.zeros(shape=label.shape, dtype=logits.dtype) for idx in six.moves.range(label.shape[0]): if label[idx] == ignore_index: continue - ret[idx] = -np.log(logits[idx][label[idx]]) - return ret + match_x[idx] = logits[idx][label[idx]] + ret[idx] = -np.log(match_x[idx]) + return ret, match_x def setUp(self): self.shape, self.dtype, self.ignore_index = self.initParameters() @@ -39,12 +41,13 @@ class CrossEntropy2OpTestBase(OpTest): label = np.random.random_integers( low=0, high=feature_size - 1, size=self.shape[0:-1] + [1]).astype('int64') - outputs = self.calc_output( + outputs, match_x = self.calc_output( np.reshape(logits, [batch_size, feature_size]), np.reshape(label, [batch_size, 1]), self.ignore_index) self.inputs = {'X': logits, 'Label': label} self.outputs = { 'Y': np.reshape(outputs, label.shape), + 'MatchX': np.reshape(match_x, label.shape), 'XShape': np.zeros( shape=logits.shape, dtype=logits.dtype) } @@ -57,7 +60,7 @@ class CrossEntropy2OpTestBase(OpTest): self.check_grad( inputs_to_check=['X'], output_names=['Y'], - no_grad_set=['XShape', 'Label']) + no_grad_set=['XShape', 'MatchX', 'Label']) class CrossEntropy2OpTest2(CrossEntropy2OpTestBase): diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 90a90112bd5f0e24374111073514b20dd1231edb..cf8f01edb9a6a2b6d91080248553491c54e7707b 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -17,6 +17,7 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core class TestFakeQuantizeOp(OpTest): @@ -75,6 +76,7 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest): 'InScale': np.zeros(1).astype("float32") } scale = np.max(np.abs(self.inputs['X'])).astype("float32") + out_scales = np.zeros(self.attrs['window_size']).astype("float32") out_scales[0] = scale self.outputs = { @@ -88,6 +90,46 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest): self.check_output() +class TestFakeQuantizeMovingOp(OpTest): + def setUp(self): + self.op_type = "fake_quantize_moving_average_abs_max" + self.attrs = { + 'bit_length': int(5), + 'moving_rate': float(0.9), + 'is_test': False + } + accum = np.zeros(1).astype("float32") + accum[0] = 1 + state = np.zeros(1).astype("float32") + state[0] = 1 + scale = np.zeros(1).astype("float32") + scale[0] = 0.001 + self.inputs = { + 'X': np.random.random((8, 16, 7, 7)).astype("float32"), + 'InScale': scale, + 'InAccum': accum, + 'InState': state, + } + + out_accum = np.zeros(1).astype("float32") + out_state = np.zeros(1).astype("float32") + out_scale = np.zeros(1).astype("float32") + out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max( + np.abs(self.inputs['X'])).astype("float32") + out_state[0] = self.attrs['moving_rate'] * state[0] + 1 + out_scale = out_accum / out_state + self.outputs = { + 'Out': np.round(self.inputs['X'] / out_scale * ( + (1 << (self.attrs['bit_length'] - 1)) - 1)), + 'OutAccum': out_accum, + 'OutState': out_state, + 'OutScale': out_scale, + } + + def test_check_output(self): + self.check_output() + + class TestFakeQuantizeRangeAbsMaxOp2(OpTest): def setUp(self): self.op_type = "fake_quantize_range_abs_max" diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py new file mode 100644 index 0000000000000000000000000000000000000000..2086fab5c81e241d1a49386d8285289b14364dc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -0,0 +1,144 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six +import sys + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.optimizer import AdamOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from test_imperative_base import new_program_scope +from paddle.fluid.imperative.base import to_variable + + +def gen_data(): + pass + + +class GraphConv(fluid.imperative.Layer): + def __init__(self, name_scope, in_features, out_features): + super(GraphConv, self).__init__(name_scope) + + self._in_features = in_features + self._out_features = out_features + self.weight = self.create_parameter( + attr=None, + dtype='float32', + shape=[self._in_features, self._out_features]) + self.bias = self.create_parameter( + attr=None, dtype='float32', shape=[self._out_features]) + + def forward(self, features, adj): + support = fluid.layers.matmul(features, self.weight) + # TODO(panyx0718): sparse matmul? + return fluid.layers.matmul(adj, support) + self.bias + + +class GCN(fluid.imperative.Layer): + def __init__(self, name_scope, num_hidden): + super(GCN, self).__init__(name_scope) + self.gc = GraphConv(self.full_name(), num_hidden, 32) + self.gc2 = GraphConv(self.full_name(), 32, 10) + + def forward(self, x, adj): + x = fluid.layers.relu(self.gc(x, adj)) + return self.gc2(x, adj) + + +class TestImperativeGNN(unittest.TestCase): + def test_gnn_float32(self): + seed = 90 + + startup = fluid.Program() + startup.random_seed = seed + main = fluid.Program() + main.random_seed = seed + + scope = fluid.core.Scope() + with new_program_scope(main=main, startup=startup, scope=scope): + features = fluid.layers.data( + name='features', + shape=[1, 100, 50], + dtype='float32', + append_batch_size=False) + # Use selected rows when it's supported. + adj = fluid.layers.data( + name='adj', + shape=[1, 100, 100], + dtype='float32', + append_batch_size=False) + labels = fluid.layers.data( + name='labels', + shape=[100, 1], + dtype='int64', + append_batch_size=False) + + model = GCN('test_gcn', 50) + logits = model(features, adj) + logits = fluid.layers.reshape(logits, logits.shape[1:]) + # In other example, it's nll with log_softmax. However, paddle's + # log_loss only supports binary classification now. + loss = fluid.layers.softmax_with_cross_entropy(logits, labels) + loss = fluid.layers.reduce_sum(loss) + + adam = AdamOptimizer(learning_rate=1e-3) + adam.minimize(loss) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + exe.run(startup) + static_loss = exe.run(feed={ + 'features': np.zeros( + [1, 100, 50], dtype=np.float32), + 'adj': np.zeros( + [1, 100, 100], dtype=np.float32), + 'labels': np.zeros( + [100, 1], dtype=np.int64) + }, + fetch_list=[loss])[0] + + static_weight = np.array( + scope.find_var(model.gc.weight.name).get_tensor()) + + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + features = np.zeros([1, 100, 50], dtype=np.float32) + # Use selected rows when it's supported. + adj = np.zeros([1, 100, 100], dtype=np.float32) + labels = np.zeros([100, 1], dtype=np.int64) + + model = GCN('test_gcn', 50) + logits = model(to_variable(features), to_variable(adj)) + logits = fluid.layers.reshape(logits, logits.shape[1:]) + # In other example, it's nll with log_softmax. However, paddle's + # log_loss only supports binary classification now. + loss = fluid.layers.softmax_with_cross_entropy(logits, + to_variable(labels)) + loss = fluid.layers.reduce_sum(loss) + adam = AdamOptimizer(learning_rate=1e-3) + adam.minimize(loss) + self.assertEqual(static_loss, loss._numpy()) + self.assertTrue( + np.allclose(static_weight, model.gc.weight._numpy())) + sys.stderr.write('%s %s\n' % (static_loss, loss._numpy())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 5b186ae0384e3d365303c25861138a3c7e4c189f..885ee170e8032ef865ebfdd646fed1e995e9e60b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -84,6 +84,27 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + def test_matmul(self): + with self.static_graph(): + t = layers.data(name='t', shape=[3, 3], dtype='float32') + t2 = layers.data(name='t2', shape=[3, 3], dtype='float32') + ret = layers.matmul(t, t2) + static_ret = self.get_static_graph_result( + feed={ + 't': np.ones( + [3, 3], dtype='float32'), + 't2': np.ones( + [3, 3], dtype='float32') + }, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + t = np.ones([3, 3], dtype='float32') + t2 = np.ones([3, 3], dtype='float32') + dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + def test_conv2d(self): with self.static_graph(): images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32') @@ -153,6 +174,60 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy())) + def test_elementwise_math(self): + n = np.ones([3, 3], dtype='float32') + n2 = np.ones([3, 3], dtype='float32') * 1.1 + n3 = np.ones([3, 3], dtype='float32') * 2 + n4 = np.ones([3, 3], dtype='float32') * 3 + n5 = np.ones([3, 3], dtype='float32') * 4 + n6 = np.ones([3, 3], dtype='float32') * 5 + + with self.static_graph(): + t = layers.data(name='t', shape=[3, 3], dtype='float32') + t2 = layers.data(name='t2', shape=[3, 3], dtype='float32') + t3 = layers.data(name='t3', shape=[3, 3], dtype='float32') + t4 = layers.data(name='t4', shape=[3, 3], dtype='float32') + t5 = layers.data(name='t5', shape=[3, 3], dtype='float32') + t6 = layers.data(name='t6', shape=[3, 3], dtype='float32') + + ret = layers.elementwise_add(t, t2) + ret = layers.elementwise_pow(ret, t3) + ret = layers.elementwise_div(ret, t4) + ret = layers.elementwise_sub(ret, t5) + ret = layers.elementwise_mul(ret, t6) + + static_ret = self.get_static_graph_result( + feed={ + 't': n, + 't2': n2, + 't3': n3, + 't4': n4, + 't5': n5, + 't6': n6 + }, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + ret = layers.elementwise_add(n, n2) + ret = layers.elementwise_pow(ret, n3) + ret = layers.elementwise_div(ret, n4) + ret = layers.elementwise_sub(ret, n5) + dy_ret = layers.elementwise_mul(ret, n6) + self.assertTrue( + np.allclose(static_ret, dy_ret._numpy()), + '%s vs %s' % (static_ret, dy_ret._numpy())) + + def test_elementwise_minmax(self): + n = np.ones([3, 3], dtype='float32') + n2 = np.ones([3, 3], dtype='float32') * 2 + + with self.dynamic_graph(): + min_ret = layers.elementwise_min(n, n2) + max_ret = layers.elementwise_max(n, n2) + + self.assertTrue(np.allclose(n, min_ret._numpy())) + self.assertTrue(np.allclose(n2, max_ret._numpy())) + class TestBook(unittest.TestCase): def test_fit_a_line(self): diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index 4e6ed3a74b344da068bbfb60707838a1b4fc40fd..5fdabbabeda6a2a3673042e4441d0bb298296404 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle.fluid.core as core from op_test import OpTest @@ -63,5 +64,28 @@ class TestCase2(TestSliceOp): self.out = self.input[-3:3, 0:100, :, 2:-1] +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFP16(TestSliceOp): + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.out = self.input[-3:3, 0:100, :, 2:-1] + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-5) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['Input'], 'Out', max_relative_error=0.006) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f6a658cb1b753de93f11f45d0477f450ef0bdfaf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -0,0 +1,159 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import os +import six +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler + + +class TestSyncBatchNormOpTraining(unittest.TestCase): + def setUp(self): + #self.dtype = np.float32 + self.dtype = np.float64 + self.N = 32 + self.C = 16 + self.H = 64 + self.W = 32 + self.dshape = [self.N, self.C, self.H, self.W] + + def build_program(self, + place, + layout, + seed, + sync_bn=False, + only_forward=False): + main = fluid.Program() + startup = fluid.Program() + main.random_seed = seed + startup.random_seed = seed + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + data = fluid.layers.data( + name='input', + shape=self.dshape, + dtype=self.dtype, + append_batch_size=False) + conv = fluid.layers.conv2d( + input=data, + num_filters=32, + filter_size=1, + param_attr=fluid.ParamAttr(name='conv2d_weight'), + bias_attr=False, + use_cudnn=False) + bn = fluid.layers.batch_norm( + conv, + param_attr=fluid.ParamAttr(name='bn_scale'), + bias_attr=fluid.ParamAttr(name='bn_bias'), + moving_mean_name='bn_moving_mean', + moving_variance_name='bn_moving_variance', + data_layout=layout, + is_test=only_forward) + sigmoid = fluid.layers.sigmoid(bn) + out = fluid.layers.reduce_sum(sigmoid) + if not sync_bn: + out = out / core.get_cuda_device_count() + if not only_forward: + sgd_opt = fluid.optimizer.SGD(learning_rate=0.0) + sgd_opt.backward(out) + return main, startup, [out, conv, bn] + + def compare(self, place, layout, only_forward): + seed = 10 + os.environ['FLAGS_cudnn_deterministic'] = "1" + data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2 + # Single-GPU, N = 32 per GPU + main, startup, outs = self.build_program(place, layout, seed, False, + only_forward) + exe = fluid.Executor(place) + exe.run(startup) + fetch_names = [v.name for v in outs] + [ + 'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias' + ] + if not only_forward: + others = [ + 'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD', + 'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD' + ] + fetch_names += others + bn_fetches = exe.run(program=main, + feed={'input': data}, + fetch_list=fetch_names) + + ##################################################################### + # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU + main, startup, outs = self.build_program(place, layout, seed, True, + only_forward) + exe = fluid.Executor(place) + exe.run(startup) + fetch_names = [v.name for v in outs] + [ + 'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias' + ] + if not only_forward: + others = [ + 'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD', + 'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD' + ] + fetch_names += others + for nm in fetch_names: + fv = fluid.framework._get_var(str(nm), program=main) + fv.persistable = True + build_strategy = fluid.BuildStrategy() + build_strategy.sync_batch_norm = True + build_strategy.enable_inplace = False + build_strategy.memory_optimize = False + comp_prog = compiler.CompiledProgram(main).with_data_parallel( + outs[0].name if not only_forward else None, + build_strategy=build_strategy) + sync_bn_fetches = exe.run(program=comp_prog, + feed={'input': data}, + fetch_list=fetch_names) + + for i in six.moves.xrange(1, len(sync_bn_fetches)): + bn_val = bn_fetches[i] + sync_bn_val = sync_bn_fetches[i] + if sync_bn_val.shape != bn_val.shape: + sync_bn_val = sync_bn_val[:bn_val.shape[0]] + self.assertTrue( + np.allclose( + bn_val, sync_bn_val, atol=1e-3), + "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN " + + str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val)) + + def test_train(self): + if not core.is_compiled_with_cuda(): + return + + places = [core.CUDAPlace(0)] + for place in places: + for layout in ["NCHW", "NHWC"]: + self.compare(place, layout, False) + + def test_infer(self): + if not core.is_compiled_with_cuda(): + return + + places = [core.CUDAPlace(0)] + for place in places: + for layout in ["NCHW", "NHWC"]: + self.compare(place, layout, True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py new file mode 100644 index 0000000000000000000000000000000000000000..416e6ea9f412d86db877fc36175e8b910b0613fe --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -0,0 +1,117 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import unittest +import numpy as np +from op_test import OpTest + +from paddle.fluid import core + + +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def YoloBox(x, img_size, attrs): + n, c, h, w = x.shape + anchors = attrs['anchors'] + an_num = int(len(anchors) // 2) + class_num = attrs['class_num'] + conf_thresh = attrs['conf_thresh'] + downsample = attrs['downsample'] + input_size = downsample * h + + x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) + + pred_box = x[:, :, :, :, :4].copy() + grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) + grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) + pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w + pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h + + anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] + anchors_s = np.array( + [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors]) + anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1)) + anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1)) + pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w + pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h + + pred_conf = sigmoid(x[:, :, :, :, 4:5]) + pred_conf[pred_conf < conf_thresh] = 0. + pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf + pred_box = pred_box * (pred_conf > 0.).astype('float32') + + pred_box = pred_box.reshape((n, -1, 4)) + pred_box[:, :, :2], pred_box[:, :, 2:4] = \ + pred_box[:, :, :2] - pred_box[:, :, 2:4] / 2., \ + pred_box[:, :, :2] + pred_box[:, :, 2:4] / 2.0 + pred_box[:, :, 0] = pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis] + pred_box[:, :, 1] = pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis] + pred_box[:, :, 2] = pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis] + pred_box[:, :, 3] = pred_box[:, :, 3] * img_size[:, 0][:, np.newaxis] + + for i in range(len(pred_box)): + pred_box[i, :, 0] = np.clip(pred_box[i, :, 0], 0, np.inf) + pred_box[i, :, 1] = np.clip(pred_box[i, :, 1], 0, np.inf) + pred_box[i, :, 2] = np.clip(pred_box[i, :, 2], -np.inf, + img_size[i, 1] - 1) + pred_box[i, :, 3] = np.clip(pred_box[i, :, 3], -np.inf, + img_size[i, 0] - 1) + + return pred_box, pred_score.reshape((n, -1, class_num)) + + +class TestYoloBoxOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = 'yolo_box' + x = np.random.random(self.x_shape).astype('float32') + img_size = np.random.randint(10, 20, self.imgsize_shape).astype('int32') + + self.attrs = { + "anchors": self.anchors, + "class_num": self.class_num, + "conf_thresh": self.conf_thresh, + "downsample": self.downsample, + } + + self.inputs = { + 'X': x, + 'ImgSize': img_size, + } + boxes, scores = YoloBox(x, img_size, self.attrs) + self.outputs = { + "Boxes": boxes, + "Scores": scores, + } + + def test_check_output(self): + self.check_output() + + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int(len(self.anchors) // 2) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13) + self.imgsize_shape = (self.batch_size, 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 020c1139230a9177c4d7765367359d91839d7d46..e4d6edc72c0ca888e271101f079cdcc6fb4e8a70 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -23,8 +23,8 @@ from op_test import OpTest from paddle.fluid import core -def l2loss(x, y): - return 0.5 * (y - x) * (y - x) +def l1loss(x, y): + return abs(x - y) def sce(x, label): @@ -66,7 +66,7 @@ def batch_xywh_box_iou(box1, box2): return inter_area / union -def YOLOv3Loss(x, gtbox, gtlabel, attrs): +def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs): n, c, h, w = x.shape b = gtbox.shape[1] anchors = attrs['anchors'] @@ -75,21 +75,21 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): mask_num = len(anchor_mask) class_num = attrs["class_num"] ignore_thresh = attrs['ignore_thresh'] - downsample = attrs['downsample'] - input_size = downsample * h + downsample_ratio = attrs['downsample_ratio'] + use_label_smooth = attrs['use_label_smooth'] + input_size = downsample_ratio * h x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) loss = np.zeros((n)).astype('float32') + label_pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0 + label_neg = 1.0 / class_num if use_label_smooth else 0.0 + pred_box = x[:, :, :, :, :4].copy() grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h - x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:], - np.ones_like(x[:, :, :, :, 5:]) * 1.0 / - class_num) - mask_anchors = [] for m in anchor_mask: mask_anchors.append((anchors[2 * m], anchors[2 * m + 1])) @@ -138,21 +138,22 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): ty = gtbox[i, j, 1] * w - gj tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0]) th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1]) - scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) + scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) * gtscore[i, j] loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale - loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale - loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale + loss[i] += l1loss(x[i, an_idx, gj, gi, 2], tw) * scale + loss[i] += l1loss(x[i, an_idx, gj, gi, 3], th) * scale - objness[i, an_idx * h * w + gj * w + gi] = 1.0 + objness[i, an_idx * h * w + gj * w + gi] = gtscore[i, j] for label_idx in range(class_num): - loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], - float(label_idx == gtlabel[i, j])) + loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], label_pos + if label_idx == gtlabel[i, j] else + label_neg) * gtscore[i, j] for j in range(mask_num * h * w): if objness[i, j] > 0: - loss[i] += sce(pred_obj[i, j], 1.0) + loss[i] += sce(pred_obj[i, j], 1.0) * objness[i, j] elif objness[i, j] == 0: loss[i] += sce(pred_obj[i, j], 0.0) @@ -176,7 +177,8 @@ class TestYolov3LossOp(OpTest): "anchor_mask": self.anchor_mask, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, - "downsample": self.downsample, + "downsample_ratio": self.downsample_ratio, + "use_label_smooth": self.use_label_smooth, } self.inputs = { @@ -184,7 +186,14 @@ class TestYolov3LossOp(OpTest): 'GTBox': gtbox.astype('float32'), 'GTLabel': gtlabel.astype('int32'), } - loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs) + + gtscore = np.ones(self.gtbox_shape[:2]).astype('float32') + if self.gtscore: + gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32') + self.inputs['GTScore'] = gtscore + + loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, gtscore, + self.attrs) self.outputs = { 'Loss': loss, 'ObjectnessMask': objness, @@ -193,24 +202,57 @@ class TestYolov3LossOp(OpTest): def test_check_output(self): place = core.CPUPlace() - self.check_output_with_place(place, atol=1e-3) + self.check_output_with_place(place, atol=2e-3) def test_check_grad_ignore_gtbox(self): place = core.CPUPlace() - self.check_grad_with_place( - place, ['X'], - 'Loss', - no_grad_set=set(["GTBox", "GTLabel"]), - max_relative_error=0.3) + self.check_grad_with_place(place, ['X'], 'Loss', max_relative_error=0.2) + + def initTestCase(self): + self.anchors = [ + 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, + 373, 326 + ] + self.anchor_mask = [0, 1, 2] + self.class_num = 5 + self.ignore_thresh = 0.7 + self.downsample_ratio = 32 + self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) + self.gtbox_shape = (3, 5, 4) + self.gtscore = True + self.use_label_smooth = True + + +class TestYolov3LossWithoutLabelSmooth(TestYolov3LossOp): + def initTestCase(self): + self.anchors = [ + 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, + 373, 326 + ] + self.anchor_mask = [0, 1, 2] + self.class_num = 5 + self.ignore_thresh = 0.7 + self.downsample_ratio = 32 + self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) + self.gtbox_shape = (3, 5, 4) + self.gtscore = True + self.use_label_smooth = False + +class TestYolov3LossNoGTScore(TestYolov3LossOp): def initTestCase(self): - self.anchors = [10, 13, 16, 30, 33, 23] - self.anchor_mask = [1, 2] + self.anchors = [ + 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, + 373, 326 + ] + self.anchor_mask = [0, 1, 2] self.class_num = 5 - self.ignore_thresh = 0.5 - self.downsample = 32 + self.ignore_thresh = 0.7 + self.downsample_ratio = 32 self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) self.gtbox_shape = (3, 5, 4) + self.gtscore = False + self.use_label_smooth = True if __name__ == "__main__": diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh index 1b0059a8c69fca93ecbf1db570a6092ca5c908b1..3be94a42d530bdc4cb6c0a97ee3804f8289919d1 100644 --- a/tools/manylinux1/build_scripts/build.sh +++ b/tools/manylinux1/build_scripts/build.sh @@ -153,3 +153,9 @@ done # Restore LD_LIBRARY_PATH LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}" + +# According to ar issues: https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html +# we should install new version ar with 64-bit supported here +wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz +tar xzf binutils-2.27.tar.gz && cd binutils-2.27 +./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install