提交 fd23262e 编写于 作者: S sneaxiy

merge develop, fix conflict

test=develop
...@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " ...@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32) if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib) set(CMAKE_STATIC_LIBRARY_PREFIX lib)
......
...@@ -110,7 +110,7 @@ function(op_library TARGET) ...@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here. # Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op") "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
......
...@@ -12,7 +12,7 @@ paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], va ...@@ -12,7 +12,7 @@ paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], va
paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2')) paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45')) paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0')) paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2')) paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
...@@ -68,7 +68,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor ...@@ -68,7 +68,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor
paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee')) paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee'))
paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29')) paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29'))
paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd')) paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '424e898365195e3ccbc2e7dc8b63605e'))
paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a')) paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a'))
paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6')) paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6'))
paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d')) paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d'))
...@@ -91,7 +91,7 @@ paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po ...@@ -91,7 +91,7 @@ paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po
paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625'))
paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95'))
paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497')) paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497'))
paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', 'c527b71b8a4c60dca8df8a745c2b598d')) paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '320c6973b02ea179fa89fecc80796464'))
paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab')) paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab'))
paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb')) paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b0b8d53821716cd50c42e09b593f3feb'))
paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9')) paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9'))
...@@ -330,7 +330,8 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', ...@@ -330,7 +330,8 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes',
paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8'))
paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
...@@ -367,7 +368,7 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', ...@@ -367,7 +368,7 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7')) paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7'))
paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47')) paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47'))
paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa')) paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa'))
paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
...@@ -392,9 +393,9 @@ paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold' ...@@ -392,9 +393,9 @@ paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'
paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d')) paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d'))
paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645')) paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645'))
paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7')) paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196')) paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f'))
paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8')) paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b'))
paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e')) paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e'))
paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634')) paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634'))
...@@ -493,7 +494,7 @@ paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinne ...@@ -493,7 +494,7 @@ paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinne
paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753')) paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'f8f3df23c5633c614db781a91b81fb62'))
paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca')) paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca'))
paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85')) paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85'))
paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
...@@ -517,11 +518,11 @@ paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', de ...@@ -517,11 +518,11 @@ paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', de
paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4'))
paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d'))
paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad'))
paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560')) paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796'))
paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69')) paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45'))
paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0'))
paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada'))
paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a')) paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2'))
paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8')) paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2'))
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <memory> #include <memory>
#include <utility>
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
...@@ -49,6 +50,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -49,6 +50,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("sequential_execution_pass"); AppendPass("sequential_execution_pass");
} }
// Add op fusion.
if (strategy.sync_batch_norm_) {
AppendPass("sync_batch_norm_pass");
}
// Add op fusion. // Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) { if (strategy.fuse_relu_depthwise_conv_) {
AppendPass("fuse_relu_depthwise_conv_pass"); AppendPass("fuse_relu_depthwise_conv_pass");
...@@ -227,6 +233,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -227,6 +233,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass); USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass); USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
......
...@@ -77,6 +77,8 @@ struct BuildStrategy { ...@@ -77,6 +77,8 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false}; bool fuse_relu_depthwise_conv_{false};
bool sync_batch_norm_{false};
bool memory_optimize_{true}; bool memory_optimize_{true};
// TODO(dzhwinter): // TODO(dzhwinter):
// make enable_inplace, memory_optimize_ // make enable_inplace, memory_optimize_
......
...@@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker { ...@@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker {
class DummyVarTypeInference : public VarTypeInference { class DummyVarTypeInference : public VarTypeInference {
public: public:
void operator()(const OpDesc& op_desc, BlockDesc* block) const override { void operator()(framework::InferVarTypeContext* ctx) const override {
auto& inputs = op_desc.Input("X"); auto& inputs = ctx->Input("X");
auto type = block->Var(inputs.front())->GetType(); auto type = ctx->GetType(inputs.front());
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = ctx->Output("Out").front();
block->Var(out_var_name)->SetType(type); ctx->SetType(out_var_name, type);
} }
}; };
......
...@@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) { ...@@ -337,7 +337,6 @@ bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType(); auto type = node.GetType();
// only these types holds bulk of gpu memory // only these types holds bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR || if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) { type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false; return false;
} }
......
...@@ -16,6 +16,8 @@ limitations under the License. */ ...@@ -16,6 +16,8 @@ limitations under the License. */
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/inplace_op_inference.h"
...@@ -127,9 +129,9 @@ struct OpInfoFiller<T, kGradOpDescMaker> { ...@@ -127,9 +129,9 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
template <typename T> template <typename T>
struct OpInfoFiller<T, kVarTypeInference> { struct OpInfoFiller<T, kVarTypeInference> {
void operator()(const char* op_type, OpInfo* info) const { void operator()(const char* op_type, OpInfo* info) const {
info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { info->infer_var_type_ = [](InferVarTypeContext* context) {
T inference; T inference;
inference(fwd_op, block); inference(context);
}; };
} }
}; };
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
...@@ -55,11 +57,11 @@ class GradOpDescMakerBase { ...@@ -55,11 +57,11 @@ class GradOpDescMakerBase {
std::back_inserter(ret_val), std::back_inserter(ret_val),
[this](const std::string& fwd_var_name) -> std::string { [this](const std::string& fwd_var_name) -> std::string {
auto g_name = GradVarName(fwd_var_name); auto g_name = GradVarName(fwd_var_name);
if (no_grad_set_.count(g_name)) { if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) {
return kEmptyVarName;
} else {
(*this->grad_to_var_)[g_name] = fwd_var_name; (*this->grad_to_var_)[g_name] = fwd_var_name;
return g_name; return g_name;
} else {
return kEmptyVarName;
} }
}); });
if (!drop_empty_grad) { if (!drop_empty_grad) {
......
...@@ -46,6 +46,9 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) ...@@ -46,6 +46,9 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base) pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base) pass_library(lock_free_optimize_pass base)
pass_library(cpu_quantize_placement_pass base)
pass_library(cpu_quantize_pass inference)
pass_library(cpu_quantize_squash_pass inference)
pass_library(fc_fuse_pass inference) pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference) pass_library(infer_clean_graph_pass inference)
...@@ -66,6 +69,8 @@ pass_library(conv_elementwise_add_fuse_pass inference) ...@@ -66,6 +69,8 @@ pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base) pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base)
# There may be many transpose-flatten structures in a model, and the output of # There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will # these structures will be used as inputs to the concat Op. This pattern will
...@@ -100,6 +105,12 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g ...@@ -100,6 +105,12 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
if(NOT WIN32)
cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
endif()
if (WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
namespace {
// Removes the edge a -> b: every occurrence of `b` is erased from a's output
// list and every occurrence of `a` is erased from b's input list. The call is
// a no-op when the two nodes are not linked.
void UnlinkNodes(ir::Node* a, ir::Node* b) {
  auto& successors = a->outputs;
  const auto stale_out = std::remove(successors.begin(), successors.end(), b);
  successors.erase(stale_out, successors.end());

  auto& predecessors = b->inputs;
  const auto stale_in =
      std::remove(predecessors.begin(), predecessors.end(), a);
  predecessors.erase(stale_in, predecessors.end());
}
} // namespace
enum { U8_MAX = 255, S8_MAX = 127 };
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
using string::PrettyLogDetail;
// Inserts a `quantize` op in front of one input of `op`.
//
//   before: input -> op
//   after:  input -> quantize -> <new variable> -> op
//
// g               graph in which the new nodes are created.
// op              op node whose input is being quantized.
// input           variable node currently feeding `op`.
// input_name      name of the input slot of `op` that is redirected to the
//                 newly created quantized variable.
// scale_to_one    multiplied by 255 (unsigned) or 127 (signed) to obtain the
//                 "Scale" attribute of the quantize op.
// is_unsigned     selects the 255 range; its negation is stored as the
//                 quantize op's "is_negative_input" attribute.
// scale_attr_name when non-empty, the computed scale is also written to `op`
//                 under this attribute name.
void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
                                    std::string input_name, double scale_to_one,
                                    bool is_unsigned,
                                    std::string scale_attr_name) const {
  const unsigned range_max = is_unsigned ? U8_MAX : S8_MAX;
  float quant_scale = scale_to_one * range_max;

  // Variable that carries the quantized data from the quantize op to `op`.
  VarDesc quantized_var_desc(patterns::PDNodeName("quantize", "out"));
  auto* quantized_var = g->CreateVarNode(&quantized_var_desc);
  const std::vector<std::string> quantized_var_names{quantized_var->Name()};

  // Describe the quantize op; CreateOpNode copies the description.
  OpDesc quantize_desc;
  quantize_desc.SetType("quantize");
  quantize_desc.SetInput("Input", std::vector<std::string>{input->Name()});
  quantize_desc.SetOutput("Output", quantized_var_names);
  quantize_desc.SetAttr("Scale", quant_scale);
  quantize_desc.SetAttr("is_negative_input", !is_unsigned);
  auto* quantize_node = g->CreateOpNode(&quantize_desc);

  // Redirect the op's input slot to the quantized variable.
  op->Op()->SetInput(input_name, quantized_var_names);

  // Rewire the graph edges: input -> quantize -> quantized_var -> op.
  UnlinkNodes(input, op);
  IR_NODE_LINK_TO(input, quantize_node);
  IR_NODE_LINK_TO(quantize_node, quantized_var);
  IR_NODE_LINK_TO(quantized_var, op);

  if (!scale_attr_name.empty()) {
    op->Op()->SetAttr(scale_attr_name, quant_scale);
  }
}
// Inserts a `dequantize` op behind one output of `op`.
//
//   before: op -> output
//   after:  op -> <new variable> -> dequantize -> output
//
// g               graph in which the new nodes are created.
// op              op node whose output is being dequantized.
// output          variable node currently written by `op`; it becomes the
//                 output of the dequantize op.
// output_name     name of the output slot of `op` that is redirected to the
//                 newly created variable.
// scale_to_one    multiplied by 255 (unsigned) or 127 (signed) to obtain the
//                 "Scale" attribute of the dequantize op.
// is_unsigned     selects the 255 range.
// scale_attr_name when non-empty, the computed scale is also written to `op`
//                 under this attribute name.
void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
                                       std::string output_name,
                                       double scale_to_one, bool is_unsigned,
                                       std::string scale_attr_name) const {
  const unsigned range_max = is_unsigned ? U8_MAX : S8_MAX;
  float dequant_scale = scale_to_one * range_max;

  // Variable that carries the still-quantized data out of `op`.
  VarDesc raw_out_desc(patterns::PDNodeName("dequantize", "in"));
  auto* raw_out_var = g->CreateVarNode(&raw_out_desc);
  const std::vector<std::string> raw_out_names{raw_out_var->Name()};

  // Describe the dequantize op; CreateOpNode copies the description.
  OpDesc dequantize_desc;
  dequantize_desc.SetType("dequantize");
  dequantize_desc.SetInput("Input", raw_out_names);
  dequantize_desc.SetOutput("Output",
                            std::vector<std::string>{output->Name()});
  dequantize_desc.SetAttr("Scale", dequant_scale);
  auto* dequantize_node = g->CreateOpNode(&dequantize_desc);

  // Redirect the op's output slot to the new intermediate variable.
  op->Op()->SetOutput(output_name, raw_out_names);

  // Rewire the graph edges: op -> raw_out_var -> dequantize -> output.
  UnlinkNodes(op, output);
  IR_NODE_LINK_TO(op, raw_out_var);
  IR_NODE_LINK_TO(raw_out_var, dequantize_node);
  IR_NODE_LINK_TO(dequantize_node, output);

  if (!scale_attr_name.empty()) {
    op->Op()->SetAttr(scale_attr_name, dequant_scale);
  }
}
// Quantizes every conv2d op matched by patterns::ConvResidual whose
// "use_quantizer" attribute is present and true.
//
// For each such op a quantize op is inserted on "Input" (and on
// "ResidualData" when `with_residual_data` is true), a dequantize op is
// inserted on "Output", and the attributes "Scale_in", "Scale_weights",
// "Scale_out" (plus "Scale_in_eltwise" for residual data) are set on the conv.
//
// graph              graph to rewrite.
// with_residual_data forwarded to the pattern; when true the residual input
//                    is quantized as well.
//
// The number of quantized ops is reported through AddStatis and
// PrettyLogDetail.
void CPUQuantizePass::QuantizeConv(Graph* graph,
                                   bool with_residual_data) const {
  GraphPatternDetector gpd;
  auto pattern = gpd.mutable_pattern();
  patterns::ConvResidual conv_pattern{pattern, name_scope_};
  conv_pattern(with_residual_data);

  int quantize_conv_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "Quantize conv2d op";
    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
    auto* conv_op_desc = conv_op->Op();

    // skip if should not be quantized
    if (!conv_op_desc->HasAttr("use_quantizer") ||
        !boost::get<bool>(conv_op_desc->GetAttr("use_quantizer")))
      return;

    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);

    // get scales calculated after warmup, they scale variables to MAX=1.0
    auto scales = Get<VarQuantScale>("quant_var_scales");

    auto input_scale = scales[conv_input->Name()].second.data<double>()[0];
    bool is_input_unsigned = scales[conv_input->Name()].first;
    QuantizeInput(g, conv_op, conv_input, "Input", input_scale,
                  is_input_unsigned, "Scale_in");

    // Weights are always quantized to the signed range. The scaled values are
    // computed into a fresh vector instead of multiplying the tensor in
    // place: the tensor obtained from the scale map may share its buffer with
    // the pass attribute (NOTE(review): depends on LoDTensor copy semantics),
    // and an in-place update would then scale a filter that is used by
    // several convolutions more than once.
    auto filter_scale_tensor = scales[conv_filter->Name()].second;
    const double* scale_first = filter_scale_tensor.data<double>();
    const double* scale_last = scale_first + filter_scale_tensor.numel();
    std::vector<float> filter_scale;
    filter_scale.reserve(
        static_cast<std::vector<float>::size_type>(scale_last - scale_first));
    for (const double* it = scale_first; it != scale_last; ++it) {
      filter_scale.push_back(
          static_cast<float>(*it * static_cast<double>(S8_MAX)));
    }

    conv_op->Op()->SetAttr("Scale_weights", filter_scale);

    if (with_residual_data) {
      GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
                                conv_pattern);
      auto residual_scale =
          scales[conv_residual_data->Name()].second.data<double>()[0];
      bool is_residual_unsigned = scales[conv_residual_data->Name()].first;

      QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
                    residual_scale, is_residual_unsigned, "Scale_in_eltwise");
    }

    auto output_scale = scales[conv_output->Name()].second.data<double>()[0];
    bool is_output_unsigned = scales[conv_output->Name()].first;
    DequantizeOutput(g, conv_op, conv_output, "Output", output_scale,
                     is_output_unsigned, "Scale_out");

    ++quantize_conv_count;
  };

  gpd(graph, handler);
  AddStatis(quantize_conv_count);

  std::stringstream msg_ss;
  msg_ss << "---    quantized " << quantize_conv_count << " conv2d ops";
  if (with_residual_data) msg_ss << " with residual connection";
  PrettyLogDetail(msg_ss.str().c_str());
}
// Quantizes every pool2d op matched by patterns::Pool whose "use_quantizer"
// attribute is present and true: a quantize op is inserted on input "X" and a
// dequantize op on output "Out". No scale attribute is written to the pool op
// itself. The number of quantized ops is reported through AddStatis and
// PrettyLogDetail.
void CPUQuantizePass::QuantizePool(Graph* graph) const {
  GraphPatternDetector gpd;
  patterns::Pool pool_pattern{gpd.mutable_pattern(), name_scope_};
  pool_pattern();

  int quantize_pool_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "Quantize pool2d op";
    GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
    auto* pool_op_desc = pool_op->Op();

    // Leave the op untouched unless quantization was explicitly requested.
    const bool quantize_requested =
        pool_op_desc->HasAttr("use_quantizer") &&
        boost::get<bool>(pool_op_desc->GetAttr("use_quantizer"));
    if (!quantize_requested) {
      return;
    }

    GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);

    // Scales computed after warmup; they scale variables to MAX=1.0.
    auto scales = Get<VarQuantScale>("quant_var_scales");

    auto& input_entry = scales[pool_input->Name()];
    QuantizeInput(g, pool_op, pool_input, "X",
                  input_entry.second.data<double>()[0], input_entry.first);

    auto& output_entry = scales[pool_output->Name()];
    DequantizeOutput(g, pool_op, pool_output, "Out",
                     output_entry.second.data<double>()[0],
                     output_entry.first);

    ++quantize_pool_count;
  };

  gpd(graph, handler);
  AddStatis(quantize_pool_count);
  PrettyLogDetail("---    quantized %d pool2d ops", quantize_pool_count);
}
// Entry point of the pass. Checks that a graph and a parameter scope are
// available, then quantizes conv2d ops with residual data, conv2d ops without
// residual data, and pool2d ops, in that order. Returns the same graph object
// it received.
std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Quantizing the graph.";
  PADDLE_ENFORCE(graph.get());

  Graph* const target = graph.get();
  FusePassBase::Init(name_scope_, target);
  PADDLE_ENFORCE(param_scope());

  QuantizeConv(target, /* with_residual_data = */ true);
  QuantizeConv(target, /* with_residual_data = */ false);
  QuantizePool(target);

  return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Registers CPUQuantizePass under the name "cpu_quantize_pass".
// "quant_var_scales" is declared as a required pass attribute; QuantizeConv
// and QuantizePool read it as a VarQuantScale map.
REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass)
    .RequirePassAttr("quant_var_scales");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Map variable name to tensor of scaling factors scaling it to MAX=1.0.
* bool denotes whether quantization of the variable should be done to unsigned
* type.
*/
using VarQuantScale =
std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
/*
 * Quantize all supported operators.
 *
 * The implementation handles conv2d (with and without residual data) and
 * pool2d ops whose "use_quantizer" attribute is true, wrapping them with
 * quantize/dequantize ops. Scales are taken from the "quant_var_scales" pass
 * attribute (a VarQuantScale).
 */
class CPUQuantizePass : public FusePassBase {
 public:
  virtual ~CPUQuantizePass() {}

 protected:
  // Runs the pass on `graph` and returns the rewritten graph.
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;

  // Quantizes matching conv2d ops; `with_residual_data` selects whether the
  // "ResidualData" input is quantized as well.
  void QuantizeConv(Graph* graph, bool with_residual_data = false) const;

  // Quantizes matching pool2d ops.
  void QuantizePool(Graph* graph) const;

  // Inserts a quantize op between `input` and `op` on slot `input_name`.
  // The quantize scale is `scale_to_one` times 255 (unsigned) or 127 (signed);
  // when `scale_attr_name` is non-empty the scale is also stored on `op`.
  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                     double scale_to_one, bool is_unsigned,
                     std::string scale_attr_name = "") const;

  // Inserts a dequantize op between `op` and `output` on slot `output_name`.
  // Scale handling mirrors QuantizeInput.
  void DequantizeOutput(Graph* g, Node* op, Node* output,
                        std::string output_name, double scale_to_one,
                        bool is_unsigned,
                        std::string scale_attr_name = "") const;

  // Name scope passed to the pattern detectors and to FusePassBase::Init.
  const std::string name_scope_{"quantize"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace ir {
// Appends an op of the given `type` to block 0 of `prog`.
//
// Every op receives the "use_mkldnn" and "name" attributes. Depending on the
// type, the following is wired up from `inputs` / `outputs`:
//   conv2d  - Input, Filter, optional Bias (3rd input), optional ResidualData
//             (4th input, also drives "fuse_residual_connection"), Output,
//             "use_quantizer" and unit Scale_in / Scale_out / Scale_weights.
//   pool2d  - X, Out and "use_quantizer".
//   dropout - X, Out.
//   fc      - Input, optional W and Bias, Out.
// Any other type gets only the common attributes.
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs, bool use_mkldnn,
           bool use_quantizer = false) {
  using Names = std::vector<std::string>;

  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  op->SetAttr("use_mkldnn", use_mkldnn);
  op->SetAttr("name", name);

  if (type == "conv2d") {
    const bool has_bias = inputs.size() > 2;
    const bool has_residual = inputs.size() > 3;

    op->SetInput("Input", {inputs[0]});
    op->SetInput("Filter", {inputs[1]});
    op->SetInput("Bias", has_bias ? Names{inputs[2]} : Names{});
    op->SetInput("ResidualData", has_residual ? Names{inputs[3]} : Names{});
    op->SetAttr("fuse_residual_connection", has_residual);
    op->SetOutput("Output", {outputs[0]});
    op->SetAttr("use_quantizer", use_quantizer);
    op->SetAttr("Scale_in", 1.0f);
    op->SetAttr("Scale_out", 1.0f);
    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
  } else if (type == "pool2d" || type == "dropout") {
    op->SetInput("X", {inputs[0]});
    op->SetOutput("Out", {outputs[0]});
    // Only pool2d carries the quantizer flag.
    if (type == "pool2d") {
      op->SetAttr("use_quantizer", use_quantizer);
    }
  } else if (type == "fc") {
    op->SetInput("Input", {inputs[0]});
    if (inputs.size() > 1) {
      op->SetInput("W", {inputs[1]});
    }
    if (inputs.size() > 2) {
      op->SetInput("Bias", {inputs[2]});
    }
    op->SetOutput("Out", {outputs[0]});
  }
}
// Names of all variables used by the test program. BuildProgramDesc marks the
// ones starting with "w" or "b" as persistable, and MainTest creates a scale
// entry for each of them.
static const std::initializer_list<std::string> variable_names{
    "a",  "w1", "c", "d", "w2", "e",  "f", "g",
    "h",  "w3", "b1", "i", "j",  "w4", "b2"};
// Builds the test program. Topology (as wired by the SetOp calls below):
//
// (a,w1)->Conv1->c and c->Pool1->d
//
// (d,w2)->Conv2->e and e->Pool2->f
//
// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j
//
// (c,w4,b2)->Conv4->i
//
// Variables whose name starts with "w" or "b" are marked persistable.
// `use_quantizer` is forwarded to the conv2d and pool2d ops only.
ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
  ProgramDesc prog;

  auto* block = prog.MutableBlock(0);
  for (const auto& var_name : variable_names) {
    auto* var = block->Var(var_name);
    const bool is_parameter = var_name.compare(0, 1, "w") == 0 ||
                              var_name.compare(0, 1, "b") == 0;
    if (is_parameter) {
      var->SetPersistable(true);
    }
  }

  // First conv/pool pair.
  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn,
        use_quantizer);
  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer);

  // Second conv/pool pair.
  SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn,
        use_quantizer);
  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);

  // Non-quantized branch feeding the conv with residual data.
  SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
  SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn);
  SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
        use_quantizer);

  // Producer of the residual input `i`.
  SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn,
        use_quantizer);

  return prog;
}
// Obtains variable `var_name` from `scope`, takes it as a mutable LoDTensor
// and requests FP32 storage for it on `place` with the default allocator
// attribute (the trailing argument 1 is passed through to mutable_data
// unchanged).
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
                      const char* var_name) {
  auto* holder = scope->Var(var_name)->GetMutable<LoDTensor>();
  holder->mutable_data(place, proto::VarType::FP32,
                       ::paddle::memory::Allocator::kDefault, 1);
}
void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
int quant_count, int dequant_count, int added_nodes_count,
float scale) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
// Init scope, as it is used in pass
auto place = paddle::platform::CPUPlace();
NaiveExecutor exe{place};
Scope scope;
exe.CreateVariables(prog, 0, true, &scope);
auto* scales = new VarQuantScale();
for (auto& v : variable_names) {
InitTensorHolder(&scope, place, v.c_str());
LoDTensor tensor;
tensor.Resize({1});
auto* ptr = tensor.mutable_data<double>(place);
ptr[0] = 2.0;
(*scales)[v] = std::make_pair(false, std::move(tensor));
}
graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
pass->Set("quant_var_scales", scales);
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
int quantize_nodes_count = 0;
int dequantize_nodes_count = 0;
int conv2d_nodes_count = 0;
int pool2d_nodes_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "conv2d") {
conv2d_nodes_count++;
auto op_name = boost::get<std::string>(op->GetAttr("name"));
EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_in")), scale)
<< "Scale_in for node '" + op_name + "'.";
EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
<< "Scale_out for node '" + op_name + "'.";
EXPECT_EQ(
boost::get<std::vector<float>>(op->GetAttr("Scale_weights"))[0],
scale)
<< "Scale_weights for node '" + op_name + "'.";
} else if (op->Type() == "pool2d") {
pool2d_nodes_count++;
} else if (op->Type() == "quantize") {
quantize_nodes_count++;
} else if (op->Type() == "dequantize") {
dequantize_nodes_count++;
}
}
}
EXPECT_EQ(conv2d_nodes_count, conv_count);
EXPECT_EQ(pool2d_nodes_count, pool_count);
EXPECT_EQ(quantize_nodes_count, quant_count);
EXPECT_EQ(dequantize_nodes_count, dequant_count);
EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}
// With use_quantizer enabled every conv2d and pool2d op is expected to be
// wrapped with quantize/dequantize ops, and every conv scale attribute is
// expected to equal 2.0 * 127 (scale value 2.0, signed range).
TEST(CpuQuantizePass, quantize) {
  bool use_mkldnn = true;
  bool use_quantizer = true;
  // Expected graph after the pass (INx/OUTx are the variables created by
  // QuantizeInput/DequantizeOutput):
  //
  // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
  // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
  //
  // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and
  // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f
  //
  // d->Dropout1->g and g->Fc1->h and
  // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j
  //
  // (c->QUANT7->IN7,w4,b2)->Conv4->OUT6->DEQUANT6->i
  // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT
  int added_nodes = 7 + 7 + 6 + 6;
  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes,
           2.0f * 127);
}
// With use_quantizer disabled the pass must leave the graph untouched: no
// quantize/dequantize ops, no added nodes, and the conv scale attributes keep
// the 1.0 set by SetOp.
TEST(CpuQuantizePass, do_not_quantize) {
  bool use_mkldnn = true;
  bool use_quantizer = false;
  int added_nodes = 0;
  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes,
           1.0f);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(cpu_quantize_pass);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
#include <string>
#include <unordered_set>
namespace paddle {
namespace framework {
namespace ir {
// Marks the operators that are to be quantized by setting their
// "use_quantizer" attribute to true.
//
// An op node is marked when all of the following hold:
//   - its node id is not in the "quantize_excluded_op_ids" pass attribute;
//   - the op already has a "use_quantizer" attribute, either set or declared
//     in its proto;
//   - the "quantize_enabled_op_types" pass attribute is empty, or contains
//     the node's name.
// Ops that do not qualify are left unchanged. Returns the same graph object
// it received.
std::unique_ptr<ir::Graph> CPUQuantizePlacementPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Marks operators which are to be quantized.";
  const auto& excluded_ids_list =
      Get<std::unordered_set<int>>("quantize_excluded_op_ids");
  const auto& op_types_list =
      Get<std::unordered_set<std::string>>("quantize_enabled_op_types");

  for (const Node* n : graph->Nodes()) {
    if (!n->IsOp()) continue;

    // Hash lookup instead of a linear std::find over the unordered_set.
    if (excluded_ids_list.count(n->id()) != 0) continue;

    auto* op = n->Op();
    if (!op->HasAttr("use_quantizer") && !op->HasProtoAttr("use_quantizer")) {
      continue;
    }

    // An empty type set enables every op that supports the attribute.
    if (op_types_list.empty() || op_types_list.count(n->Name()) != 0) {
      op->SetAttr("use_quantizer", true);
    }
  }
  return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(cpu_quantize_placement_pass,
              paddle::framework::ir::CPUQuantizePlacementPass)
    // an unordered_set<std::string> of operator type names to be quantized
    // ("conv2d" etc.); an empty set enables all supporting operators
    .RequirePassAttr("quantize_enabled_op_types")
    // an unordered_set<int> of operator node ids that are to be excluded from
    // quantization
    .RequirePassAttr("quantize_excluded_op_ids");
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
 * Specifies which operators should be quantized.
 *
 * The pass sets the "use_quantizer" attribute to true on selected op nodes;
 * selection is driven by the "quantize_enabled_op_types" and
 * "quantize_excluded_op_ids" pass attributes.
 */
class CPUQuantizePlacementPass : public Pass {
 protected:
  // Marks the selected ops in `graph` and returns the graph.
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
#include <gtest/gtest.h>
#include <boost/logic/tribool.hpp>
namespace paddle {
namespace framework {
namespace ir {
// Appends an op of the given `type` to block 0 of `prog`.
//
// use_quantizer is tri-state: when it is indeterminate the "use_quantizer"
// attribute is not set at all, otherwise it is set to the given value.
// Supported types are conv2d, relu, concat and pool2d; any other type
// triggers a gtest failure. The first element of `outputs` is always wired to
// the "Out" slot.
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs,
           boost::tribool use_quantizer) {
  auto* op = prog->MutableBlock(0)->AppendOp();

  op->SetType(type);

  // Leave the attribute absent for the "none" rows of the test program.
  if (!boost::indeterminate(use_quantizer))
    op->SetAttr("use_quantizer", use_quantizer);

  if (type == "conv2d") {
    op->SetAttr("name", name);
    op->SetInput("Input", {inputs[0]});
    op->SetInput("Filter", {inputs[1]});
    op->SetInput("Bias", {inputs[2]});
  } else if (type == "relu") {
    op->SetInput("X", inputs);
  } else if (type == "concat") {
    op->SetAttr("axis", 1);
    op->SetInput("X", {inputs[0], inputs[1]});
  } else if (type == "pool2d") {
    op->SetInput("X", {inputs[0]});
  } else {
    FAIL() << "Unexpected operator type.";
  }
  op->SetOutput("Out", {outputs[0]});
}
// Builds the test program. All variables are of type SELECTED_ROWS; only
// "weights" and "bias" are marked persistable.
//
// operator                            use_quantizer
// ---------------------------------------------------
// (a,b)->concat->c                    none
// (c,weights,bias)->conv->f           false
// f->relu->g                          none
// g->pool->h                          false
// (h,weights2,bias2)->conv->k         false
// k->pool->l                          false
ProgramDesc BuildProgramDesc() {
  ProgramDesc prog;

  const std::vector<std::string> var_names{"a", "b", "c",        "weights",
                                           "bias", "f", "g",     "h",
                                           "weights2", "bias2", "k", "l"};
  auto* block = prog.MutableBlock(0);
  for (const auto& var_name : var_names) {
    auto* var = block->Var(var_name);
    var->SetType(proto::VarType::SELECTED_ROWS);

    const bool persistable = (var_name == "weights") || (var_name == "bias");
    if (persistable) {
      var->SetPersistable(true);
    }
  }

  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate);
  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false);
  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate);
  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false);
  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false);
  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false);

  return prog;
}
// Runs cpu_quantize_placement_pass on the test program with the given enabled
// op types and excluded op ids, then checks that exactly
// `expected_use_quantizer_true_count` op nodes end up with
// "use_quantizer" == true.
void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
              std::initializer_list<int> quantize_excluded_op_ids,
              unsigned expected_use_quantizer_true_count) {
  auto prog = BuildProgramDesc();
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));

  auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass");

  // The attribute sets are heap-allocated and handed over to the pass.
  auto* enabled_types =
      new std::unordered_set<std::string>(quantize_enabled_op_types);
  pass->Set("quantize_enabled_op_types", enabled_types);
  auto* excluded_ids = new std::unordered_set<int>(quantize_excluded_op_ids);
  pass->Set("quantize_excluded_op_ids", excluded_ids);

  graph = pass->Apply(std::move(graph));

  unsigned marked_ops = 0;
  for (auto* node : graph->Nodes()) {
    if (!node->IsOp()) {
      continue;
    }
    auto* op = node->Op();
    if (!op->HasAttr("use_quantizer")) {
      continue;
    }
    if (boost::get<bool>(op->GetAttr("use_quantizer"))) {
      ++marked_ops;
    }
  }

  EXPECT_EQ(marked_ops, expected_use_quantizer_true_count);
}
// Only pool2d is enabled: both pool ops are expected to be marked.
TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); }

// Only conv2d is enabled and node id 4 is excluded: one conv is expected to
// be marked. NOTE(review): presumably id 4 belongs to one of the two conv2d
// nodes - confirm against the graph's id assignment.
TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
  MainTest({"conv2d"}, {4}, 1);
}

// An empty type set enables every op that carries "use_quantizer".
TEST(QuantizerPlacementPass, excluded_none) {
  // 2 conv + 2 pool
  MainTest({}, {}, 4);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(cpu_quantize_placement_pass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogDetail;
// Counts, for every dequantize output node matched by patterns::DequantAny,
// how many times the pattern matched it. The counts are accumulated into
// `nodes_keep_counter` (keyed by the output node) and the total number of
// matches is reported through AddStatis.
void CPUQuantizeSquashPass::FindNodesToKeep(
    Graph* graph,
    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
  GraphPatternDetector gpd;
  patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "deqant_any"};
  deq_any_pattern();

  int found_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern);

    // operator[] value-initializes a missing counter to 0, so the first match
    // yields 1 and every further match adds 1.
    ++(*nodes_keep_counter)[dequant_out];
    ++found_count;
  };

  gpd(graph, handler);
  AddStatis(found_count);
}
// Squashes every dequantize -> quantize pair matched by the DequantQuantAny
// pattern in `graph`.
//
//  * Equal scales: the pair is a no-op, so the consumer (next_op) is rewired
//    to read the dequantize input directly and the quantize op/output are
//    removed.
//  * Different scales: the pair is replaced by one "requantize" op that reads
//    the dequantize input and writes the original quantize output.
//
// `nodes_keep_counter` (filled by FindNodesToKeep) holds, for each dequantize
// output, the number of consumers that have not been squashed yet. The
// dequantize op and its output are removed only when the pair being squashed
// is the last remaining consumer; the counter is decremented on every match.
void CPUQuantizeSquashPass::Squash(
    Graph* graph,
    std::unordered_map<const Node*, int>* nodes_keep_counter) const {
  GraphPatternDetector gpd;
  patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"};
  squash_pattern();

  int found_squash_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "squash dequantize-quantize ops pair";

    GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern);

    auto* next_op_desc = next_op->Op();
    float dequant_scale = boost::get<float>(dequant_op->Op()->GetAttr("Scale"));
    float quant_scale = boost::get<float>(quant_op->Op()->GetAttr("Scale"));
    PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) !=
                   nodes_keep_counter->end());

    // check if dequantize op should be kept or removed, decrease the counter
    bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;

    if (dequant_scale == quant_scale) {
      // squash dequantize-quantize to nothing
      const std::string quant_out_var_name = quant_out->Name();
      const std::string dequant_in_var_name = dequant_in->Name();
      const auto next_op_inputs = next_op_desc->InputNames();
      for (const auto& name : next_op_inputs) {
        // Work on a copy of the slot so that (a) an empty slot is simply
        // skipped instead of being indexed, (b) the variable is found at any
        // position in the slot, and (c) the other variables of a
        // multi-variable slot are preserved.
        std::vector<std::string> slot_vars = next_op_desc->Input(name);
        bool rewired = false;
        for (auto& var_name : slot_vars) {
          if (var_name == quant_out_var_name) {
            var_name = dequant_in_var_name;
            rewired = true;
          }
        }
        if (rewired) {
          next_op_desc->SetInput(name, slot_vars);
        }
      }

      if (keep_dequant)
        GraphSafeRemoveNodes(graph, {quant_op, quant_out});
      else
        GraphSafeRemoveNodes(graph,
                             {dequant_op, quant_op, dequant_out, quant_out});

      IR_NODE_LINK_TO(dequant_in, next_op);

      found_squash_count++;
    } else {
      // squash dequantize-quantize to requantize op
      OpDesc desc;
      desc.SetType("requantize");
      desc.SetInput("Input", std::vector<std::string>({dequant_in->Name()}));
      desc.SetOutput("Output", std::vector<std::string>({quant_out->Name()}));
      desc.SetAttr("Scale_in", dequant_scale);
      desc.SetAttr("Scale_out", quant_scale);

      auto requant_op = g->CreateOpNode(&desc);

      if (keep_dequant)
        GraphSafeRemoveNodes(graph, {quant_op});
      else
        GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out});

      IR_NODE_LINK_TO(dequant_in, requant_op);
      IR_NODE_LINK_TO(requant_op, quant_out);

      found_squash_count++;
    }
  };

  gpd(graph, handler);
  AddStatis(found_squash_count);
  PrettyLogDetail("---    squashed %d dequantize-quantize pairs",
                  found_squash_count);
}
// Pass entry point: counts the consumers of every dequantize output, then
// squashes the dequantize-quantize pairs. The same graph object is returned.
std::unique_ptr<ir::Graph> CPUQuantizeSquashPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
  Graph* const raw_graph = graph.get();
  FusePassBase::Init("cpu_quantize_squash_pass", raw_graph);

  // Filled by FindNodesToKeep and consumed (decremented) by Squash.
  std::unordered_map<const Node*, int> keep_counter;
  FindNodesToKeep(raw_graph, &keep_counter);
  Squash(raw_graph, &keep_counter);

  return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(cpu_quantize_squash_pass,
paddle::framework::ir::CPUQuantizeSquashPass);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Squash dequantize->quantize pair pattern into requantize op
*/
class CPUQuantizeSquashPass : public FusePassBase {
public:
virtual ~CPUQuantizeSquashPass() {}
protected:
// Pass entry point: runs FindNodesToKeep and then Squash on the graph and
// returns that same graph.
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
/*
* For each dequantize's output find the number of operators it is an input to
* and store it in nodes_keep_counter, keyed by the output node.
*/
void FindNodesToKeep(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const;
/*
* Squash dequantize-quantize ops pairs into requantize (scales differ) or
* nothing (scales are equal). nodes_keep_counter is decremented per squashed
* pair; the dequantize op is removed only with its last remaining consumer.
*/
void Squash(Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const;
// Name-scope string of this pass.
const std::string name_scope_{"squash"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
namespace ir {
// Appends an op of the given `type` to block 0 of `prog` and sets its
// "use_mkldnn" and "name" attributes.
//  * "conv2d":     Input = inputs[0], optional Filter = inputs[1] and
//                  Bias = inputs[2], Output = outputs[0].
//  * "quantize" / "dequantize": Input = inputs[0], Output = outputs[0], and
//                  the "Scale" attribute is set to `scale`.
// Any other type gets only the two common attributes.
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs, bool use_mkldnn,
           float scale = 0) {
  auto* op_desc = prog->MutableBlock(0)->AppendOp();
  op_desc->SetType(type);
  op_desc->SetAttr("use_mkldnn", use_mkldnn);
  op_desc->SetAttr("name", name);

  const bool is_conv = (type == "conv2d");
  const bool is_scale_op = (type == "quantize" || type == "dequantize");
  if (!is_conv && !is_scale_op) return;

  op_desc->SetInput("Input", {inputs[0]});
  if (is_conv) {
    if (inputs.size() > 1) op_desc->SetInput("Filter", {inputs[1]});
    if (inputs.size() > 2) op_desc->SetInput("Bias", {inputs[2]});
  }
  op_desc->SetOutput("Output", {outputs[0]});
  if (is_scale_op) {
    op_desc->SetAttr("Scale", scale);
  }
}
// (a,w1,b1)->Conv1->d
// d->Dequant->e
// e->Quant->f
// (f,w2,b2)->Conv2->i
// Builds the linear test program
//   Conv1 -> Dequant(scale1) -> Quant(scale2) -> Conv2
// Variables whose name starts with 'w' or 'b' (filters and biases) are marked
// persistable.
ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) {
  ProgramDesc prog;

  const std::vector<std::string> var_names = {"a",  "w1", "b1", "d", "e",
                                              "f",  "w2", "b2", "i"};
  for (const std::string& var_name : var_names) {
    auto* var_desc = prog.MutableBlock(0)->Var(var_name);
    const bool is_weight_or_bias =
        !var_name.empty() && (var_name[0] == 'w' || var_name[0] == 'b');
    if (is_weight_or_bias) {
      var_desc->SetPersistable(true);
    }
  }

  SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn);
  SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1);
  SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2);
  SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn);

  return prog;
}
// Variable names declared by BuildProgramDesc2; MainTest additionally calls
// InitTensorHolder on the scope for each of these names.
static const std::initializer_list<std::string> variable_names{
"a", "b", "c", "d", "e", "f", "g", "h"};
// a->Conv1->b
// b->Dequant->c
//
// c->Quant1->d and d->Conv2->e
//
// c->Conv3->f
//
// c->Quant2->g and g->Conv4->h
//
// Builds the branching test program: Conv1 feeds Dequant(scale1), whose output
// c is consumed by three branches -- Quant1(scale2) -> Conv2, Conv3 directly,
// and Quant2(scale3) -> Conv4.
ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2,
                              float scale3) {
  ProgramDesc prog;
  for (const std::string& var_name : variable_names) {
    prog.MutableBlock(0)->Var(var_name);
  }

  // Trunk.
  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn);
  SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1);

  // Branch 1: quantize again, then convolve.
  SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2);
  SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn);

  // Branch 2: consume the dequantized value directly.
  SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn);

  // Branch 3: quantize with a third scale, then convolve.
  SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3);
  SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn);

  return prog;
}
// Gets (or creates) variable `var_name` in `scope` as a LoDTensor and requests
// FP32 storage for it on `place`.
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
                      const char* var_name) {
  auto* tensor = scope->Var(var_name)->GetMutable<LoDTensor>();
  tensor->mutable_data(place, proto::VarType::FP32,
                       ::paddle::memory::Allocator::kDefault, 1);
}
void MainTest(const ProgramDesc& prog, int removed_nodes_num) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
// Init scope, as it is used in pass
auto place = paddle::platform::CPUPlace();
NaiveExecutor exe{place};
Scope scope;
exe.CreateVariables(prog, 0, true, &scope);
for (auto& v : variable_names) {
InitTensorHolder(&scope, place, v.c_str());
}
graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num);
}
// With equal scales the dequantize-quantize pair is dropped completely.
TEST(CpuQuantizeSquashPass, equal_scales) {
  const float scale = 1.2345f;
  // Remove 4 nodes: Dequant, Quant, e, f
  const int remove_nodes = 4;

  // Same expectation with use_mkldnn on and off.
  for (bool use_mkldnn : {true, false}) {
    MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes);
  }
}
// With different scales the pair is replaced by a single requantize op.
TEST(CpuQuantizeSquashPass, inequal_scales) {
  const float scale1 = 1.2345f;
  const float scale2 = 21.0f;
  // Remove 3 nodes: Dequant, Quant, e
  // Insert 1 node: requantize
  // => the graph shrinks by 2 nodes overall.
  const int remove_nodes = 2;

  // Same expectation with use_mkldnn on and off.
  for (bool use_mkldnn : {true, false}) {
    MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes);
  }
}
// The dequantize output feeds three branches: an equal-scale quantize, a plain
// conv, and a different-scale quantize. Both quantize ops are deleted, the
// dequantize is bypassed on both quantized branches (but kept for the plain
// conv branch), and a requantize is inserted on the different-scale branch.
TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) {
  const float scale = 1.2345f;
  const float scale2 = 21.0f;
  // Remove 3 nodes: Quant1 and its output d (equal scales), Quant2
  // Insert 1 node: requantize (writes Quant2's former output g)
  // => the graph shrinks by 2 nodes overall.
  const int remove_nodes = 2;

  // Same expectation with use_mkldnn on and off.
  for (bool use_mkldnn : {true, false}) {
    MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2),
             remove_nodes);
  }
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(cpu_quantize_squash_pass);
...@@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph, ...@@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph,
ValidateByNodeRole(&subgraphs); ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return; if (subgraphs.empty()) return;
PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); PrettyLogEndl(Style::detail(), "--- detected %d subgraphs",
subgraphs.size());
int id = 0; int id = 0;
for (auto &g : subgraphs) { for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph"; VLOG(3) << "optimizing #" << id++ << " subgraph";
...@@ -1074,9 +1075,53 @@ PDNode *patterns::Conv::operator()() { ...@@ -1074,9 +1075,53 @@ PDNode *patterns::Conv::operator()() {
->AsOutput() ->AsOutput()
->assert_is_op_output("conv2d", "Output"); ->assert_is_op_output("conv2d", "Output");
conv_op->LinksFrom({input_var, filter_var}); conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var});
conv_op->LinksTo({output_var}); return output_var;
}
// Builds a conv2d pattern with Input and Filter inputs and an Output.
// When with_residual_data is false the matched op must have
// fuse_residual_connection == false; when true, an additional "ResidualData"
// input node is added to the pattern. Returns the conv output node.
PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
if (!with_residual_data)
conv_op->assert_op_attr("fuse_residual_connection", false);
auto input_var = pattern->NewNode(conv_input_repr())
->AsInput()
->assert_is_op_input("conv2d", "Input");
auto filter_var = pattern->NewNode(conv_filter_repr())
->AsInput()
->assert_is_op_input("conv2d", "Filter");
auto output_var = pattern->NewNode(conv_output_repr())
->AsOutput()
->assert_is_op_output("conv2d", "Output");
// Inputs linked into conv_op; the residual input is appended only on request.
std::vector<PDNode *> links_from{input_var, filter_var};
if (with_residual_data) {
auto res_conn_var = pattern->NewNode(conv_residual_data_repr())
->AsInput()
->assert_is_op_input("conv2d", "ResidualData");
links_from.push_back(res_conn_var);
}
conv_op->LinksFrom(links_from).LinksTo({output_var});
return output_var;
}
PDNode *patterns::Pool::operator()() {
auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d");
auto input_var = pattern->NewNode(pool_input_repr())
->AsInput()
->assert_is_op_input("pool2d", "X");
auto output_var = pattern->NewNode(pool_output_repr())
->AsOutput()
->assert_is_op_output("pool2d", "Out");
pool_op->LinksFrom({input_var}).LinksTo({output_var});
return output_var; return output_var;
} }
...@@ -1301,6 +1346,51 @@ PDNode *patterns::ConvAffineChannel::operator()( ...@@ -1301,6 +1346,51 @@ PDNode *patterns::ConvAffineChannel::operator()(
return ac_out_var; return ac_out_var;
} }
// Builds the chain
//   dequant_in -> dequantize -> dequant_out -> quantize -> quant_out -> next_op
// where next_op is any op consuming quant_out. Returns the quant_out node.
PDNode *patterns::DequantQuantAny::operator()() {
auto *dequant_in = pattern->NewNode(dequant_in_repr())
->AsInput()
->assert_is_op_input("dequantize", "Input");
auto *dequant_op =
pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
auto *dequant_out = pattern->NewNode(dequant_out_repr())
->AsOutput()
->assert_is_op_output("dequantize", "Output");
auto *quant_op = pattern->NewNode(quant_op_repr())
->assert_is_op("quantize")
->AsIntermediate();
auto *quant_out = pattern->NewNode(quant_out_repr())
->AsOutput()
->assert_is_op_output("quantize");
// assert_is_op() without a type: the consumer may be any op.
auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out});
quant_op->LinksFrom({dequant_out}).LinksTo({quant_out});
next_op->LinksFrom({quant_out});
return quant_out;
}
// Builds the chain dequantize -> dequant_out -> next_op, where next_op is any
// op consuming the dequantize output. Returns the dequant_out node.
PDNode *patterns::DequantAny::operator()() {
auto *dequant_op =
pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize");
auto *dequant_out = pattern->NewNode(dequant_out_repr())
->AsOutput()
->assert_is_op_output("dequantize", "Output");
// assert_is_op() without a type: the consumer may be any op.
auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
dequant_op->LinksTo({dequant_out});
next_op->LinksFrom({dequant_out});
return dequant_out;
}
// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a // a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b // b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
// ... // ...
......
...@@ -18,8 +18,11 @@ ...@@ -18,8 +18,11 @@
#include <gtest/gtest_prod.h> #include <gtest/gtest_prod.h>
#endif #endif
#include <memory>
#include <numeric> #include <numeric>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -656,6 +659,35 @@ struct Conv : public PatternBase { ...@@ -656,6 +659,35 @@ struct Conv : public PatternBase {
PATTERN_DECL_NODE(conv_output); PATTERN_DECL_NODE(conv_output);
}; };
// Convolution op with residual data
struct ConvResidual : public PatternBase {
ConvResidual(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_residual") {}
// Builds the pattern; with_residual_data selects whether a residual-data
// input node is part of it.
PDNode* operator()(bool with_residual_data);
// Nodes that make up the pattern.
PATTERN_DECL_NODE(conv_op);
PATTERN_DECL_NODE(conv_input);
PATTERN_DECL_NODE(conv_filter);
PATTERN_DECL_NODE(conv_residual_data);
PATTERN_DECL_NODE(conv_output);
};
// Pool op
// Forward pass for pooling.
// pool_input is the input.
// pool_output is a result of the operator.
struct Pool : public PatternBase {
Pool(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "pooling") {}
// Builds the pattern and returns a pattern node.
PDNode* operator()();
// Nodes that make up the pattern.
PATTERN_DECL_NODE(pool_op);
PATTERN_DECL_NODE(pool_input);
PATTERN_DECL_NODE(pool_output);
};
// ElementwiseAdd used in residual connections. // ElementwiseAdd used in residual connections.
// y_var is used and convolution output. // y_var is used and convolution output.
// The operator is removed, when residual // The operator is removed, when residual
...@@ -766,6 +798,34 @@ struct ConvAffineChannel : public PatternBase { ...@@ -766,6 +798,34 @@ struct ConvAffineChannel : public PatternBase {
PATTERN_DECL_NODE(ac_out); // Out PATTERN_DECL_NODE(ac_out); // Out
}; };
// Dequantize + Quantize + anyOP
// This pattern is used for squashing the dequantize-quantize pairs.
struct DequantQuantAny : public PatternBase {
DequantQuantAny(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "dequant_quant_any") {}
// Builds the pattern and returns a pattern node.
PDNode* operator()();
// Nodes that make up the pattern.
PATTERN_DECL_NODE(dequant_in);
PATTERN_DECL_NODE(dequant_op);
PATTERN_DECL_NODE(dequant_out);
PATTERN_DECL_NODE(quant_op);
PATTERN_DECL_NODE(quant_out);
PATTERN_DECL_NODE(next_op);
};
// Dequantize + anyOP
// This pattern is used for getting the number of ops the Dequantize's
// output is an input to.
struct DequantAny : public PatternBase {
DequantAny(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "dequant_any") {}
// Builds the pattern and returns a pattern node.
PDNode* operator()();
// Nodes that make up the pattern.
PATTERN_DECL_NODE(dequant_op);
PATTERN_DECL_NODE(dequant_out);
PATTERN_DECL_NODE(next_op);
};
struct TransposeFlattenConcat : public PatternBase { struct TransposeFlattenConcat : public PatternBase {
TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope) TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "transpose_flatten_concat") {} : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
......
...@@ -43,20 +43,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker { ...@@ -43,20 +43,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
class SumOpVarTypeInference : public VarTypeInference { class SumOpVarTypeInference : public VarTypeInference {
public: public:
void operator()(const OpDesc &op_desc, BlockDesc *block) const override { void operator()(InferVarTypeContext *ctx) const override {
auto &inputs = op_desc.Input("X"); auto &inputs = ctx->Input("X");
auto default_var_type = proto::VarType::SELECTED_ROWS; auto default_var_type = proto::VarType::SELECTED_ROWS;
bool any_input_is_lod_tensor = std::any_of( bool any_input_is_lod_tensor = std::any_of(
inputs.begin(), inputs.end(), [block](const std::string &name) { inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR; return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
}); });
if (any_input_is_lod_tensor) { if (any_input_is_lod_tensor) {
default_var_type = proto::VarType::LOD_TENSOR; default_var_type = proto::VarType::LOD_TENSOR;
} }
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = ctx->Output("Out").front();
block->Var(out_var_name)->SetType(default_var_type); ctx->SetType(out_var_name, default_var_type);
} }
}; };
...@@ -71,7 +71,7 @@ class DummyOpMaker : public OpProtoAndCheckerMaker { ...@@ -71,7 +71,7 @@ class DummyOpMaker : public OpProtoAndCheckerMaker {
class DummyOpVarTypeInference : public VarTypeInference { class DummyOpVarTypeInference : public VarTypeInference {
public: public:
void operator()(const OpDesc &op_desc, BlockDesc *block) const override {} void operator()(framework::InferVarTypeContext *ctx) const override {}
}; };
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"
#include <memory>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace framework {
namespace ir {
// Sets the kEnableCacheRuntimeContext attribute to true on every op node of
// the graph and returns the same graph.
std::unique_ptr<ir::Graph> RuntimeContextCachePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Applies Runtime Context Cache strategy.";
  for (const Node* node : graph->Nodes()) {
    if (!node->IsOp()) continue;  // variable nodes carry no op attributes
    node->Op()->SetAttr(kEnableCacheRuntimeContext, true);
  }
  return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(runtime_context_cache_pass,
paddle::framework::ir::RuntimeContextCachePass);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
// Pass whose ApplyImpl marks every op in the graph with the
// kEnableCacheRuntimeContext attribute.
class RuntimeContextCachePass : public Pass {
protected:
// Applies the pass and returns the (same) graph.
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <memory>
#include <string>
#include <utility>
namespace paddle {
namespace framework {
namespace ir {
// Renames every "batch_norm" op to "sync_batch_norm" and every
// "batch_norm_grad" op to "sync_batch_norm_grad"; other ops are untouched.
// Returns the same graph.
std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Use synchronous batch norm";
  for (const Node* node : graph->Nodes()) {
    if (!node->IsOp()) continue;

    auto* op_desc = node->Op();
    const std::string op_type = op_desc->Type();
    if (op_type == "batch_norm") {
      op_desc->SetType("sync_batch_norm");
    } else if (op_type == "batch_norm_grad") {
      op_desc->SetType("sync_batch_norm_grad");
    }
  }
  return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
// Pass whose ApplyImpl switches batch_norm / batch_norm_grad ops to their
// sync_batch_norm counterparts.
class SyncBatchNormPass : public Pass {
protected:
// Applies the pass and returns the (same) graph.
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
// Appends an op of the given `type` to block 0 of `prog`, stores `name` in its
// "name" attribute, and wires all `inputs` to slot "X" and all `outputs` to
// slot "Out".
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* op_desc = prog->MutableBlock(0)->AppendOp();

  op_desc->SetType(type);
  op_desc->SetAttr("name", name);

  op_desc->SetInput("X", inputs);
  op_desc->SetOutput("Out", outputs);
}
// (a, conv_w)->conv2d->b
// (b, bn_scale, bn_bias, mean, var)->batch_norm
// ->(c, mean, var, save_mean, save_inv_var)
// Builds a two-op program: conv2d "conv" (a, conv_w -> b) followed by
// batch_norm "bn" (b, bn_scale, bn_bias, mean, var ->
// c, mean, var, save_mean, save_inv_var). The conv filter and the batch-norm
// parameters/statistics are marked persistable.
ProgramDesc BuildProgramDesc() {
  ProgramDesc prog;

  // True for the variables that must be persistable.
  auto is_parameter = [](const std::string& var_name) {
    return var_name == "conv_w" || var_name == "bn_scale" ||
           var_name == "bn_bias" || var_name == "mean" || var_name == "var";
  };

  const std::vector<std::string> var_names = {
      "a",    "conv_w", "b", "bn_scale",  "bn_bias",
      "mean", "var",    "c", "save_mean", "save_inv_var"};
  for (const std::string& var_name : var_names) {
    auto* var_desc = prog.MutableBlock(0)->Var(var_name);
    if (is_parameter(var_name)) {
      var_desc->SetPersistable(true);
    }
  }

  const std::vector<std::string> conv_inputs = {"a", "conv_w"};
  const std::vector<std::string> conv_outputs = {"b"};
  SetOp(&prog, "conv2d", "conv", conv_inputs, conv_outputs);

  const std::vector<std::string> bn_inputs = {"b", "bn_scale", "bn_bias",
                                              "mean", "var"};
  const std::vector<std::string> bn_outputs = {"c", "mean", "var", "save_mean",
                                               "save_inv_var"};
  SetOp(&prog, "batch_norm", "bn", bn_inputs, bn_outputs);

  return prog;
}
// Applies sync_batch_norm_pass to the conv2d + batch_norm program and checks
// that the op named "bn" was renamed to "sync_batch_norm".
// NOTE(review): the suite name "IsTestPass" looks like a copy-paste from
// another test file; it is kept unchanged so existing test filters still
// match -- consider renaming it to SyncBatchNormPass.
TEST(IsTestPass, basic) {
  auto prog = BuildProgramDesc();

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));

  auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");

  graph = pass->Apply(std::move(graph));

  // Guards against a vacuous pass: without it the test would succeed even if
  // the "bn" op were never visited.
  bool bn_op_checked = false;
  for (auto* node : graph->Nodes()) {
    if (!node->IsOp()) continue;

    auto* op = node->Op();
    auto op_name = boost::get<std::string>(op->GetAttr("name"));
    if (op_name == "bn") {
      ASSERT_EQ(op->Type(), "sync_batch_norm");
      bn_op_checked = true;
    }
  }
  ASSERT_TRUE(bn_op_checked);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(sync_batch_norm_pass);
...@@ -24,6 +24,7 @@ limitations under the License. */ ...@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -677,7 +678,8 @@ void OpDesc::InferVarType(BlockDesc *block) const { ...@@ -677,7 +678,8 @@ void OpDesc::InferVarType(BlockDesc *block) const {
// var type inference. Hence, we don't do any "default" setting here. // var type inference. Hence, we don't do any "default" setting here.
auto &info = OpInfoMap::Instance().Get(this->Type()); auto &info = OpInfoMap::Instance().Get(this->Type());
if (info.infer_var_type_) { if (info.infer_var_type_) {
info.infer_var_type_(*this, block); InferVarTypeContext context(this, block);
info.infer_var_type_(&context);
} }
} }
......
...@@ -874,9 +874,23 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig( ...@@ -874,9 +874,23 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
return kernel_configs; return kernel_configs;
} }
// Returns the RuntimeContext to use for running this op in `scope`.
//  * Without the kEnableCacheRuntimeContext attribute a fresh context is
//    heap-allocated on every call.
//  * With the attribute, the context cached in runtime_ctx_ is reused as long
//    as the scope is the same one as on the previous call; otherwise it is
//    rebuilt and pre_scope_ is updated.
// NOTE(review): in the non-cached case the returned pointer is a raw `new`
// that this function does not own afterwards. No matching delete is visible
// in the RunImpl hunks of this change -- confirm the caller frees it,
// otherwise every op run leaks one RuntimeContext.
RuntimeContext* OperatorWithKernel::GetRuntimeContext(
    const Scope& scope) const {
  if (!HasAttr(kEnableCacheRuntimeContext)) {
    return new RuntimeContext(Inputs(), Outputs(), scope);
  }

  const Scope* cur_scope = &scope;
  const bool cache_is_stale = !runtime_ctx_ || pre_scope_ != cur_scope;
  if (cache_is_stale) {
    runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
    pre_scope_ = cur_scope;
  }
  return runtime_ctx_.get();
}
void OperatorWithKernel::RunImpl(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
RuntimeContext ctx(Inputs(), Outputs(), scope); auto runtime_ctx = GetRuntimeContext(scope);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place); auto* dev_ctx = pool.Get(place);
...@@ -891,7 +905,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -891,7 +905,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap& kernels = kernels_iter->second; OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key = this->GetExpectedKernelType( auto expected_kernel_key = this->GetExpectedKernelType(
ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr)); ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key; VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key); auto kernel_iter = kernels.find(expected_kernel_key);
...@@ -915,8 +929,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -915,8 +929,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// do data transformScope &transfer_scope; // do data transformScope &transfer_scope;
std::vector<std::string> transfered_inplace_vars; std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope = auto* transfer_scope = PrepareData(scope, expected_kernel_key,
PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx); &transfered_inplace_vars, runtime_ctx);
// exec scope is the scope that kernel actually executed on. // exec scope is the scope that kernel actually executed on.
const Scope& exec_scope = const Scope& exec_scope =
...@@ -927,13 +941,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -927,13 +941,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
} }
if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) { if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
} }
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs. // not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter->second( kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs)); *runtime_ctx, kernel_configs));
if (!transfered_inplace_vars.empty()) { if (!transfered_inplace_vars.empty()) {
// there is inplace variable has been transfered. // there is inplace variable has been transfered.
......
...@@ -62,6 +62,14 @@ constexpr char kZeroVarSuffix[] = "@ZERO"; ...@@ -62,6 +62,14 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
/// Variables with this suffix are the new Gradient. /// Variables with this suffix are the new Gradient.
constexpr char kNewGradSuffix[] = "@NEWGRAD@"; constexpr char kNewGradSuffix[] = "@NEWGRAD@";
/// RuntimeContext is used to relate input/output names of Operator with
/// the corresponding variables in name scope.
/// If an Op has the attribute kEnableCacheRuntimeContext, its input/output
/// names are assumed not to change during execution within the same name
/// scope, so the RuntimeContext only needs to be created on the first
/// iteration of this Op's execution, which saves the time of rebuilding it.
constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
/// If an Op has this attribute, all its kernels should calculate output /// If an Op has this attribute, all its kernels should calculate output
/// variable's shape in the corresponding Compute() function. And /// variable's shape in the corresponding Compute() function. And
/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape() /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
...@@ -453,6 +461,7 @@ class OperatorWithKernel : public OperatorBase { ...@@ -453,6 +461,7 @@ class OperatorWithKernel : public OperatorBase {
// same. // same.
proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place) const final;
RuntimeContext* GetRuntimeContext(const Scope& scope) const;
/** /**
* Transfer data from scope to a transfered scope. If there is no data need to * Transfer data from scope to a transfered scope. If there is no data need to
...@@ -471,6 +480,8 @@ class OperatorWithKernel : public OperatorBase { ...@@ -471,6 +480,8 @@ class OperatorWithKernel : public OperatorBase {
protected: protected:
mutable OpKernelConfigsMap kernel_configs_map_; mutable OpKernelConfigsMap kernel_configs_map_;
mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
mutable const Scope* pre_scope_ = nullptr;
}; };
extern bool OpSupportGPU(const std::string& op_type); extern bool OpSupportGPU(const std::string& op_type);
......
...@@ -14,8 +14,10 @@ limitations under the License. */ ...@@ -14,8 +14,10 @@ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm> #include <algorithm>
#include <memory>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
...@@ -251,6 +253,20 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -251,6 +253,20 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, build_strategy.num_trainers_, member_->places_, nccl_id, build_strategy.num_trainers_,
build_strategy.trainer_id_)); build_strategy.trainer_id_));
std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs;
dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_));
// Initialize device context's nccl comm
// Note: if more than one ParallelExecutor uses the same place, the nccl
// comm will be overwritten, which may cause problems.
for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
auto &nccl_ctx = dev_nccl_ctxs->at(dev_id);
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
pool.Get(member_->places_[dev_id]));
dev_ctx->set_nccl_comm(nccl_ctx.comm());
}
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
......
...@@ -44,6 +44,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -44,6 +44,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
<< dst_place; << dst_place;
return; return;
} }
#ifdef PADDLE_WITH_MKLDNN
if (src.layout() == DataLayout::kMKLDNN) {
dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc());
}
#endif
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size); boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} }
......
...@@ -27,6 +27,7 @@ namespace framework { ...@@ -27,6 +27,7 @@ namespace framework {
class OperatorBase; class OperatorBase;
class OpDesc; class OpDesc;
class InferShapeContext; class InferShapeContext;
class InferVarTypeContext;
class BlockDesc; class BlockDesc;
class Variable; class Variable;
...@@ -53,7 +54,7 @@ using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>( ...@@ -53,7 +54,7 @@ using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
const std::vector<BlockDesc*>& grad_block)>; const std::vector<BlockDesc*>& grad_block)>;
using InferVarTypeFN = using InferVarTypeFN =
std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>; std::function<void(framework::InferVarTypeContext* /*context*/)>;
using InferShapeFN = std::function<void(InferShapeContext*)>; using InferShapeFN = std::function<void(InferShapeContext*)>;
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/type_defs.h"
...@@ -21,26 +23,123 @@ limitations under the License. */ ...@@ -21,26 +23,123 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class OpDesc;
class BlockDesc;
// Default context handed to var type inference functors.
//
// Operator-level queries (attributes, input/output slots) are answered from
// the OpDesc; variable-level queries (type, data type, shape, LoD level) are
// answered from the BlockDesc. Every accessor is virtual so that a subclass
// can answer the same queries from a different backing store.
class InferVarTypeContext {
 public:
  InferVarTypeContext(const OpDesc* op, BlockDesc* block)
      : op_(op), block_(block) {}

  virtual ~InferVarTypeContext() = default;

  // ---- operator-level queries (require a non-null op_) ----

  // Returns the attribute `name` of the op.
  virtual Attribute GetAttr(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(op_);
    return op_->GetAttr(name);
  }

  // True when a variable called `name` is found by a recursive block lookup.
  virtual bool HasVar(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(block_);
    return block_->FindVarRecursive(name) != nullptr;
  }

  // True when the op has an input slot called `name`.
  virtual bool HasInput(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(op_);
    const auto& input_slots = op_->Inputs();
    return input_slots.count(name) > 0;
  }

  // True when the op has an output slot called `name`.
  virtual bool HasOutput(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(op_);
    const auto& output_slots = op_->Outputs();
    return output_slots.count(name) > 0;
  }

  // Variable names bound to input slot `name`.
  virtual const std::vector<std::string>& Input(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(op_);
    return op_->Input(name);
  }

  // Variable names bound to output slot `name`.
  virtual const std::vector<std::string>& Output(
      const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(op_);
    return op_->Output(name);
  }

  // ---- variable-level queries (require a non-null block_) ----
  // NOTE(review): all of these go through FindRecursiveOrCreateVar, which by
  // its name presumably creates the variable when it is missing -- so even
  // the const getters may add a variable to the block. Confirm against
  // BlockDesc.

  virtual proto::VarType::Type GetType(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    return var_desc.GetType();
  }

  virtual void SetType(const std::string& name, proto::VarType::Type type) {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    var_desc.SetType(type);
  }

  virtual proto::VarType::Type GetDataType(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    return var_desc.GetDataType();
  }

  virtual void SetDataType(const std::string& name, proto::VarType::Type type) {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    var_desc.SetDataType(type);
  }

  virtual std::vector<proto::VarType::Type> GetDataTypes(
      const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    return var_desc.GetDataTypes();
  }

  virtual void SetDataTypes(
      const std::string& name,
      const std::vector<proto::VarType::Type>& multiple_data_type) {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    var_desc.SetDataTypes(multiple_data_type);
  }

  virtual std::vector<int64_t> GetShape(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    return var_desc.GetShape();
  }

  virtual void SetShape(const std::string& name,
                        const std::vector<int64_t>& dims) {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    var_desc.SetShape(dims);
  }

  virtual int32_t GetLoDLevel(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    return var_desc.GetLoDLevel();
  }

  virtual void SetLoDLevel(const std::string& name, int32_t lod_level) {
    PADDLE_ENFORCE_NOT_NULL(block_);
    auto& var_desc = block_->FindRecursiveOrCreateVar(name);
    var_desc.SetLoDLevel(lod_level);
  }

 protected:
  const OpDesc* op_;   // source of attributes and input/output slot names
  BlockDesc* block_;   // source of variable descriptions
};
class VarTypeInference { class VarTypeInference {
public: public:
virtual ~VarTypeInference() {} virtual ~VarTypeInference() {}
virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; virtual void operator()(InferVarTypeContext* context) const = 0; // NOLINT
}; };
class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference { class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const final { // NOLINT
framework::BlockDesc* block) const final {
auto in_out_var_names = this->GetInputOutputWithSameType(); auto in_out_var_names = this->GetInputOutputWithSameType();
for (auto& i_o_n : in_out_var_names) { for (auto& i_o_n : in_out_var_names) {
auto& x_name = op_desc.Input(i_o_n.first).at(0); auto& x_name = ctx->Input(i_o_n.first).at(0);
auto& out_name = op_desc.Output(i_o_n.second).at(0); auto& out_name = ctx->Output(i_o_n.second).at(0);
auto& x = block->FindRecursiveOrCreateVar(x_name); ctx->SetType(out_name, ctx->GetType(x_name));
auto& out = block->FindRecursiveOrCreateVar(out_name); ctx->SetDataType(out_name, ctx->GetDataType(x_name));
out.SetType(x.GetType());
out.SetDataType(x.GetDataType());
} }
} }
......
...@@ -44,20 +44,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker { ...@@ -44,20 +44,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
class SumOpVarTypeInference : public VarTypeInference { class SumOpVarTypeInference : public VarTypeInference {
public: public:
void operator()(const OpDesc &op_desc, BlockDesc *block) const override { void operator()(framework::InferVarTypeContext *ctx) const override {
auto &inputs = op_desc.Input("X"); auto &inputs = ctx->Input("X");
auto default_var_type = proto::VarType::SELECTED_ROWS; auto default_var_type = proto::VarType::SELECTED_ROWS;
bool any_input_is_lod_tensor = std::any_of( bool any_input_is_lod_tensor = std::any_of(
inputs.begin(), inputs.end(), [block](const std::string &name) { inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR; return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
}); });
if (any_input_is_lod_tensor) { if (any_input_is_lod_tensor) {
default_var_type = proto::VarType::LOD_TENSOR; default_var_type = proto::VarType::LOD_TENSOR;
} }
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = ctx->Output("Out").front();
block->Var(out_var_name)->SetType(default_var_type); ctx->SetType(out_var_name, default_var_type);
} }
}; };
} // namespace framework } // namespace framework
......
...@@ -2,4 +2,5 @@ if(WITH_PYTHON) ...@@ -2,4 +2,5 @@ if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc) cc_library(engine SRCS engine.cc)
cc_library(imperative_profiler SRCS profiler.cc)
endif() endif()
...@@ -214,13 +214,11 @@ framework::LoDTensor& VarBase::GradValue() { ...@@ -214,13 +214,11 @@ framework::LoDTensor& VarBase::GradValue() {
} }
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (grad_op_descs_.empty() && backward_id_ <= 0) { PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0,
VLOG(3) << "op with no grad: " << Type(); "%s has no backward implementation", Type());
return {};
}
VLOG(3) << "apply op grad: " << Type(); VLOG(3) << "apply op grad: " << Type();
std::vector<framework::VariableValueMap> tmp_grad_outputs; std::vector<VarBasePtrMap> tmp_grad_outputs;
if (backward_id_ > 0) { if (backward_id_ > 0) {
VLOG(3) << "py_layer_grad"; VLOG(3) << "py_layer_grad";
tmp_grad_outputs.resize(1); tmp_grad_outputs.resize(1);
...@@ -239,30 +237,66 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -239,30 +237,66 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
VLOG(3) << "apply grad op " << grad_op_desc->Type(); VLOG(3) << "apply grad op " << grad_op_desc->Type();
// Allocate tmp grad output variable // Allocate tmp grad output variable
for (auto it : grad_output_variable_map) { for (const auto& it : grad_output_variable_map) {
auto& outputs = tmp_grad_outputs[k][it.first]; auto& outputs = tmp_grad_outputs[k][it.first];
outputs.reserve(it.second.size()); outputs.reserve(it.second.size());
for (size_t i = 0; i < it.second.size(); ++i) { for (size_t i = 0; i < it.second.size(); ++i) {
VarBase* origin_grad_var_base = it.second[i];
// Allocate a new variable // Allocate a new variable
Variable* tmp_var = new framework::Variable(); VarBase* tmp_grad_var_base = new VarBase(
tmp_var->GetMutable<framework::LoDTensor>(); string::Sprintf("%s@IGrad", origin_grad_var_base->Name()),
outputs.emplace_back(tmp_var); origin_grad_var_base->DataType(), origin_grad_var_base->Dims(),
place_, true, false);
outputs.emplace_back(tmp_grad_var_base);
} }
} }
// Run grad op
framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]);
// No need to do compile time infer shape here. // No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_); // grad_op_desc_->InferShape(*block_);
// grad_op_desc->InferVarType(block_); // grad_op_desc->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase = std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc); framework::OpRegistry::CreateOp(*grad_op_desc);
auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type());
if (info.infer_var_type_) {
RuntimeInferVarTypeContext infer_var_type_ctx(
&grad_input_vars_[k], &tmp_grad_outputs[k], &attrs_);
info.infer_var_type_(&infer_var_type_ctx);
}
framework::OperatorWithKernel* op_kernel = framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get()); dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
// Run grad op
framework::VariableValueMap grad_invars_map;
framework::VariableValueMap grad_outvars_map;
for (const auto& it : grad_input_vars_[k]) {
auto& grad_invars = grad_invars_map[it.first];
grad_invars.reserve(it.second.size());
for (const VarBase* grad_inp : it.second) {
PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr",
grad_op_desc->Type(), grad_inp->Name());
grad_invars.emplace_back(grad_inp->var_);
}
}
for (const auto& it : tmp_grad_outputs[k]) {
auto& grad_outvars = grad_outvars_map[it.first];
grad_outvars.reserve(it.second.size());
for (VarBase* grad_out : it.second) {
PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr",
grad_op_desc->Type(), grad_out->Name());
grad_outvars.emplace_back(grad_out->var_);
}
}
framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map);
framework::Scope scope; framework::Scope scope;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
p.op.RuntimeInferShape(scope, place_, ctx); p.op.RuntimeInferShape(scope, place_, ctx);
...@@ -273,14 +307,14 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -273,14 +307,14 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
// Add tmp grad outputs to original grad vars // Add tmp grad outputs to original grad vars
for (size_t k = 0; k < grad_output_vars_.size(); ++k) { for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (auto it : grad_output_vars_[k]) { for (const auto& it : grad_output_vars_[k]) {
auto& outputs = tmp_grad_outputs[k][it.first]; auto& outputs = tmp_grad_outputs[k][it.first];
auto& origin_outputs = it.second; const auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
framework::Variable* grad = outputs[i]; framework::Variable* grad = outputs[i]->var_;
framework::Variable* orig_grad = origin_outputs[i]; framework::Variable* orig_grad = origin_outputs[i]->var_;
AddTo(grad, orig_grad, place_); AddTo(grad, orig_grad, place_);
delete grad; delete grad;
} }
...@@ -328,28 +362,35 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { ...@@ -328,28 +362,35 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
int PyLayer::NumFuncs() { return py_funcs_.size(); } int PyLayer::NumFuncs() { return py_funcs_.size(); }
std::vector<Variable*> PyLayer::Apply(int func_id, std::vector<framework::Variable*> PyLayer::Apply(
const std::vector<VarBase*>& inputs) { int func_id, const std::vector<VarBase*>& inputs) {
std::vector<framework::Variable*> invars;
for (const VarBase* in : inputs) {
invars.push_back(in->var_);
}
PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
return CallPythonFunc(py_funcs_[func_id], invars); return CallPythonFunc(py_funcs_[func_id], inputs);
} }
std::vector<Variable*> PyLayer::ApplyGrad( std::vector<VarBase*> PyLayer::ApplyGrad(int func_id,
int func_id, const std::vector<framework::Variable*>& inputs) { const std::vector<VarBase*>& inputs) {
PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
return CallPythonFunc(py_funcs_[func_id], inputs); auto rets = CallPythonFunc(py_funcs_[func_id], inputs);
std::vector<VarBase*> outs;
outs.reserve(rets.size());
for (size_t i = 0U; i != rets.size(); ++i) {
outs.emplace_back(new VarBase(
string::Sprintf("%s_out_%d", framework::GradVarName(PyLayer::kFwdOut),
i),
rets[i], nullptr, true));
}
return outs;
} }
std::vector<framework::Variable*> PyLayer::CallPythonFunc( std::vector<framework::Variable*> PyLayer::CallPythonFunc(
const py::object& callable, const std::vector<framework::Variable*>& ins) { const py::object& callable, const std::vector<VarBase*>& ins) {
py::gil_scoped_acquire guard; py::gil_scoped_acquire guard;
py::tuple in_args(ins.size()); py::tuple in_args(ins.size());
for (size_t i = 0; i < ins.size(); ++i) { for (size_t i = 0; i < ins.size(); ++i) {
const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>(); const framework::LoDTensor& t = ins[i]->var_->Get<framework::LoDTensor>();
in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr); in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
} }
VLOG(3) << "pyfunc in " << py::len(in_args); VLOG(3) << "pyfunc in " << py::len(in_args);
...@@ -359,6 +400,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc( ...@@ -359,6 +400,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
auto ret_tuple = py::cast<py::tuple>(ret); auto ret_tuple = py::cast<py::tuple>(ret);
size_t ret_num = py::len(ret_tuple); size_t ret_num = py::len(ret_tuple);
std::vector<framework::Variable*> outs; std::vector<framework::Variable*> outs;
outs.reserve(ret_num);
VLOG(3) << "pyfunc out " << ret_num; VLOG(3) << "pyfunc out " << ret_num;
for (size_t i = 0; i < ret_num; ++i) { for (size_t i = 0; i < ret_num; ++i) {
try { try {
...@@ -369,7 +411,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc( ...@@ -369,7 +411,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
auto* tensor = var->GetMutable<framework::LoDTensor>(); auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->ShareDataWith(*py_out_tensor); tensor->ShareDataWith(*py_out_tensor);
tensor->set_lod(py_out_tensor->lod()); tensor->set_lod(py_out_tensor->lod());
outs.push_back(var); outs.emplace_back(var);
} catch (py::cast_error&) { } catch (py::cast_error&) {
PADDLE_THROW("The %d-th output must be LoDTensor", i); PADDLE_THROW("The %d-th output must be LoDTensor", i);
} }
......
...@@ -18,14 +18,16 @@ ...@@ -18,14 +18,16 @@
#include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/framework/python_headers.h"
// clang-format on // clang-format on
#include <map> // NOLINT #include <map> // NOLINT
#include <string> // NOLINT #include <string> // NOLINT
#include <vector> // NOLINT #include <vector> // NOLINT
#include <memory> // NOLINT #include <memory> // NOLINT
#include <unordered_map> // NOLINT
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -135,13 +137,13 @@ class VarBase { ...@@ -135,13 +137,13 @@ class VarBase {
persistable) {} persistable) {}
private: private:
// TODO(minqiyang): need support SelectedRows
VarBase(const std::string& name, framework::proto::VarType::Type dtype, VarBase(const std::string& name, framework::proto::VarType::Type dtype,
const framework::DDim& shape, const platform::Place& place, const framework::DDim& shape, const platform::Place& place,
framework::Variable* var, VarBase* grad, bool stop_gradient, framework::Variable* var, VarBase* grad, bool stop_gradient,
bool persistable) bool persistable)
: name_(name), : name_(name),
dtype_(dtype), type_(framework::proto::VarType::LOD_TENSOR),
place_(place),
var_(var), var_(var),
grads_(grad), grads_(grad),
stop_gradient_(stop_gradient), stop_gradient_(stop_gradient),
...@@ -151,10 +153,12 @@ class VarBase { ...@@ -151,10 +153,12 @@ class VarBase {
pre_op_out_idx_(-1) { pre_op_out_idx_(-1) {
if (!var_) { if (!var_) {
var_ = new framework::Variable(); var_ = new framework::Variable();
auto tensor = var_->GetMutable<framework::LoDTensor>();
tensor->Resize(shape);
tensor->mutable_data(place_, dtype_);
} }
auto tensor = var_->GetMutable<framework::LoDTensor>();
tensor->Resize(shape);
tensor->mutable_data(place, dtype);
VLOG(10) << "create varbase: " << name_ << " type: " << dtype
<< " place: " << place;
} }
public: public:
...@@ -184,7 +188,23 @@ class VarBase { ...@@ -184,7 +188,23 @@ class VarBase {
} }
} }
inline framework::proto::VarType::Type DType() const { return dtype_; } inline framework::DDim Dims() const {
return var_->Get<framework::LoDTensor>().dims();
}
// Element data type of the held tensor, e.g. FP32.
inline void SetDataType(framework::proto::VarType::Type type) {
  framework::LoDTensor* held = var_->GetMutable<framework::LoDTensor>();
  // Re-request the buffer at the tensor's current place with the new type.
  held->mutable_data(held->place(), type);
}
// Returns the element data type (e.g. FP32) of the LoDTensor held by var_.
inline framework::proto::VarType::Type DataType() const {
  // Bind by const reference: `auto tensor = ...` would copy the whole
  // LoDTensor object just to read its type.
  const auto& tensor = var_->Get<framework::LoDTensor>();
  return tensor.type();
}
// Variable (container) type, e.g. LOD_TENSOR. Only the cached type_ field is
// updated here; the held framework::Variable is not touched.
inline void SetType(framework::proto::VarType::Type type) { type_ = type; }
// Returns the cached variable (container) type stored in type_.
inline framework::proto::VarType::Type Type() const { return type_; }
inline void SetStopGradient(bool stop_gradient) { inline void SetStopGradient(bool stop_gradient) {
stop_gradient_ = stop_gradient; stop_gradient_ = stop_gradient;
...@@ -238,7 +258,7 @@ class VarBase { ...@@ -238,7 +258,7 @@ class VarBase {
} }
std::string name_; std::string name_;
framework::proto::VarType::Type dtype_; framework::proto::VarType::Type type_;
platform::Place place_; platform::Place place_;
framework::Variable* var_; framework::Variable* var_;
...@@ -294,17 +314,23 @@ class PYBIND11_HIDDEN OpBase { ...@@ -294,17 +314,23 @@ class PYBIND11_HIDDEN OpBase {
void InvokeBackwardHooks(); void InvokeBackwardHooks();
void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) { void TrackPreOp(const std::string& inp_name,
if (inp_var->PreOp() && !inp_var->IsStopGradient()) { const std::vector<VarBase*>& inputs) {
VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " auto& pre_ops_list = pre_ops_[inp_name];
<< inp_name; pre_ops_list.reserve(inputs.size());
pre_ops_[inp_name].push_back(inp_var->PreOp()); auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx()); for (VarBase* inp_var : inputs) {
} else { if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
VLOG(3) << "no pre op in slot " << inp_name VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
<< " input var stop_gradient: " << inp_var->IsStopGradient(); << inp_name;
pre_ops_[inp_name].push_back(nullptr); pre_ops_list.emplace_back(inp_var->PreOp());
// pre_ops_out_idx_[inp_name].push_back(-1); pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
} else {
VLOG(3) << "no pre op in slot " << inp_name
<< " input var stop_gradient: " << inp_var->IsStopGradient();
pre_ops_list.emplace_back(nullptr);
// pre_ops_out_idx_list.push_back(-1);
}
} }
} }
...@@ -328,11 +354,13 @@ class PYBIND11_HIDDEN OpBase { ...@@ -328,11 +354,13 @@ class PYBIND11_HIDDEN OpBase {
std::map<std::string, std::vector<int>> pre_ops_out_idx_; std::map<std::string, std::vector<int>> pre_ops_out_idx_;
// Inputs to a vector of bwd ops. // Inputs to a vector of bwd ops.
std::vector<framework::VariableValueMap> grad_input_vars_; std::vector<VarBasePtrMap> grad_input_vars_;
// Outputs to a vector of bwd ops. // Outputs to a vector of bwd ops.
std::vector<framework::VariableValueMap> grad_output_vars_; std::vector<VarBasePtrMap> grad_output_vars_;
std::vector<py::object> backward_hooks_; std::vector<py::object> backward_hooks_;
framework::AttributeMap attrs_;
}; };
class Layer { class Layer {
...@@ -359,12 +387,131 @@ class PyLayer { ...@@ -359,12 +387,131 @@ class PyLayer {
static std::vector<framework::Variable*> Apply( static std::vector<framework::Variable*> Apply(
int func_id, const std::vector<VarBase*>& inputs); int func_id, const std::vector<VarBase*>& inputs);
static std::vector<framework::Variable*> ApplyGrad( static std::vector<VarBase*> ApplyGrad(int func_id,
int func_id, const std::vector<framework::Variable*>& inputs); const std::vector<VarBase*>& inputs);
private: private:
static std::vector<framework::Variable*> CallPythonFunc( static std::vector<framework::Variable*> CallPythonFunc(
const py::object& callable, const std::vector<framework::Variable*>& ins); const py::object& callable, const std::vector<VarBase*>& ins);
};
// Var type inference context for imperative mode.
//
// Instead of reading an OpDesc/BlockDesc, this context answers queries from
// the VarBase maps and the attribute map it is constructed with, so the base
// class is built with null op/block pointers. The constructor indexes every
// input/output VarBase by name; the pointed-to maps and VarBase objects must
// outlive this context.
//
// Only var type and data type are supported. Multi data types, shape and
// LoD level queries throw.
class PYBIND11_HIDDEN RuntimeInferVarTypeContext
    : public framework::InferVarTypeContext {
 public:
  RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs,
                             imperative::VarBasePtrMap* outputs,
                             const framework::AttributeMap* attrs_map)
      : InferVarTypeContext(nullptr, nullptr),
        inputs_(inputs),
        outputs_(outputs),
        attrs_(attrs_map) {
    // Fail with a clear error instead of dereferencing a null map below.
    PADDLE_ENFORCE_NOT_NULL(inputs_);
    PADDLE_ENFORCE_NOT_NULL(outputs_);

    input_names_.reserve(inputs_->size());
    for (const auto& slot : *inputs_) {
      // Create the entry even for a slot with no variables, so that Input()
      // agrees with HasInput() for every slot present in the map.
      auto& names = input_names_[slot.first];
      names.reserve(slot.second.size());
      for (imperative::VarBase* var : slot.second) {
        names.emplace_back(var->Name());
        var_set_[var->Name()] = var;
      }
    }

    output_names_.reserve(outputs_->size());
    for (const auto& slot : *outputs_) {
      // Same as above: keep Output() consistent with HasOutput().
      auto& names = output_names_[slot.first];
      names.reserve(slot.second.size());
      for (imperative::VarBase* var : slot.second) {
        names.emplace_back(var->Name());
        var_set_[var->Name()] = var;
      }
    }
  }

  virtual ~RuntimeInferVarTypeContext() {}

  // Returns attribute `name`; std::map/unordered_map::at throws if absent.
  framework::Attribute GetAttr(const std::string& name) const override {
    PADDLE_ENFORCE_NOT_NULL(attrs_);
    return attrs_->at(name);
  }

  // True when some input or output VarBase is called `name`.
  bool HasVar(const std::string& name) const override {
    return var_set_.count(name) > 0;
  }

  bool HasInput(const std::string& name) const override {
    PADDLE_ENFORCE_NOT_NULL(inputs_);
    return inputs_->count(name) > 0;
  }

  bool HasOutput(const std::string& name) const override {
    PADDLE_ENFORCE_NOT_NULL(outputs_);
    return outputs_->count(name) > 0;
  }

  // Variable names bound to input slot `name`; throws if the slot is unknown.
  const std::vector<std::string>& Input(
      const std::string& name) const override {
    return input_names_.at(name);
  }

  // Variable names bound to output slot `name`; throws if the slot is unknown.
  const std::vector<std::string>& Output(
      const std::string& name) const override {
    return output_names_.at(name);
  }

  framework::proto::VarType::Type GetType(
      const std::string& name) const override {
    return var_set_.at(name)->Type();
  }

  void SetType(const std::string& name,
               framework::proto::VarType::Type type) override {
    // at() rather than operator[]: operator[] would insert a null VarBase*
    // for an unknown name and then dereference it.
    var_set_.at(name)->SetType(type);
  }

  framework::proto::VarType::Type GetDataType(
      const std::string& name) const override {
    return var_set_.at(name)->DataType();
  }

  void SetDataType(const std::string& name,
                   framework::proto::VarType::Type type) override {
    // See SetType(): unknown names must throw, not insert a null entry.
    var_set_.at(name)->SetDataType(type);
  }

  std::vector<framework::proto::VarType::Type> GetDataTypes(
      const std::string& name) const override {
    PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType");
  }

  void SetDataTypes(const std::string& name,
                    const std::vector<framework::proto::VarType::Type>&
                        multiple_data_type) override {
    PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType");
  }

  std::vector<int64_t> GetShape(const std::string& name) const override {
    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
  }

  void SetShape(const std::string& name,
                const std::vector<int64_t>& dims) override {
    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
  }

  int32_t GetLoDLevel(const std::string& name) const override {
    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
  }

  void SetLoDLevel(const std::string& name, int32_t lod_level) override {
    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
  }

 private:
  const imperative::VarBasePtrMap* inputs_;   // slot name -> input VarBases
  imperative::VarBasePtrMap* outputs_;        // slot name -> output VarBases
  const framework::AttributeMap* attrs_;      // op attributes
  // slot name -> names of the variables in that slot
  std::unordered_map<std::string, std::vector<std::string>> input_names_;
  std::unordered_map<std::string, std::vector<std::string>> output_names_;
  // variable name -> VarBase, over all inputs and outputs
  std::unordered_map<std::string, imperative::VarBase*> var_set_;
};
} // namespace imperative } // namespace imperative
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/profiler.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
#include <thread> // NOLINT
// Output file name for the gperftools profile of the imperative tracer.
// NOTE(review): the default is non-empty, so StartProfile() treats profiling
// as requested unless the flag is explicitly cleared -- confirm this default
// is intended.
DEFINE_string(
    tracer_profile_fname, "xxgperf",
    "Profiler filename for imperative tracer, which generated by gperftools. "
    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
namespace paddle {
namespace imperative {
static std::once_flag gTracerProfileOnce;
#ifdef WITH_GPERFTOOLS
static bool gTracerProfilerStarted = false;
#endif
// Starts the gperftools profiler, at most once per process, writing to the
// file named by FLAGS_tracer_profile_fname. Does nothing when the flag is
// empty; logs a warning instead when built without gperftools.
void StartProfile() {
  if (FLAGS_tracer_profile_fname.empty()) {
    return;
  }

  auto start_once = [] {
#ifndef WITH_GPERFTOOLS
    LOG(WARNING) << "Paddle is not compiled with gperftools. "
                    "FLAGS_tracer_profile_fname will be ignored";
#else
    ProfilerStart(FLAGS_tracer_profile_fname.c_str());
    gTracerProfilerStarted = true;
#endif
  };
  std::call_once(gTracerProfileOnce, start_once);
}
// Flushes the gperftools profile data collected so far. When built without
// gperftools it only logs a warning.
void StopProfile() {
#ifndef WITH_GPERFTOOLS
  LOG(WARNING) << "Paddle is not compiled with gperftools. "
                  "FLAGS_tracer_profile_fname will be ignored";
#else
  ProfilerFlush();
#endif
}
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace paddle {
namespace imperative {

// Starts the gperftools profiler for the imperative tracer when
// FLAGS_tracer_profile_fname is non-empty; in builds without gperftools it
// only logs a warning.
void StartProfile();

// Flushes the gperftools profile data; in builds without gperftools it only
// logs a warning.
void StopProfile();

}  // namespace imperative
}  // namespace paddle
...@@ -19,38 +19,26 @@ ...@@ -19,38 +19,26 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
DEFINE_string(
tracer_profile_fname, "",
"Profiler filename for imperative tracer, which generated by gperftools."
"Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
static std::once_flag gTracerProfileOnce;
#ifdef WITH_GPERFTOOLS
static bool gTracerProfilerStarted = false;
#endif
void CreateGradOp(const framework::OpDesc& op_desc, void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set, const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block, const std::vector<framework::BlockDesc*>& grad_sub_block,
std::vector<framework::OpDesc*>* grad_op_descs, std::vector<framework::OpDesc*>* grad_op_descs,
std::unordered_map<std::string, std::string>* grad_to_var) { std::unordered_map<std::string, std::string>* grad_to_var) {
PADDLE_ENFORCE(grad_op_descs->empty()); PADDLE_ENFORCE(grad_op_descs->empty());
std::vector<std::unique_ptr<framework::OpDesc>> descs = const framework::OpInfo& op_info =
framework::OpInfoMap::Instance() framework::OpInfoMap::Instance().Get(op_desc.Type());
.Get(op_desc.Type()) if (!op_info.grad_op_maker_) return;
.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
std::vector<std::unique_ptr<framework::OpDesc>> descs =
op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
for (auto& desc : descs) { for (auto& desc : descs) {
grad_op_descs->emplace_back(desc.release()); grad_op_descs->emplace_back(desc.release());
} }
...@@ -145,31 +133,13 @@ framework::VariableNameMap CreateOutputVarNameMap( ...@@ -145,31 +133,13 @@ framework::VariableNameMap CreateOutputVarNameMap(
return result; return result;
} }
Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
if (!FLAGS_tracer_profile_fname.empty()) {
std::call_once(gTracerProfileOnce, [] {
#ifdef WITH_GPERFTOOLS
ProfilerStart(FLAGS_tracer_profile_fname.c_str());
gTracerProfilerStarted = true;
#else
LOG(WARNING) << "Paddle is not compiled with gperftools. "
"FLAGS_tracer_profile_fname will be ignored";
#endif
});
}
}
std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs, VarBasePtrMap* outputs,
framework::AttributeMap attrs_map, framework::AttributeMap attrs_map,
const platform::Place expected_place, const platform::Place expected_place,
const bool stop_gradient) { const bool stop_gradient) {
#ifdef WITH_GPERFTOOLS
if (gTracerProfilerStarted) {
ProfilerFlush();
}
#endif
framework::VariableValueMap invars_map; framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map; framework::VariableValueMap outvars_map;
...@@ -184,7 +154,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -184,7 +154,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
inp->Name()); inp->Name());
invars.emplace_back(inp->var_); invars.emplace_back(inp->var_);
op->TrackPreOp(inp, it.first);
if (!stop_gradient) { if (!stop_gradient) {
current_vars_map[inp->Name()] = inp; current_vars_map[inp->Name()] = inp;
} }
...@@ -192,9 +161,10 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -192,9 +161,10 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
<< " inited: " << inp->var_->IsInitialized() << " inited: " << inp->var_->IsInitialized()
<< " stop_grad: " << inp->IsStopGradient(); << " stop_grad: " << inp->IsStopGradient();
} }
op->TrackPreOp(it.first, it.second);
} }
op->output_vars_ = outputs; op->output_vars_ = *outputs;
for (auto it : op->output_vars_) { for (auto it : op->output_vars_) {
auto& outvars = outvars_map[it.first]; auto& outvars = outvars_map[it.first];
const std::vector<VarBase*>& outputs = it.second; const std::vector<VarBase*>& outputs = it.second;
...@@ -217,7 +187,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -217,7 +187,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
framework::VariableNameMap invars_name_map = framework::VariableNameMap invars_name_map =
CreateInputVarNameMap(op, inputs); CreateInputVarNameMap(op, inputs);
framework::VariableNameMap outvars_name_map = framework::VariableNameMap outvars_name_map =
CreateOutputVarNameMap(op, outputs); CreateOutputVarNameMap(op, *outputs);
auto& info = framework::OpInfoMap::Instance().Get(op->Type()); auto& info = framework::OpInfoMap::Instance().Get(op->Type());
if (info.Checker() != nullptr) { if (info.Checker() != nullptr) {
...@@ -228,6 +198,11 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -228,6 +198,11 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
framework::OpRegistry::CreateOp(op->Type(), invars_name_map, framework::OpRegistry::CreateOp(op->Type(), invars_name_map,
outvars_name_map, attrs_map); outvars_name_map, attrs_map);
if (info.infer_var_type_) {
RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map);
info.infer_var_type_(&infer_var_type_ctx);
}
// TODO(minqiyang): Support infer var type in imperative mode // TODO(minqiyang): Support infer var type in imperative mode
// Run forward op // Run forward op
VLOG(3) << "tracer running " << op->Type(); VLOG(3) << "tracer running " << op->Type();
...@@ -252,6 +227,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -252,6 +227,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
VLOG(5) << "start construct backward op"; VLOG(5) << "start construct backward op";
// construct grad op descs // construct grad op descs
op->attrs_ = attrs_map;
std::unique_ptr<framework::OpDesc> fwd_op_desc(new framework::OpDesc( std::unique_ptr<framework::OpDesc> fwd_op_desc(new framework::OpDesc(
op->Type(), invars_name_map, outvars_name_map, attrs_map)); op->Type(), invars_name_map, outvars_name_map, attrs_map));
std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var( std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
...@@ -278,12 +254,12 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -278,12 +254,12 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
auto fwd_var_it = current_vars_map.find(grad_invar); auto fwd_var_it = current_vars_map.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); PADDLE_ENFORCE(fwd_var_it != current_vars_map.end());
// Forward inputs or outputs. // Forward inputs or outputs.
grad_in_vars.emplace_back(fwd_var_it->second->var_); grad_in_vars.emplace_back(fwd_var_it->second);
} else { } else {
VarBase* var = current_vars_map[var_it->second]; VarBase* var = current_vars_map[var_it->second];
InitGrad(var, prepared_op.GetDeviceContext()); InitGrad(var, prepared_op.GetDeviceContext());
// Douts. // Douts.
grad_in_vars.emplace_back(var->grads_->var_); grad_in_vars.emplace_back(var->grads_);
} }
vars_saved_for_backward.insert(it.first); vars_saved_for_backward.insert(it.first);
...@@ -300,7 +276,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -300,7 +276,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
op->Type()); op->Type());
VarBase* var = current_vars_map[var_it->second]; VarBase* var = current_vars_map[var_it->second];
InitGrad(var, prepared_op.GetDeviceContext()); InitGrad(var, prepared_op.GetDeviceContext());
grad_out_vars.push_back(var->grads_->var_); grad_out_vars.push_back(var->grads_);
} }
} }
} }
...@@ -319,9 +295,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op, ...@@ -319,9 +295,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
std::vector<framework::Variable*> ret_vars = std::vector<framework::Variable*> ret_vars =
PyLayer::Apply(op->forward_id_, inputs); PyLayer::Apply(op->forward_id_, inputs);
for (VarBase* inp : inputs) { op->TrackPreOp(PyLayer::kFwdInp, inputs);
op->TrackPreOp(inp, PyLayer::kFwdInp);
}
std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut]; std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
outputs.reserve(ret_vars.size()); outputs.reserve(ret_vars.size());
...@@ -342,23 +316,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op, ...@@ -342,23 +316,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
auto& grad_output_vars = auto& grad_output_vars =
op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];
for (const VarBase* inp : inputs) { for (VarBase* inp : inputs) {
grad_input_vars.push_back(inp->var_); grad_input_vars.push_back(inp);
} }
for (VarBase* out : outputs) { for (VarBase* out : outputs) {
grad_input_vars.push_back(out->var_); grad_input_vars.push_back(out);
} }
// TODO(minqiyang): Add GPU support for PyLayer, only support CPU now // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
platform::CPUPlace place; platform::CPUPlace place;
for (VarBase* out : outputs) { for (VarBase* out : outputs) {
InitGrad(out, platform::DeviceContextPool::Instance().Get(place)); InitGrad(out, platform::DeviceContextPool::Instance().Get(place));
grad_input_vars.push_back(out->grads_->var_); grad_input_vars.push_back(out->grads_);
} }
for (VarBase* inp : inputs) { for (VarBase* inp : inputs) {
InitGrad(inp, platform::DeviceContextPool::Instance().Get(place)); InitGrad(inp, platform::DeviceContextPool::Instance().Get(place));
grad_output_vars.push_back(inp->grads_->var_); grad_output_vars.push_back(inp->grads_);
} }
} }
return outputs; return outputs;
......
...@@ -48,7 +48,7 @@ class Tracer { ...@@ -48,7 +48,7 @@ class Tracer {
virtual ~Tracer() {} virtual ~Tracer() {}
std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs, std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs, VarBasePtrMap* outputs, // NOLINT
framework::AttributeMap attrs_map, framework::AttributeMap attrs_map,
const platform::Place expected_place, const platform::Place expected_place,
const bool stop_gradient = false); const bool stop_gradient = false);
......
...@@ -25,6 +25,7 @@ class VarBase; ...@@ -25,6 +25,7 @@ class VarBase;
class OpBase; class OpBase;
typedef std::map<std::string, std::vector<VarBase*>> VarBasePtrMap; typedef std::map<std::string, std::vector<VarBase*>> VarBasePtrMap;
typedef std::map<std::string, std::vector<const VarBase*>> ConstVarBasePtrMap;
typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap; typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
} // namespace imperative } // namespace imperative
......
...@@ -91,5 +91,5 @@ if(WITH_TESTING) ...@@ -91,5 +91,5 @@ if(WITH_TESTING)
add_subdirectory(tests/book) add_subdirectory(tests/book)
if(WITH_INFERENCE_API_TEST) if(WITH_INFERENCE_API_TEST)
add_subdirectory(tests/api) add_subdirectory(tests/api)
endif() endif()
endif() endif()
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -38,7 +39,10 @@ ...@@ -38,7 +39,10 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using framework::ir::Graph; using framework::ir::Graph;
using VarQuantScale =
std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
/* /*
* The argument definition of both Pass and PassManagers. * The argument definition of both Pass and PassManagers.
...@@ -127,6 +131,8 @@ struct Argument { ...@@ -127,6 +131,8 @@ struct Argument {
// Pass a set of op types to enable its mkldnn kernel // Pass a set of op types to enable its mkldnn kernel
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
// Scales for variables to be quantized
DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
// Passed from config. // Passed from config.
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -55,14 +56,14 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -55,14 +56,14 @@ void IRPassManager::CreatePasses(Argument *argument,
".dot"; ".dot";
pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
pass_num++; pass_num++;
} } else if (pass_name == "mkldnn_placement_pass") {
if (pass_name == "mkldnn_placement_pass") {
pass->Set("mkldnn_enabled_op_types", pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>( new std::unordered_set<std::string>(
argument->mkldnn_enabled_op_types())); argument->mkldnn_enabled_op_types()));
} } else if (pass_name == "cpu_quantize_pass") {
pass->Set("quant_var_scales",
if (pass_name == "tensorrt_subgraph_pass") { new VarQuantScale(argument->quant_var_scales()));
} else if (pass_name == "tensorrt_subgraph_pass") {
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size", pass->Set("min_subgraph_size",
......
...@@ -118,6 +118,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -118,6 +118,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(serialized_info_cache_); CP_MEMBER(serialized_info_cache_);
// framework related.
CP_MEMBER(enable_runtime_context_cache_);
if (use_gpu_) { if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy( pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(other.pass_builder()))); *static_cast<GpuPassStrategy *>(other.pass_builder())));
...@@ -219,12 +222,23 @@ void AnalysisConfig::Update() { ...@@ -219,12 +222,23 @@ void AnalysisConfig::Update() {
} }
if (enable_memory_optim_) { if (enable_memory_optim_) {
pass_builder()->AppendAnalysisPass("memory_optimize_pass"); auto analysis_passes = pass_builder()->AnalysisPasses();
auto memory_opti_pass_name = "memory_optimize_pass";
bool already_exists =
std::find(analysis_passes.begin(), analysis_passes.end(),
memory_opti_pass_name) != analysis_passes.end();
if (!already_exists) {
pass_builder()->AppendAnalysisPass(memory_opti_pass_name);
}
} }
if (ir_debug_) { if (ir_debug_) {
pass_builder()->TurnOnDebug(); pass_builder()->TurnOnDebug();
} }
if (enable_runtime_context_cache_) {
pass_builder()->AppendPass("runtime_context_cache_pass");
}
} }
std::string AnalysisConfig::SerializeInfoCache() { std::string AnalysisConfig::SerializeInfoCache() {
...@@ -258,6 +272,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -258,6 +272,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << specify_input_name_; ss << specify_input_name_;
ss << cpu_math_library_num_threads_; ss << cpu_math_library_num_threads_;
ss << enable_runtime_context_cache_;
return ss.str(); return ss.str();
} }
......
...@@ -194,6 +194,23 @@ struct AnalysisConfig { ...@@ -194,6 +194,23 @@ struct AnalysisConfig {
/** Tell whether the memory optimization is activated. */ /** Tell whether the memory optimization is activated. */
bool enable_memory_optim() const; bool enable_memory_optim() const;
// framework related
/** \brief Control whether to perform runtime context cache optimization.
 *
 * If turned off, in Op's every execution, RuntimeContext would be called to
 * relate input/output names of this Op with the corresponding variables in
 * Scope.
 *
 * \param x true to enable the optimization, false to disable it. The
 *          parameter is a bool (it used to be declared as int) so that it
 *          matches the type of the flag it is stored in; integer arguments
 *          still convert implicitly.
 */
void SwitchRuntimeContextCache(bool x = true) {
  enable_runtime_context_cache_ = x;
}
/** Tells whether the runtime context cache optimization is activated. */
bool runtime_context_cache_enabled() const {
  return enable_runtime_context_cache_;
}
friend class ::paddle::AnalysisPredictor; friend class ::paddle::AnalysisPredictor;
/** NOTE just for developer, not an official API, easily to be broken. /** NOTE just for developer, not an official API, easily to be broken.
...@@ -254,6 +271,15 @@ struct AnalysisConfig { ...@@ -254,6 +271,15 @@ struct AnalysisConfig {
int cpu_math_library_num_threads_{1}; int cpu_math_library_num_threads_{1};
// framework related
// RuntimeContext is used to relate input/output names of Operator with
// the corresponding variables in Scope.
// If enable_runtime_context_cache_ is true, it means that in a same Scope,
// since the input/output names of this Op do not change in the execution,
// RuntimeContext could be created only at the first iteration of this Op's
// execution to save the elapsed time.
bool enable_runtime_context_cache_{false};
// A runtime cache, shouldn't be transferred to others. // A runtime cache, shouldn't be transferred to others.
std::string serialized_info_cache_; std::string serialized_info_cache_;
......
...@@ -110,7 +110,7 @@ set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") ...@@ -110,7 +110,7 @@ set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz") download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz")
inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8) ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 SERIAL)
# ocr # ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
......
...@@ -107,6 +107,7 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -107,6 +107,7 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->DisableGpu(); cfg->DisableGpu();
cfg->SwitchSpecifyInputNames(); cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim(); cfg->SwitchIrOptim();
cfg->SwitchRuntimeContextCache();
if (FLAGS_zero_copy) { if (FLAGS_zero_copy) {
cfg->SwitchUseFeedFetchOps(false); cfg->SwitchUseFeedFetchOps(false);
} }
......
...@@ -183,10 +183,13 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -183,10 +183,13 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
} }
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_Transformer, profile) { void profile(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
if (use_mkldnn) {
cfg.EnableMKLDNN();
}
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
...@@ -194,6 +197,11 @@ TEST(Analyzer_Transformer, profile) { ...@@ -194,6 +197,11 @@ TEST(Analyzer_Transformer, profile) {
input_slots_all, &outputs, FLAGS_num_threads); input_slots_all, &outputs, FLAGS_num_threads);
} }
TEST(Analyzer_Transformer, profile) { profile(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
#endif
// Check the fuse status // Check the fuse status
TEST(Analyzer_Transformer, fuse_statis) { TEST(Analyzer_Transformer, fuse_statis) {
AnalysisConfig cfg; AnalysisConfig cfg;
...@@ -206,9 +214,12 @@ TEST(Analyzer_Transformer, fuse_statis) { ...@@ -206,9 +214,12 @@ TEST(Analyzer_Transformer, fuse_statis) {
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Transformer, compare) { void compare(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
}
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
...@@ -216,5 +227,10 @@ TEST(Analyzer_Transformer, compare) { ...@@ -216,5 +227,10 @@ TEST(Analyzer_Transformer, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all); reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_Transformer, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -72,7 +72,8 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { ...@@ -72,7 +72,8 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) {
} }
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "\n"; << "\n";
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() os << GenSpaces(num_spaces)
<< "use_runtime_context_cache: " << config.runtime_context_cache_enabled()
<< "\n"; << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include <string> #include <string>
#include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/lock_guard_ptr.h"
DEFINE_double(tolerant_times, 2, DEFINE_double(buffered_allocator_excess_times, 2,
"Tolerant memory size times of buffered_allocator"); "Tolerant memory size times of buffered_allocator");
DEFINE_string(division_plan_path, "", "Division plan file path"); DEFINE_string(division_plan_path, "", "Division plan file path");
...@@ -149,7 +149,7 @@ inline static size_t FindDivisionPlanBinIndex(const std::vector<size_t> &bins, ...@@ -149,7 +149,7 @@ inline static size_t FindDivisionPlanBinIndex(const std::vector<size_t> &bins,
} }
inline static size_t TolerantUpperSize(size_t size) { inline static size_t TolerantUpperSize(size_t size) {
return static_cast<size_t>(size * FLAGS_tolerant_times); return static_cast<size_t>(size * FLAGS_buffered_allocator_excess_times);
} }
MultiBinBufferedAllocator::MultiBinBufferedAllocator( MultiBinBufferedAllocator::MultiBinBufferedAllocator(
...@@ -172,7 +172,8 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator( ...@@ -172,7 +172,8 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
} }
VLOG(1) << "Division plan is: " << GetDebugStringOfPlan(division_plan_); VLOG(1) << "Division plan is: " << GetDebugStringOfPlan(division_plan_);
VLOG(1) << "FLAGS_tolerant_times = " << FLAGS_tolerant_times; VLOG(1) << "FLAGS_buffered_allocator_excess_times = "
<< FLAGS_buffered_allocator_excess_times;
} }
void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) { void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
......
...@@ -44,10 +44,10 @@ if (WITH_DISTRIBUTE) ...@@ -44,10 +44,10 @@ if (WITH_DISTRIBUTE)
SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif() endif()
register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
# warpctc_op needs cudnn 7 above
if (WITH_GPU) if (WITH_GPU)
# warpctc_op needs cudnn 7 above
if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
else() else()
...@@ -58,6 +58,10 @@ if (WITH_GPU) ...@@ -58,6 +58,10 @@ if (WITH_GPU)
op_library(conv_fusion_op) op_library(conv_fusion_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
endif() endif()
if (NOT WIN32)
op_library(sync_batch_norm_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
endif()
else() else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif() endif()
......
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -22,147 +24,150 @@ limitations under the License. */ ...@@ -22,147 +24,150 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class BatchNormOp : public framework::OperatorWithKernel { void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
public: PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ConvOp should not be null.");
using framework::OperatorWithKernel::OperatorWithKernel; PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of ConvOp should not be null.");
void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Bias"),
PADDLE_ENFORCE(ctx->HasInput("X"), ""); "Input(Bias) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); PADDLE_ENFORCE(ctx->HasInput("Mean"),
PADDLE_ENFORCE(ctx->HasInput("Bias"), ""); "Input(Mean) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Mean"), ""); PADDLE_ENFORCE(ctx->HasInput("Variance"),
PADDLE_ENFORCE(ctx->HasInput("Variance"), ""); "Input(Variance) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); PADDLE_ENFORCE(ctx->HasOutput("Y"),
PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), ""); "Output(Y) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), ""); bool is_test = ctx->Attrs().Get<bool>("is_test");
PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), ""); if (!is_test) {
PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), ""); PADDLE_ENFORCE(ctx->HasOutput("MeanOut"),
"Output(MeanOut) of ConvOp should not be null.");
// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"),
PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], "Output(VarianceOut) of ConvOp should not be null.");
"Mean and MeanOut should share the same memory"); PADDLE_ENFORCE(ctx->HasOutput("SavedMean"),
PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], "Output(SavedMean) of ConvOp should not be null.");
ctx->Outputs("VarianceOut")[0], PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"),
"Variance and VarianceOut should share the same memory"); "Output(SavedVariance) of ConvOp should not be null.");
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions.");
const int64_t C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("MeanOut", {C});
ctx->SetOutputDim("VarianceOut", {C});
ctx->SetOutputDim("SavedMean", {C});
ctx->SetOutputDim("SavedVariance", {C});
ctx->ShareLoD("X", "Y");
} }
protected: // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
framework::OpKernelType GetExpectedKernelType( PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
const framework::ExecutionContext &ctx) const override { "Mean and MeanOut should share the same memory");
auto input_data_type = ctx.Input<Tensor>("X")->type(); PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0],
// By default, the type of the scale, bias, mean, "Variance and VarianceOut should share the same memory");
// and var tensors should both be float. (For float or float16 input tensor)
// or double (For double input tensor). const auto x_dims = ctx->GetInputDim("X");
auto bn_param_type = framework::proto::VarType::FP32; const DataLayout data_layout = framework::StringToDataLayout(
if (input_data_type == framework::proto::VarType::FP64) { ctx->Attrs().Get<std::string>("data_layout"));
bn_param_type = framework::proto::VarType::FP64;
} PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(), "Input X must have 2 to 5 dimensions.");
"Scale input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(), const int64_t C =
"Bias input should be of float type"); (data_layout == DataLayout::kNCHW ? x_dims[1]
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(), : x_dims[x_dims.size() - 1]);
"Mean input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(), PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
"Variance input should be of float type"); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout; ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("MeanOut", {C});
ctx->SetOutputDim("VarianceOut", {C});
ctx->SetOutputDim("SavedMean", {C});
ctx->SetOutputDim("SavedVariance", {C});
ctx->ShareLoD("X", "Y");
}
framework::OpKernelType BatchNormOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type = ctx.Input<Tensor>("X")->type();
// By default, the type of the scale, bias, mean,
// and var tensors should both be float. (For float or float16 input tensor)
// or double (For double input tensor).
auto bn_param_type = framework::proto::VarType::FP32;
if (input_data_type == framework::proto::VarType::FP64) {
bn_param_type = framework::proto::VarType::FP64;
}
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
"Scale input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
"Bias input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
"Mean input should be of float type");
PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
"Variance input should be of float type");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain && if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN; layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library);
} }
}; #endif
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
public: library);
void Make() override { }
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false " void BatchNormOpMaker::Make() {
"for training. Some layers may run faster when this is true.") AddAttr<bool>("is_test",
.SetDefault(false); "(bool, default false) Set to true for inference only, false "
AddAttr<float>("momentum", "").SetDefault(0.9); "for training. Some layers may run faster when this is true.")
AddAttr<float>("epsilon", "") .SetDefault(false);
.SetDefault(1e-5) AddAttr<float>("momentum", "").SetDefault(0.9);
.AddCustomChecker([](const float &epsilon) { AddAttr<float>("epsilon", "")
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, .SetDefault(1e-5)
"'epsilon' should be between 0.0 and 0.001."); .AddCustomChecker([](const float &epsilon) {
}); PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
AddAttr<std::string>("data_layout", "").SetDefault("NCHW"); "'epsilon' should be between 0.0 and 0.001.");
AddInput("X", "The input tensor"); });
AddInput("Scale", AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
"Scale is a 1-dimensional tensor of size C " AddInput("X", "The input tensor");
"that is applied to the output"); AddInput("Scale",
AddInput("Bias", "Scale is a 1-dimensional tensor of size C "
"Bias is a 1-dimensional tensor of size C " "that is applied to the output");
"that is applied to the output"); AddInput("Bias",
AddInput("Mean", "Bias is a 1-dimensional tensor of size C "
"The global mean (for training) or " "that is applied to the output");
"estimated mean (for testing)"); AddInput("Mean",
AddInput("Variance", "The global mean (for training) or "
"The global variance (for training) " "estimated mean (for testing)");
"or estimated Variance (for testing)"); AddInput("Variance",
AddOutput("Y", "result after normalization"); "The global variance (for training) "
AddOutput("MeanOut", "or estimated Variance (for testing)");
"Share memory with Mean. " AddOutput("Y", "result after normalization");
"Store the global mean when training"); AddOutput("MeanOut",
AddOutput("VarianceOut", "Share memory with Mean. "
"Share memory with Variance. " "Store the global mean when training");
"Store the global Variance when training"); AddOutput("VarianceOut",
AddOutput("SavedMean", "Share memory with Variance. "
"Mean of the current mini batch, " "Store the global Variance when training");
"will apply to output when training") AddOutput("SavedMean",
.AsIntermediate(); "Mean of the current mini batch, "
AddOutput("SavedVariance", "will apply to output when training")
"Variance of the current mini batch, " .AsIntermediate();
"will apply to output when training") AddOutput("SavedVariance",
.AsIntermediate(); "Variance of the current mini batch, "
AddAttr<bool>("use_mkldnn", "will apply to output when training")
"(bool, default false) Only used in mkldnn kernel") .AsIntermediate();
.SetDefault(false); AddAttr<bool>("use_mkldnn",
AddAttr<bool>("fuse_with_relu", "(bool, default false) Only used in mkldnn kernel")
"(bool, default false) Only used in mkldnn kernel") .SetDefault(false);
.SetDefault(false); AddAttr<bool>("fuse_with_relu",
AddAttr<bool>("use_global_stats", "(bool, default false) Only used in mkldnn kernel")
"(bool, default false) Whether to use global mean and " .SetDefault(false);
"variance. In inference or test mode, set use_global_stats " AddAttr<bool>("use_global_stats",
"to true or is_test true. the behavior is equivalent. " "(bool, default false) Whether to use global mean and "
"In train mode, when setting use_global_stats True, the " "variance. In inference or test mode, set use_global_stats "
"global mean and variance are also used during train time, " "to true or is_test true. the behavior is equivalent. "
"the BN acts as scaling and shiffting.") "In train mode, when setting use_global_stats True, the "
.SetDefault(false); "global mean and variance are also used during train time, "
AddComment(R"DOC( "the BN acts as scaling and shiffting.")
.SetDefault(false);
AddComment(R"DOC(
Batch Normalization. Batch Normalization.
Batch Norm has been implemented as discussed in the paper: Batch Norm has been implemented as discussed in the paper:
...@@ -173,17 +178,7 @@ The required data format for this layer is one of the following: ...@@ -173,17 +178,7 @@ The required data format for this layer is one of the following:
2. NCHW `[batch, in_channels, in_height, in_width]` 2. NCHW `[batch, in_channels, in_height, in_width]`
)DOC"); )DOC");
} }
};
// Var-type inference helper registered with the batch_norm operator.
// It only supplies the input/output name pairing; the propagation logic
// lives in the base class framework::PassInDtypeAndVarTypeToOutput
// (presumably it copies dtype and var type along each pair — confirm
// against the base class).
class BatchNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
// Returns the single pair {"X" -> "Y"}: input name as key, output name as
// value.
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
}
};
template <typename T> template <typename T>
class BatchNormKernel<platform::CPUDeviceContext, T> class BatchNormKernel<platform::CPUDeviceContext, T>
...@@ -336,82 +331,75 @@ class BatchNormKernel<platform::CPUDeviceContext, T> ...@@ -336,82 +331,75 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
} }
}; };
class BatchNormGradOp : public framework::OperatorWithKernel { void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
public: // check input
using framework::OperatorWithKernel::OperatorWithKernel; PADDLE_ENFORCE(ctx->HasInput("X"));
PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
// check input "Input(Y@GRAD) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("X")); PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); "Input(SavedMean) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
"Input(Y@GRAD) should not be null."); "Input(SavedVariance) should not be null");
PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
"Input(SavedMean) should not be null."); // check output
PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
"Input(SavedVariance) should not be null"); if (ctx->HasOutput(framework::GradVarName("Scale"))) {
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
// check output "Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); "null at same time");
if (ctx->HasOutput(framework::GradVarName("Scale"))) { }
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
"Output(Scale@GRAD) and Output(Bias@GRAD) should not be " if (use_global_stats) {
"null at same time"); PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
} "Using global stats during training is not supported "
const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats"); "in gradient op kernel of batch_norm_mkldnn_op now.");
if (use_global_stats) { }
PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
"Using global stats during training is not supported "
"in gradient op kernel of batch_norm_mkldnn_op now.");
}
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout( const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout")); ctx->Attrs().Get<std::string>("data_layout"));
const int C = const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
(data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]);
: x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
if (ctx->HasOutput(framework::GradVarName("Scale"))) { if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}
} }
}
protected: framework::OpKernelType BatchNormGradOp::GetExpectedKernelType(
framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const {
const framework::ExecutionContext &ctx) const override { const auto *var = ctx.InputVar(framework::GradVarName("Y"));
const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) {
if (var == nullptr) { PADDLE_THROW("can't find Y@GRAD");
PADDLE_THROW("can't find Y@GRAD"); }
} const Tensor *t = nullptr;
const Tensor *t = nullptr; if (var->IsType<Tensor>()) {
if (var->IsType<Tensor>()) { t = &var->Get<Tensor>();
t = &var->Get<Tensor>(); } else if (var->IsType<LoDTensor>()) {
} else if (var->IsType<LoDTensor>()) { t = &var->Get<LoDTensor>();
t = &var->Get<LoDTensor>(); }
} if (t == nullptr) {
if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD");
PADDLE_THROW("can't find Y@GRAD"); }
}
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::LibraryType library = framework::LibraryType::kPlain; framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout; framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain && if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN; layout = framework::DataLayout::kMKLDNN;
} }
#endif #endif
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), ctx.GetPlace(),
ctx.GetPlace(), layout, library); layout, library);
} }
};
template <typename T> template <typename T>
class BatchNormGradKernel<platform::CPUDeviceContext, T> class BatchNormGradKernel<platform::CPUDeviceContext, T>
...@@ -572,37 +560,31 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T> ...@@ -572,37 +560,31 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
} }
}; };
class BatchNormGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
public: auto *op = new framework::OpDesc();
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; op->SetType(GradOpType());
op->SetInput("X", Input("X"));
protected: op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op = new framework::OpDesc(); op->SetInput("Scale", Input("Scale"));
op->SetType("batch_norm_grad"); op->SetInput("Bias", Input("Bias"));
op->SetInput("X", Input("X")); op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput("SavedVariance", Output("SavedVariance"));
op->SetInput("Scale", Input("Scale")); // used when setting use_global_stats True during training
op->SetInput("Bias", Input("Bias")); if (boost::get<bool>(GetAttr("use_global_stats"))) {
op->SetInput("SavedMean", Output("SavedMean")); op->SetInput("Mean", Output("MeanOut"));
op->SetInput("SavedVariance", Output("SavedVariance")); op->SetInput("Variance", Output("VarianceOut"));
}
// used when setting use_global_stats True during training
if (boost::get<bool>(GetAttr("use_global_stats"))) {
op->SetInput("Mean", Output("MeanOut"));
op->SetInput("Variance", Output("VarianceOut"));
}
op->SetAttrMap(Attrs()); op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
return std::unique_ptr<framework::OpDesc>(op); return std::unique_ptr<framework::OpDesc>(op);
} }
};
class BatchNormInplaceInToOut : public framework::InplaceInToOut { class BatchNormInplaceInToOut : public framework::InplaceInToOut {
public: public:
...@@ -642,10 +624,10 @@ class BatchNormGradInplaceInToOut : public framework::InplaceInToOut { ...@@ -642,10 +624,10 @@ class BatchNormGradInplaceInToOut : public framework::InplaceInToOut {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker)
ops::BatchNormInplaceInToOut); // ops::BatchNormInplaceInToOut);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp)
ops::BatchNormGradInplaceInToOut); // ops::BatchNormGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>, batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -33,26 +33,6 @@ using CudnnDataType = platform::CudnnDataType<T>; ...@@ -33,26 +33,6 @@ using CudnnDataType = platform::CudnnDataType<T>;
template <typename T> template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType; using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
// Splits a tensor shape into its N, C, H, W and D extents, written through the
// output pointers.
//
//   dims         shape to decompose; dims[0] is always taken as N.
//   data_layout  when equal to DataLayout::kNCHW the channel extent is dims[1];
//                for any other value it is the last entry of dims.
//   N, C, H, W, D  output pointers; all five are always written.
//
// A 2-entry shape is treated as {N, C} with H = W = D = 1. For longer shapes,
// W and D default to 1 when dims has no entry for them (size <= 3 and
// size <= 4 respectively).
// NOTE(review): shapes with fewer than 2 entries are not guarded here and
// would index past the end of dims — callers are assumed to validate rank.
void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
int *N, int *C, int *H, int *W, int *D) {
*N = dims[0];
if (dims.size() == 2) {
// {N, C}: no spatial extents, so report them all as 1.
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
// Channel-first layout keeps C at index 1 and shifts the spatial extents
// right by one; otherwise C is last and spatial extents start at index 1.
*C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
*H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*W = dims.size() > 3
? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
: 1;
*D = dims.size() > 4
? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
: 1;
}
}
template <typename T> template <typename T>
class BatchNormKernel<platform::CUDADeviceContext, T> class BatchNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> { : public framework::OpKernel<T> {
...@@ -196,22 +176,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -196,22 +176,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
} }
}; };
// CUDA kernel computing, for every flat element index i in [0, num):
//   dx[i] = dy[i] * scale[c] / sqrt(variance[c] + epsilon)
// where c is the channel that element i belongs to.
//
//   dy, dx     flat element buffers of length num (input gradient / result).
//   scale, variance  per-channel arrays indexed by c.
//   epsilon    added to the variance before the square root.
//   C          number of channels.
//   HxW        number of elements per channel within one sample; only used
//              when layout is kNCHW.
//   layout     template parameter selecting how c is derived from i.
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads.
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
// kNCHW: channel changes every HxW elements; otherwise channel is the
// fastest-varying index.
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
// Multiply in BatchNormParamType<T> precision, then narrow back to T.
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
template <typename T, int BlockDim, framework::DataLayout layout> template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ void KeBNBackwardScaleBias( static __global__ void KeBNBackwardScaleBias(
const T *dy, const T *x, const BatchNormParamType<T> *mean, const T *dy, const T *x, const BatchNormParamType<T> *mean,
...@@ -248,6 +212,22 @@ static __global__ void KeBNBackwardScaleBias( ...@@ -248,6 +212,22 @@ static __global__ void KeBNBackwardScaleBias(
} }
} }
// CUDA kernel: scales the incoming gradient by the per-channel factor
// scale[c] / sqrt(variance[c] + epsilon) and writes the result to dx.
//
//   dy, dx           flat buffers holding `num` elements each.
//   scale, variance  per-channel parameters, indexed by channel.
//   epsilon          stabiliser added to the variance before the square root.
//   C                channel count.
//   HxW              elements per channel inside one sample (kNCHW only).
//   layout           compile-time layout selector for the channel lookup.
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
                                        const BatchNormParamType<T> *scale,
                                        const BatchNormParamType<T> *variance,
                                        const double epsilon, const int C,
                                        const int HxW, const int num, T *dx) {
  // Grid-stride traversal so any launch configuration covers all elements.
  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int step = blockDim.x * gridDim.x;

  for (int idx = first; idx < num; idx += step) {
    // Recover the channel of this flat index for the selected layout.
    int channel;
    if (layout == framework::DataLayout::kNCHW) {
      channel = idx / HxW % C;
    } else {
      channel = idx % C;
    }

    const BatchNormParamType<T> inv_std =
        1.0 / sqrt(variance[channel] + epsilon);
    // Do the product in parameter precision, then convert back to T.
    const BatchNormParamType<T> grad =
        static_cast<BatchNormParamType<T>>(dy[idx]);
    dx[idx] = static_cast<T>(grad * scale[channel] * inv_std);
  }
}
template <typename T> template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T> class BatchNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> { : public framework::OpKernel<T> {
...@@ -383,7 +363,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -383,7 +363,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<< KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>( grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data, d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(), epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>()); d_bias->data<BatchNormParamType<T>>());
} }
} else { } else {
...@@ -394,10 +374,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -394,10 +374,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
running_var_data, epsilon, C, H * W, num, d_x->data<T>()); running_var_data, epsilon, C, H * W, num, d_x->data<T>());
} }
if (d_scale && d_bias) { if (d_scale && d_bias) {
KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<< KeBNBackwardScaleBias<T, block, framework::DataLayout::kNHWC><<<
grid2, block, 0, dev_ctx.stream()>>>( grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data, d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(), epsilon, N, C, H * W * D, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>()); d_bias->data<BatchNormParamType<T>>());
} }
} }
......
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -35,17 +38,84 @@ template <typename T> ...@@ -35,17 +38,84 @@ template <typename T>
using ConstEigenVectorArrayMap = using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>; Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
// Operator class registered under the name "batch_norm".
// Only the interface is declared here; both overrides are defined out of line
// in the operator's source file.
class BatchNormOp : public framework::OperatorWithKernel {
public:
// Reuse the base-class constructors unchanged.
using framework::OperatorWithKernel::OperatorWithKernel;
// Validates inputs/outputs and sets the output dimensions.
void InferShape(framework::InferShapeContext *ctx) const override;
protected:
// Chooses the kernel type (data type, place, layout, library) for this run.
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override;
};
// Gradient operator class registered under the name "batch_norm_grad".
// Only the interface is declared here; both overrides are defined out of line
// in the operator's source file.
class BatchNormGradOp : public framework::OperatorWithKernel {
public:
// Reuse the base-class constructors unchanged.
using framework::OperatorWithKernel::OperatorWithKernel;
// Checks the required inputs/outputs and sets the gradient output dimensions.
void InferShape(framework::InferShapeContext *ctx) const override;
protected:
// Chooses the kernel type (data type, place, layout, library) for this run.
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override;
};
// Proto/checker maker for batch_norm. Make() is defined out of line and
// declares the operator's attributes, inputs, outputs and doc comment.
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
// Builds the OpDesc of the gradient operator that corresponds to a forward
// batch-norm op. Apply() is defined out of line.
class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
public:
// Reuse the base-class constructors unchanged.
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
// Creates and returns the gradient op description.
std::unique_ptr<framework::OpDesc> Apply() const override;
// Type name given to the generated gradient op: the forward op type with
// "_grad" appended. Virtual, so a derived maker can supply a different name.
virtual std::string GradOpType() const {
return this->ForwardOpType() + "_grad";
}
};
// Var-type inference helper for batch_norm. It hands the base class the
// input/output name pairing; the actual propagation is implemented in
// framework::PassInDtypeAndVarTypeToOutput.
class BatchNormOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  // Returns the map of input name -> output name; the only entry is
  // "X" -> "Y".
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    std::unordered_map<std::string, std::string> same_type_pairs{{"X", "Y"}};
    return same_type_pairs;
  }
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BatchNormKernel : public framework::OpKernel<T> { class BatchNormKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override; void Compute(const framework::ExecutionContext &ctx) const override;
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BatchNormGradKernel : public framework::OpKernel<T> { class BatchNormGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override; void Compute(const framework::ExecutionContext &ctx) const override;
}; };
// Decomposes a tensor shape into N, C, H, W and D extents and stores them
// through the five output pointers (all are always written).
//
//   dims         shape to decompose; dims[0] is N.
//   data_layout  DataLayout::kNCHW selects channel-first indexing; any other
//                value takes the channel from the last entry of dims.
//
// A 2-entry shape yields {N, C} with H = W = D = 1. For longer shapes, W and D
// are 1 whenever dims has no entry for them.
// NOTE(review): shapes with fewer than 2 entries are not guarded — callers
// are assumed to have validated the rank.
inline void ExtractNCWHD(const framework::DDim &dims,
                         const DataLayout &data_layout, int *N, int *C, int *H,
                         int *W, int *D) {
  const bool channel_first = (data_layout == DataLayout::kNCHW);
  const auto rank = dims.size();

  *N = dims[0];

  if (rank == 2) {
    // No spatial extents present.
    *C = dims[1];
    *H = 1;
    *W = 1;
    *D = 1;
    return;
  }

  // Channel-first keeps C at index 1 and pushes spatial extents one slot to
  // the right; otherwise C is last and spatial extents start at index 1.
  *C = channel_first ? dims[1] : dims[rank - 1];
  *H = channel_first ? dims[2] : dims[1];

  if (rank > 3) {
    *W = channel_first ? dims[3] : dims[2];
  } else {
    *W = 1;
  }

  if (rank > 4) {
    *D = channel_first ? dims[4] : dims[3];
  } else {
    *D = 1;
  }
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -178,10 +178,10 @@ Beam Search Decode Operator. This Operator constructs the full hypotheses for ...@@ -178,10 +178,10 @@ Beam Search Decode Operator. This Operator constructs the full hypotheses for
each source sentence by walking back along the LoDTensorArray Input(ids) each source sentence by walking back along the LoDTensorArray Input(ids)
whose lods can be used to restore the path in the beam search tree. whose lods can be used to restore the path in the beam search tree.
The Output(SentenceIds) and Output(SentenceScores) separately contain the The Output(SentenceIds) and Output(SentenceScores) separately contain the
generated id sequences and the corresponding scores. The shapes and lods of the generated id sequences and the corresponding scores. The shapes and lods of the
two LodTensor are same. The lod level is 2 and the two levels separately two LodTensor are same. The lod level is 2 and the two levels separately
indicate how many hypotheses each source sentence has and how many ids each indicate how many hypotheses each source sentence has and how many ids each
hypothesis has. hypothesis has.
)DOC"); )DOC");
} }
...@@ -203,15 +203,12 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { ...@@ -203,15 +203,12 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase {
class BeamSearchDecodeInferVarType : public framework::VarTypeInference { class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override { for (auto& o : ctx->Output("SentenceIds")) {
for (auto& o : op_desc.Output("SentenceIds")) { ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
auto& sentence_ids = block->FindRecursiveOrCreateVar(o);
sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR);
} }
for (auto& o : op_desc.Output("SentenceScores")) { for (auto& o : ctx->Output("SentenceScores")) {
auto& sentence_scores = block->FindRecursiveOrCreateVar(o); ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR);
} }
} }
}; };
......
...@@ -65,7 +65,7 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -65,7 +65,7 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(true); .SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
This operator does the search in beams for one time step. This operator does the search in beams for one time step.
Specifically, it selects the top-K candidate word ids of current step from Specifically, it selects the top-K candidate word ids of current step from
Input(ids) according to their Input(scores) for all source sentences, Input(ids) according to their Input(scores) for all source sentences,
where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results
...@@ -120,15 +120,12 @@ class BeamSearchOp : public framework::OperatorWithKernel { ...@@ -120,15 +120,12 @@ class BeamSearchOp : public framework::OperatorWithKernel {
class BeamSearchInferVarType : public framework::VarTypeInference { class BeamSearchInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { for (auto &o : ctx->Output("selected_ids")) {
for (auto &o : op_desc.Output("selected_ids")) { ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
auto &selected_ids = block->FindRecursiveOrCreateVar(o);
selected_ids.SetType(framework::proto::VarType::LOD_TENSOR);
} }
for (auto &o : op_desc.Output("selected_scores")) { for (auto &o : ctx->Output("selected_scores")) {
auto &selected_scores = block->FindRecursiveOrCreateVar(o); ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
selected_scores.SetType(framework::proto::VarType::LOD_TENSOR);
} }
} }
}; };
......
...@@ -93,11 +93,9 @@ execution. ...@@ -93,11 +93,9 @@ execution.
class GetPlacesInferVarType : public framework::VarTypeInference { class GetPlacesInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { for (auto &o_name : ctx->Output("Out")) {
for (auto &o_name : op_desc.Output("Out")) { ctx->SetType(o_name, framework::proto::VarType::PLACE_LIST);
block->FindRecursiveOrCreateVar(o_name).SetType(
framework::proto::VarType::PLACE_LIST);
} }
} }
}; };
......
...@@ -100,16 +100,13 @@ class WriteToArrayInferShape : public framework::InferShapeBase { ...@@ -100,16 +100,13 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
class WriteToArrayInferVarType : public framework::VarTypeInference { class WriteToArrayInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto x_name = ctx->Input("X")[0];
auto x_name = op_desc.Input("X")[0]; auto out_name = ctx->Output("Out")[0];
auto out_name = op_desc.Output("Out")[0];
VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
auto &out = block->FindRecursiveOrCreateVar(out_name); ctx->SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY);
out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); if (ctx->HasVar(x_name)) {
auto *x = block->FindVarRecursive(x_name); ctx->SetDataType(out_name, ctx->GetDataType(x_name));
if (x != nullptr) {
out.SetDataType(x->GetDataType());
} }
} }
}; };
......
...@@ -365,19 +365,16 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -365,19 +365,16 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
class WhileGradOpVarTypeInference : public framework::VarTypeInference { class WhileGradOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto p_names = ctx->Input(kX);
auto p_names = op_desc.Input(kX); auto pg_ig_names = ctx->Output(framework::GradVarName(kX));
auto pg_ig_names = op_desc.Output(framework::GradVarName(kX));
for (size_t i = 0; i < p_names.size(); ++i) { for (size_t i = 0; i < p_names.size(); ++i) {
auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); if (ctx->HasVar(pg_ig_names[i])) {
auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
if (g_var != nullptr) { // Gradient could be @EMPTY@
VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i] VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
<< " type: " << p_var.GetType(); << " type: " << ctx->GetType(p_names[i]);
g_var->SetType(p_var.GetType()); ctx->SetType(pg_ig_names[i], ctx->GetType(p_names[i]));
g_var->SetDataType(p_var.GetDataType()); ctx->SetDataType(pg_ig_names[i], ctx->GetDataType(p_names[i]));
} }
} }
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -194,6 +195,12 @@ void Conv2DOpMaker::Make() { ...@@ -194,6 +195,12 @@ void Conv2DOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_quantizer",
"(bool, default false) "
"Set to true for operators that should be quantized and use "
"int8 kernel. "
"Only used on CPU.")
.SetDefault(false);
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel") AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("fuse_residual_connection", AddAttr<bool>("fuse_residual_connection",
......
...@@ -13,18 +13,21 @@ See the License for the specific language governing permissions and ...@@ -13,18 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h" #include "paddle/fluid/operators/cross_entropy_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class CrossEntropyOp : public framework::OperatorWithKernel { class CrossEntropyOpBase : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
...@@ -43,7 +46,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel { ...@@ -43,7 +46,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
"Input(X) and Input(Label) shall have the same shape " "Input(X) and Input(Label) shall have the same shape "
"except the last dimension."); "except the last dimension.");
} }
if (ctx->Attrs().Get<bool>("soft_label")) {
if (IsSoftLabel(ctx)) {
if (check) { if (check) {
PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
"If Attr(soft_label) == true, the last dimension of " "If Attr(soft_label) == true, the last dimension of "
...@@ -69,21 +73,24 @@ class CrossEntropyOp : public framework::OperatorWithKernel { ...@@ -69,21 +73,24 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context()); ctx.device_context());
} }
virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
return ctx->Attrs().Get<bool>("soft_label");
}
}; };
class CrossEntropyGradientOp : public framework::OperatorWithKernel { class CrossEntropyGradientOpBase : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) shoudl be not null."); "Input(Y@GRAD) shoudl be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null."); "Output(X@GRAD) should be not null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = GetXDim(ctx);
auto label_dims = ctx->GetInputDim("Label"); auto label_dims = ctx->GetInputDim("Label");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
int rank = x_dims.size(); int rank = x_dims.size();
...@@ -108,9 +115,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { ...@@ -108,9 +115,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
"The Input(X) and Input(Y@Grad) should have the same " "The Input(X) and Input(Y@Grad) should have the same "
"shape except the last dimension."); "shape except the last dimension.");
} }
PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, if (IsSoftLabel(ctx)) {
"The last dimension of Input(Y@Grad) should be 1.");
if (ctx->Attrs().Get<bool>("soft_label")) {
if (check) { if (check) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
x_dims[rank - 1], label_dims[rank - 1], x_dims[rank - 1], label_dims[rank - 1],
...@@ -123,7 +128,10 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { ...@@ -123,7 +128,10 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
"Input(Label) should be 1."); "Input(Label) should be 1.");
} }
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X")); PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
"The last dimension of Input(Y@Grad) should be 1.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD(VarNameWithXLoD(), framework::GradVarName("X"));
} }
protected: protected:
...@@ -131,8 +139,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { ...@@ -131,8 +139,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
// is determined by its input "X". // is determined by its input "X".
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(), return framework::OpKernelType(
ctx.device_context()); ctx.Input<Tensor>(framework::GradVarName("Y"))->type(),
ctx.device_context());
}
virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
return ctx->GetInputDim("X");
}
virtual const char* VarNameWithXLoD() const { return "X"; }
virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
return ctx->Attrs().Get<bool>("soft_label");
}
};
// Var-type inference helper for the cross entropy operators. It reports the
// single pair input "X" -> output "Y" through the base class's
// GetInputOutputWithSameType() hook.
class CrossEntropyOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    std::unordered_map<std::string, std::string> same_type_pairs;
    same_type_pairs.emplace("X", /*->*/ "Y");
    return same_type_pairs;
  }
};
...@@ -200,22 +228,132 @@ or not. But the output only shares the LoD information with input X. ...@@ -200,22 +228,132 @@ or not. But the output only shares the LoD information with input X.
} }
}; };
class CrossEntropyOpInferVarType class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
: public framework::PassInDtypeAndVarTypeToOutput { public:
using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
CrossEntropyGradientOpBase::InferShape(ctx);
}
};
class CrossEntropyOp2 : public CrossEntropyOpBase {
public:
using CrossEntropyOpBase::CrossEntropyOpBase;
void InferShape(framework::InferShapeContext* ctx) const override {
CrossEntropyOpBase::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("MatchX"),
"Output(MatchX) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto x_dims_vec = framework::vectorize(x_dims);
x_dims_vec.push_back(0);
ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec));
x_dims[x_dims.size() - 1] = 1;
ctx->SetOutputDim("MatchX", x_dims);
ctx->ShareLoD("X", /*->*/ "XShape");
}
protected: protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType() bool IsSoftLabel(framework::InferShapeContext* ctx) const override {
const override { return false;
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}}; }
};
class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase {
public:
using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("MatchX"), "Input(MatchX) must exist");
CrossEntropyGradientOpBase::InferShape(ctx);
}
protected:
virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const {
auto x_shape = ctx->GetInputDim("XShape");
return framework::DDim(x_shape.Get(), x_shape.size() - 1);
}
virtual const char* VarNameWithXLoD() const { return "XShape"; }
virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const {
return false;
} }
}; };
// Proto maker for the hard-label-only cross entropy operator
// (registered elsewhere in this file as "cross_entropy2").
//
// Inputs : X, Label
// Outputs: Y, XShape, MatchX
// Attrs  : ignore_index (int, default -100)
class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor, default Tensor<float>), a tensor whose last dimension "
             "size is equal to the number of classes. This input is a "
             "probability computed by the previous operator, which is almost "
             "always the result of a softmax operator.");
    AddInput(
        "Label",
        "(Tensor), the tensor which represents the ground truth. It has the "
        "same shape with 'X' except the last dimension. One hot Tensor.");
    AddOutput("Y",
              "(Tensor, default Tensor<float>), a tensor whose shape is same "
              "with 'X' except that the last dimension size is 1. It "
              "represents the cross entropy loss.");
    AddOutput("XShape", "Temporaily variable to save shape and LoD of X.");
    AddOutput("MatchX",
              "X value that matches label, used for gradient computation.");
    // Adjacent string literals are concatenated verbatim by the compiler, so
    // each fragment must carry its own trailing space; without it the
    // description reads "that isignored" and "gradient.Only".
    AddAttr<int>("ignore_index",
                 "(int, default -100), Specifies a target value that is "
                 "ignored and does not contribute to the input gradient. "
                 "Only valid if soft_label is set to False")
        .SetDefault(-100);
    AddComment(R"DOC(
Hard-label CrossEntropy Operator.
The input 'X' and 'Label' will first be logically flattened to 2-D matrixs.
The matrix's second dimension(row length) is as same as the original last
dimension, and the first dimension(column length) is the product of all other
original dimensions. Then the softmax computation will take palce on each raw
of flattened matrixs.
Only support hard label.
Both the input X and Label can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input X.
)DOC");
  }
};
// Builds the OpDesc of the backward operator "cross_entropy_grad2".
//
// The backward op is wired to the forward op's Label input, its MatchX and
// XShape outputs, and the gradient of Y; it produces the gradient of X and
// inherits the forward op's attribute map. The forward input X itself is not
// passed to the backward op.
class CrossEntropyGradOpDescMaker2 : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> grad_desc(new framework::OpDesc());
    grad_desc->SetType("cross_entropy_grad2");

    // Inputs taken from the forward op.
    grad_desc->SetInput("Label", Input("Label"));
    grad_desc->SetInput("MatchX", Output("MatchX"));
    grad_desc->SetInput("XShape", Output("XShape"));

    // Upstream gradient in, gradient w.r.t. X out.
    grad_desc->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
    grad_desc->SetOutput(framework::GradVarName("X"), InputGrad("X"));

    grad_desc->SetAttrMap(Attrs());
    return grad_desc;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext; using CPUCtx = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase,
ops::CrossEntropyOpInferVarType, ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>, REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
...@@ -223,3 +361,14 @@ REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>, ...@@ -223,3 +361,14 @@ REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
REGISTER_OP_CPU_KERNEL(cross_entropy_grad, REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<CPUCtx, float>, ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
ops::CrossEntropyGradientOpKernel<CPUCtx, double>); ops::CrossEntropyGradientOpKernel<CPUCtx, double>);
// Register the hard-label operator "cross_entropy2" together with its proto
// maker, var-type inference and grad-op-desc maker.
REGISTER_OPERATOR(cross_entropy2, ops::CrossEntropyOp2,
ops::CrossEntropyOpMaker2, ops::CrossEntropyOpInferVarType,
ops::CrossEntropyGradOpDescMaker2);
// The backward operator is registered on its own, with no maker arguments.
REGISTER_OPERATOR(cross_entropy_grad2, ops::CrossEntropyGradientOp2);
// CPU kernels for the forward op: float and double.
REGISTER_OP_CPU_KERNEL(cross_entropy2,
ops::CrossEntropyOpKernel2<CPUCtx, float>,
ops::CrossEntropyOpKernel2<CPUCtx, double>);
// CPU kernels for the backward op: float and double.
REGISTER_OP_CPU_KERNEL(cross_entropy_grad2,
ops::CrossEntropyGradientOpKernel2<CPUCtx, float>,
ops::CrossEntropyGradientOpKernel2<CPUCtx, double>);
...@@ -27,3 +27,13 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -27,3 +27,13 @@ REGISTER_OP_CUDA_KERNEL(
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>, cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>, ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>); ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
// CUDA kernels for the forward op "cross_entropy2": float, double and
// plat::float16.
REGISTER_OP_CUDA_KERNEL(cross_entropy2,
ops::CrossEntropyOpKernel2<CUDACtx, float>,
ops::CrossEntropyOpKernel2<CUDACtx, double>,
ops::CrossEntropyOpKernel2<CUDACtx, plat::float16>);
// CUDA kernels for the backward op "cross_entropy_grad2", same three types.
REGISTER_OP_CUDA_KERNEL(
cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel2<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel2<CUDACtx, plat::float16>);
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
...@@ -137,5 +138,124 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> { ...@@ -137,5 +138,124 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
} }
}; };
// Per-row forward functor for hard-label cross entropy.
//
// `x` is read as a flat buffer indexed by `row * feature_size + label[row]`.
// For each row the functor stores the selected element in `match_x[row]` and
// writes `-TolerableValue(real_log(selected))` to `y[row]`. Rows whose label
// equals `ignore_index` get 0 in both outputs.
template <typename T>
struct HardLabelCrossEntropyForwardFunctor {
  HardLabelCrossEntropyForwardFunctor(const T* x, T* y, T* match_x,
                                      const int64_t* label,
                                      int64_t ignore_index,
                                      int64_t feature_size)
      : x_(x),
        y_(y),
        match_x_(match_x),
        label_(label),
        ignore_index_(ignore_index),
        feature_size_(feature_size) {}

  HOSTDEVICE void operator()(int64_t idx) const {
    const auto target = label_[idx];

    // Ignored rows contribute no loss.
    if (target == ignore_index_) {
      y_[idx] = 0;
      match_x_[idx] = 0;  // any value is ok
      return;
    }

    const auto picked = x_[idx * feature_size_ + target];
    y_[idx] = -math::TolerableValue<T>()(real_log(picked));
    match_x_[idx] = picked;
  }

  const T* x_;
  T* y_;
  T* match_x_;
  const int64_t* label_;
  int64_t ignore_index_;
  int64_t feature_size_;
};
// Per-element backward functor for hard-label cross entropy.
//
// `idx` addresses one element of the flat dx buffer and is split into
// (row, column) using `feature_size`. The element at the row's label column
// receives `-dy[row] / match_x[row]`, unless the label equals `ignore_index`;
// every other element is set to 0.
template <typename T>
struct HardLabelCrossEntropyBackwardFunctor {
  HardLabelCrossEntropyBackwardFunctor(T* dx, const T* dy, const T* match_x,
                                       const int64_t* label,
                                       int64_t ignore_index,
                                       int64_t feature_size)
      : dx_(dx),
        dy_(dy),
        match_x_(match_x),
        label_(label),
        ignore_index_(ignore_index),
        feature_size_(feature_size) {}

  HOSTDEVICE void operator()(int64_t idx) const {
    const auto row = idx / feature_size_;
    const auto col = idx % feature_size_;
    const auto target = label_[row];

    const bool is_label_column = (target == col);
    const bool is_ignored = (target == ignore_index_);
    if (!is_label_column || is_ignored) {
      dx_[idx] = 0;
      return;
    }
    dx_[idx] = -dy_[row] / match_x_[row];
  }

  T* dx_;
  const T* dy_;
  const T* match_x_;
  const int64_t* label_;
  int64_t ignore_index_;
  int64_t feature_size_;
};
// Forward kernel of the hard-label cross entropy operator.
//
// Reads inputs "X" and "Label", allocates outputs "Y" and "MatchX" on the
// execution place, and runs HardLabelCrossEntropyForwardFunctor once per row,
// where the number of rows is product(X.dims) / X.dims[last].
template <typename DeviceContext, typename T>
class CrossEntropyOpKernel2 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_x = ctx.Input<Tensor>("X");
    auto* in_label = ctx.Input<Tensor>("Label");
    auto* out_y = ctx.Output<Tensor>("Y");
    auto* out_match_x = ctx.Output<Tensor>("MatchX");

    // The last dimension is the per-row feature count.
    auto& dims = in_x->dims();
    auto feature_size = dims[dims.size() - 1];
    auto num_rows = framework::product(in_x->dims()) / feature_size;

    auto* x_ptr = in_x->data<T>();
    auto* label_ptr = in_label->data<int64_t>();
    auto* y_ptr = out_y->mutable_data<T>(ctx.GetPlace());
    auto* match_x_ptr = out_match_x->mutable_data<T>(ctx.GetPlace());

    auto ignore_index = ctx.Attr<int>("ignore_index");

    HardLabelCrossEntropyForwardFunctor<T> functor(
        x_ptr, y_ptr, match_x_ptr, label_ptr, ignore_index, feature_size);
    platform::ForRange<DeviceContext> for_range(
        ctx.template device_context<DeviceContext>(), num_rows);
    for_range(functor);
  }
};
// Backward kernel of the hard-label cross entropy operator.
//
// Reads "Y@GRAD", "MatchX" and "Label", allocates "X@GRAD" on the execution
// place, and runs HardLabelCrossEntropyBackwardFunctor once per element of
// X@GRAD (rows * feature_size invocations).
template <typename DeviceContext, typename T>
class CrossEntropyGradientOpKernel2 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* y_grad = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* matched = ctx.Input<Tensor>("MatchX");
    auto* labels = ctx.Input<Tensor>("Label");

    auto* x_grad_ptr = x_grad->mutable_data<T>(ctx.GetPlace());
    auto* y_grad_ptr = y_grad->data<T>();
    auto* matched_ptr = matched->data<T>();
    auto* labels_ptr = labels->data<int64_t>();

    int64_t ignore_index = ctx.Attr<int>("ignore_index");

    // The shape is taken from the gradient output; its last dimension is the
    // per-row feature count.
    int last_axis = x_grad->dims().size() - 1;
    int64_t feature_size = x_grad->dims()[last_axis];
    int64_t num_rows = framework::product(x_grad->dims()) / feature_size;

    HardLabelCrossEntropyBackwardFunctor<T> functor(
        x_grad_ptr, y_grad_ptr, matched_ptr, labels_ptr, ignore_index,
        feature_size);
    platform::ForRange<DeviceContext> for_range(
        ctx.template device_context<DeviceContext>(),
        num_rows * feature_size);
    for_range(functor);
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -33,6 +33,7 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) ...@@ -33,6 +33,7 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu)
detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
if(WITH_GPU) if(WITH_GPU)
......
...@@ -60,14 +60,15 @@ class BoxCoderOp : public framework::OperatorWithKernel { ...@@ -60,14 +60,15 @@ class BoxCoderOp : public framework::OperatorWithKernel {
} else if (code_type == BoxCodeType::kDecodeCenterSize) { } else if (code_type == BoxCodeType::kDecodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
"The rank of Input TargetBox must be 3"); "The rank of Input TargetBox must be 3");
if (axis == 0) { PADDLE_ENFORCE(axis == 0 || axis == 1, "axis must be 0 or 1");
PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); if (ctx->IsRuntime()) {
} else if (axis == 1) { if (axis == 0) {
PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
} else { } else if (axis == 1) {
PADDLE_THROW("axis must be 0 or 1."); PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
}
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
} }
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
ctx->ShareDim("TargetBox", /*->*/ "OutputBox"); ctx->ShareDim("TargetBox", /*->*/ "OutputBox");
} }
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/yolo_box_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::Tensor;
// Operator definition for "yolo_box": shape inference and kernel-type
// selection.
//
// Inputs : X       - 4-D tensor, dim[1] == anchor_num * (5 + class_num)
//          ImgSize - 2-D tensor of shape [N, 2]
// Outputs: Boxes   - [N, box_num, 4]
//          Scores  - [N, box_num, class_num]
// where anchor_num = len(anchors) / 2 and box_num = H * W * anchor_num.
class YoloBoxOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of YoloBoxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("ImgSize"),
                   "Input(ImgSize) of YoloBoxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Boxes"),
                   "Output(Boxes) of YoloBoxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Scores"),
                   "Output(Scores) of YoloBoxOp should not be null.");

    auto dim_x = ctx->GetInputDim("X");
    auto dim_imgsize = ctx->GetInputDim("ImgSize");
    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
    auto class_num = ctx->Attrs().Get<int>("class_num");

    // Validate the attributes before deriving anything from them, so that a
    // bad attribute is reported as such rather than as a channel mismatch.
    PADDLE_ENFORCE_GT(anchors.size(), 0,
                      "Attr(anchors) length should be greater than 0.");
    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
                      "Attr(anchors) length should be even integer.");
    PADDLE_ENFORCE_GT(class_num, 0,
                      "Attr(class_num) should be an integer greater than 0.");
    int anchor_num = anchors.size() / 2;

    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
    PADDLE_ENFORCE_EQ(
        dim_x[1], anchor_num * (5 + class_num),
        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
        "+ class_num)).");
    PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2,
                      "Input(ImgSize) should be a 2-D tensor.");
    // Batch sizes can only be compared once both are known; at compile time
    // either of them may still be a non-positive placeholder.
    if (ctx->IsRuntime() || (dim_imgsize[0] > 0 && dim_x[0] > 0)) {
      PADDLE_ENFORCE_EQ(
          dim_imgsize[0], dim_x[0],
          "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.");
    }
    PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2.");

    // With unknown H/W at compile time the product of two placeholders would
    // yield a bogus positive box count, so propagate "unknown" (-1) instead.
    int box_num = -1;
    if (ctx->IsRuntime() || (dim_x[2] > 0 && dim_x[3] > 0)) {
      box_num = dim_x[2] * dim_x[3] * anchor_num;
    }

    std::vector<int64_t> dim_boxes({dim_x[0], box_num, 4});
    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes));

    std::vector<int64_t> dim_scores({dim_x[0], box_num, class_num});
    ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores));
  }

 protected:
  // The kernel data type follows the type of input X.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
                                   ctx.GetPlace());
  }
};
// Proto maker for the "yolo_box" operator: declares its inputs (X, ImgSize),
// outputs (Boxes, Scores), attributes and the user-facing documentation.
class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
// ---- inputs ----
AddInput("X",
"The input tensor of YoloBox operator is a 4-D tensor with "
"shape of [N, C, H, W]. The second dimension(C) stores "
"box locations, confidence score and classification one-hot "
"keys of each anchor box. Generally, X should be the output "
"of YOLOv3 network.");
AddInput("ImgSize",
"The image size tensor of YoloBox operator, "
"This is a 2-D tensor with shape of [N, 2]. This tensor holds "
"height and width of each input image used for resizing output "
"box in input image scale.");
// ---- outputs ----
AddOutput("Boxes",
"The output tensor of detection boxes of YoloBox operator, "
"This is a 3-D tensor with shape of [N, M, 4], N is the "
"batch num, M is output box number, and the 3rd dimension "
"stores [xmin, ymin, xmax, ymax] coordinates of boxes.");
AddOutput("Scores",
"The output tensor of detection boxes scores of YoloBox "
"operator, This is a 3-D tensor with shape of "
"[N, M, :attr:`class_num`], N is the batch num, M is "
"output box number.");
// ---- attributes ----
// class_num has no default and must be supplied by the caller.
AddAttr<int>("class_num", "The number of classes to predict.");
// anchors defaults to an empty list.
AddAttr<std::vector<int>>("anchors",
"The anchor width and height, "
"it will be parsed pair by pair.")
.SetDefault(std::vector<int>{});
// downsample_ratio defaults to 32.
AddAttr<int>("downsample_ratio",
"The downsample ratio from network input to YoloBox operator "
"input, so 32, 16, 8 should be set for the first, second, "
"and thrid YoloBox operators.")
.SetDefault(32);
// conf_thresh defaults to 0.01.
AddAttr<float>("conf_thresh",
"The confidence scores threshold of detection boxes. "
"Boxes with confidence scores under threshold should "
"be ignored.")
.SetDefault(0.01);
// Operator-level documentation; the raw string below is emitted verbatim.
AddComment(R"DOC(
This operator generates YOLO detection boxes from output of YOLOv3 network.
The output of previous network is in shape [N, C, H, W], while H and W
should be the same, H and W specify the grid size, each grid point predict
given number boxes, this given number, which following will be represented as S,
is specified by the number of anchors. In the second dimension(the channel
dimension), C should be equal to S * (5 + class_num), class_num is the object
category number of source dataset(such as 80 in coco dataset), so the
second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
also includes confidence score of the box and class one-hot key of each anchor
box.
Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box
predictions should be as follows:
$$
b_x = \\sigma(t_x) + c_x
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
$$
in the equation above, :math:`c_x, c_y` is the left top corner of current grid
and :math:`p_w, p_h` is specified by anchors.
The logistic regression value of the 5th channel of each anchor prediction boxes
represents the confidence score of each prediction box, and the logistic
regression value of the last :attr:`class_num` channels of each anchor prediction
boxes represents the classifcation scores. Boxes with confidence scores less than
:attr:`conf_thresh` should be ignored, and box final scores is the product of
confidence scores and classification scores.
$$
score_{pred} = score_{conf} * score_{class}
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the "yolo_box" operator with its proto maker; EmptyGradOpMaker is
// passed as the gradient maker.
REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker,
paddle::framework::EmptyGradOpMaker);
// CPU kernels: float and double.
REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>,
ops::YoloBoxKernel<double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/yolo_box_op.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// CUDA kernel computing YOLO boxes and per-class scores.
//
// Work items are the n * box_num candidate boxes, visited with a grid-stride
// loop. A flat work-item id is decomposed into (batch, anchor, row, col).
// Per-box entries of `input` are located through GetEntryIndex: entry 0 is
// used for the box, entry 4 for the confidence and entry 5 onwards for the
// class scores. Candidates whose sigmoid(confidence) is below `conf_thresh`
// are skipped and their output slots are left untouched.
template <typename T>
__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
                            T* scores, const float conf_thresh,
                            const int* anchors, const int n, const int h,
                            const int w, const int an_num, const int class_num,
                            const int box_num, int input_size) {
  const int step = blockDim.x * gridDim.x;
  const int grid_num = h * w;
  const int an_stride = (5 + class_num) * grid_num;

  T box[4];
  for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n * box_num;
       tid += step) {
    // Decompose the flat id into batch / anchor / grid-row / grid-col.
    const int batch = tid / box_num;
    const int an_idx = (tid % box_num) / grid_num;
    const int row = (tid % grid_num) / w;
    const int col = tid % w;
    const int hw_idx = row * w + col;

    const int obj_idx =
        GetEntryIndex(batch, an_idx, hw_idx, an_num, an_stride, grid_num, 4);
    const T conf = sigmoid<T>(input[obj_idx]);
    if (conf < conf_thresh) {
      continue;
    }

    // ImgSize stores two ints per image: height first, then width.
    const int img_height = imgsize[2 * batch];
    const int img_width = imgsize[2 * batch + 1];

    const int in_box_idx =
        GetEntryIndex(batch, an_idx, hw_idx, an_num, an_stride, grid_num, 0);
    GetYoloBox<T>(box, input, anchors, col, row, an_idx, h, input_size,
                  in_box_idx, grid_num, img_height, img_width);

    // Flat index of this candidate in the output tensors.
    const int out_idx = batch * box_num + an_idx * grid_num + hw_idx;
    CalcDetectionBox<T>(boxes, box, out_idx * 4, img_height, img_width);

    const int label_idx =
        GetEntryIndex(batch, an_idx, hw_idx, an_num, an_stride, grid_num, 5);
    CalcLabelScore<T>(scores, input, label_idx, out_idx * class_num, class_num,
                      conf, grid_num);
  }
}
// CUDA OpKernel for "yolo_box".
//
// Steps: read inputs and attributes, copy the anchor list into a temporary
// device buffer, allocate and zero-fill the "Boxes" and "Scores" outputs,
// then launch KeYoloBoxFw with 512 threads per block and at most 8 blocks on
// the device context's stream.
template <typename T>
class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_x = ctx.Input<Tensor>("X");
    auto* in_img_size = ctx.Input<Tensor>("ImgSize");
    auto* out_boxes = ctx.Output<Tensor>("Boxes");
    auto* out_scores = ctx.Output<Tensor>("Scores");

    auto anchors = ctx.Attr<std::vector<int>>("anchors");
    int class_num = ctx.Attr<int>("class_num");
    float conf_thresh = ctx.Attr<float>("conf_thresh");
    int downsample_ratio = ctx.Attr<int>("downsample_ratio");

    const int n = in_x->dims()[0];
    const int h = in_x->dims()[2];
    const int w = in_x->dims()[3];
    const int box_num = out_boxes->dims()[1];
    const int an_num = anchors.size() / 2;
    int input_size = downsample_ratio * h;

    auto& dev_ctx = ctx.cuda_device_context();
    auto stream = dev_ctx.stream();

    // Stage the anchors in a temporary device buffer.
    auto& allocator =
        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
    const size_t anchor_bytes = sizeof(int) * anchors.size();
    auto anchors_buf = allocator.Allocate(anchor_bytes);
    int* anchors_dev = reinterpret_cast<int*>(anchors_buf->ptr());
    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    const auto cpu_place = platform::CPUPlace();
    memory::Copy(gpu_place, anchors_dev, cpu_place, anchors.data(),
                 anchor_bytes, stream);

    const T* x_data = in_x->data<T>();
    const int* img_size_data = in_img_size->data<int>();
    T* boxes_data =
        out_boxes->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
    T* scores_data =
        out_scores->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());

    // The launched kernel skips low-confidence candidates, so both outputs
    // are cleared up front.
    math::SetConstant<platform::CUDADeviceContext, T> fill;
    fill(dev_ctx, out_boxes, static_cast<T>(0));
    fill(dev_ctx, out_scores, static_cast<T>(0));

    const int kThreads = 512;
    const int kMaxBlocks = 8;
    int num_blocks = (n * box_num + kThreads - 1) / kThreads;
    if (num_blocks > kMaxBlocks) {
      num_blocks = kMaxBlocks;
    }

    KeYoloBoxFw<T><<<num_blocks, kThreads, 0, stream>>>(
        x_data, img_size_data, boxes_data, scores_data, conf_thresh,
        anchors_dev, n, h, w, an_num, class_num, box_num, input_size);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// CUDA kernels for "yolo_box": float and double.
REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel<float>,
ops::YoloBoxOpCUDAKernel<double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
// Logistic function 1 / (1 + exp(-x)).
// The double literals drive the arithmetic; the quotient is converted to T
// on return.
template <typename T>
HOSTDEVICE inline T sigmoid(T x) {
  const auto denom = 1.0 + std::exp(-x);
  return 1.0 / denom;
}
// Decodes one raw prediction into box[0..3].
//
// The four raw values are read from x at index, index + stride,
// index + 2 * stride and index + 3 * stride.
//   box[0] = (i + sigmoid(raw0)) * img_width  / grid_size
//   box[1] = (j + sigmoid(raw1)) * img_height / grid_size
//   box[2] = exp(raw2) * anchors[2 * an_idx]     * img_width  / input_size
//   box[3] = exp(raw3) * anchors[2 * an_idx + 1] * img_height / input_size
template <typename T>
HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i,
                                  int j, int an_idx, int grid_size,
                                  int input_size, int index, int stride,
                                  int img_height, int img_width) {
  const T* raw = x + index;
  const int* anchor = anchors + 2 * an_idx;

  // Offsets within the grid cell, scaled to the image.
  box[0] = (i + sigmoid<T>(raw[0])) * img_width / grid_size;
  box[1] = (j + sigmoid<T>(raw[stride])) * img_height / grid_size;

  // Extents from the anchor pair, scaled to the image.
  box[2] = std::exp(raw[2 * stride]) * anchor[0] * img_width / input_size;
  box[3] = std::exp(raw[3 * stride]) * anchor[1] * img_height / input_size;
}
// Flat offset of one entry of one anchor at one grid position:
//   (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx
HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
                                    int an_num, int an_stride, int stride,
                                    int entry) {
  const int anchor_slot = batch * an_num + an_idx;
  const int anchor_base = anchor_slot * an_stride;
  const int entry_offset = entry * stride;
  return anchor_base + entry_offset + hw_idx;
}
// Writes one box into boxes[box_idx .. box_idx + 3].
//
// The input `box` is read as (box[0], box[1]) +/- half of (box[2], box[3]),
// giving two corner points. The first corner is clamped to be no smaller
// than 0; the second is clamped to img_width - 1 and img_height - 1.
template <typename T>
HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx,
                                        const int img_height,
                                        const int img_width) {
  T* out = boxes + box_idx;

  out[0] = box[0] - box[2] / 2;
  out[1] = box[1] - box[3] / 2;
  out[2] = box[0] + box[2] / 2;
  out[3] = box[1] + box[3] / 2;

  // Lower bound of the first corner.
  if (!(out[0] > 0)) {
    out[0] = static_cast<T>(0);
  }
  if (!(out[1] > 0)) {
    out[1] = static_cast<T>(0);
  }

  // Upper bound of the second corner.
  if (!(out[2] < img_width - 1)) {
    out[2] = static_cast<T>(img_width - 1);
  }
  if (!(out[3] < img_height - 1)) {
    out[3] = static_cast<T>(img_height - 1);
  }
}
// Writes one score per class: scores[score_idx + c] = conf * sigmoid(logit_c),
// where the logit of class c is read from input[label_idx + c * stride].
template <typename T>
HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input,
                                      const int label_idx, const int score_idx,
                                      const int class_num, const T conf,
                                      const int stride) {
  for (int cls = 0; cls < class_num; ++cls) {
    const T class_prob = sigmoid<T>(input[label_idx + cls * stride]);
    scores[score_idx + cls] = conf * class_prob;
  }
}
// CPU kernel of the yolo_box op.
//
// Reads the prediction tensor "X" and the per-image sizes "ImgSize", and fills
// "Boxes" ({n, box_num, 4}) and "Scores" ({n, box_num, class_num}). Both
// outputs are zero-initialised; only predictions whose objectness confidence
// reaches `conf_thresh` are decoded and written.
template <typename T>
class YoloBoxKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Inputs / outputs.
    auto* x_tensor = ctx.Input<Tensor>("X");
    auto* img_size_tensor = ctx.Input<Tensor>("ImgSize");
    auto* boxes_tensor = ctx.Output<Tensor>("Boxes");
    auto* scores_tensor = ctx.Output<Tensor>("Scores");

    // Attributes.
    auto anchors = ctx.Attr<std::vector<int>>("anchors");
    int class_num = ctx.Attr<int>("class_num");
    float conf_thresh = ctx.Attr<float>("conf_thresh");
    int downsample_ratio = ctx.Attr<int>("downsample_ratio");

    // Geometry derived from the input/output shapes.
    const int n = x_tensor->dims()[0];
    const int h = x_tensor->dims()[2];
    const int w = x_tensor->dims()[3];
    const int box_num = boxes_tensor->dims()[1];
    const int an_num = anchors.size() / 2;
    int input_size = downsample_ratio * h;
    const int stride = h * w;                        // elements per channel
    const int an_stride = (class_num + 5) * stride;  // elements per anchor

    // Copy the anchor list into a tensor buffer on the kernel's place.
    Tensor anchor_tensor;
    auto anchors_data =
        anchor_tensor.mutable_data<int>({an_num * 2}, ctx.GetPlace());
    std::copy(anchors.begin(), anchors.end(), anchors_data);

    const T* input_data = x_tensor->data<T>();
    const int* imgsize_data = img_size_tensor->data<int>();

    // Allocate and clear the outputs so skipped predictions stay zero.
    T* boxes_data =
        boxes_tensor->mutable_data<T>({n, box_num, 4}, ctx.GetPlace());
    memset(boxes_data, 0, boxes_tensor->numel() * sizeof(T));
    T* scores_data =
        scores_tensor->mutable_data<T>({n, box_num, class_num}, ctx.GetPlace());
    memset(scores_data, 0, scores_tensor->numel() * sizeof(T));

    T decoded[4];  // scratch: center x, center y, width, height
    for (int batch = 0; batch < n; batch++) {
      // ImgSize stores (height, width) per image.
      const int img_height = imgsize_data[2 * batch];
      const int img_width = imgsize_data[2 * batch + 1];

      for (int an = 0; an < an_num; an++) {
        for (int row = 0; row < h; row++) {
          for (int col = 0; col < w; col++) {
            const int hw_idx = row * w + col;

            // Channel 4 holds the objectness logit.
            const int obj_idx = GetEntryIndex(batch, an, hw_idx, an_num,
                                              an_stride, stride, 4);
            const T conf = sigmoid<T>(input_data[obj_idx]);
            if (conf < conf_thresh) {
              continue;
            }

            // Channels 0..3 hold the box prediction.
            const int pred_idx = GetEntryIndex(batch, an, hw_idx, an_num,
                                               an_stride, stride, 0);
            GetYoloBox<T>(decoded, input_data, anchors_data, col, row, an, h,
                          input_size, pred_idx, stride, img_height, img_width);

            // Position of this prediction in the flattened outputs.
            const int out_slot = batch * box_num + an * stride + hw_idx;
            CalcDetectionBox<T>(boxes_data, decoded, out_slot * 4, img_height,
                                img_width);

            // Channels 5.. hold the per-class logits.
            const int label_idx = GetEntryIndex(batch, an, hw_idx, an_num,
                                                an_stride, stride, 5);
            CalcLabelScore<T>(scores_data, input_data, label_idx,
                              out_slot * class_num, class_num, conf, stride);
          }
        }
      }
    }
  }
};
} // namespace operators
} // namespace paddle
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detection/yolov3_loss_op.h" #include "paddle/fluid/operators/detection/yolov3_loss_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
...@@ -72,6 +73,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel { ...@@ -72,6 +73,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_GT(class_num, 0, PADDLE_ENFORCE_GT(class_num, 0,
"Attr(class_num) should be an integer greater then 0."); "Attr(class_num) should be an integer greater then 0.");
if (ctx->HasInput("GTScore")) {
auto dim_gtscore = ctx->GetInputDim("GTScore");
PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2,
"Input(GTScore) should be a 2-D tensor");
PADDLE_ENFORCE_EQ(
dim_gtscore[0], dim_gtbox[0],
"Input(GTBox) and Input(GTScore) dim[0] should be same");
PADDLE_ENFORCE_EQ(
dim_gtscore[1], dim_gtbox[1],
"Input(GTBox) and Input(GTScore) dim[1] should be same");
}
std::vector<int64_t> dim_out({dim_x[0]}); std::vector<int64_t> dim_out({dim_x[0]});
ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
...@@ -112,6 +125,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -112,6 +125,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
"This is a 2-D tensor with shape of [N, max_box_num], " "This is a 2-D tensor with shape of [N, max_box_num], "
"and each element should be an integer to indicate the " "and each element should be an integer to indicate the "
"box class id."); "box class id.");
AddInput("GTScore",
"The score of GTLabel, This is a 2-D tensor in same shape "
"GTLabel, and score values should in range (0, 1). This "
"input is for GTLabel score can be not 1.0 in image mixup "
"augmentation.")
.AsDispensable();
AddOutput("Loss", AddOutput("Loss",
"The output yolov3 loss tensor, " "The output yolov3 loss tensor, "
"This is a 1-D tensor with shape of [N]"); "This is a 1-D tensor with shape of [N]");
...@@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("ignore_thresh", AddAttr<float>("ignore_thresh",
"The ignore threshold to ignore confidence loss.") "The ignore threshold to ignore confidence loss.")
.SetDefault(0.7); .SetDefault(0.7);
AddAttr<bool>("use_label_smooth",
"Whether to use label smooth. Default True.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
This operator generates yolov3 loss based on given predict result and ground This operator generates yolov3 loss based on given predict result and ground
truth boxes. truth boxes.
...@@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
loss = (loss_{xy} + loss_{wh}) * weight_{box} loss = (loss_{xy} + loss_{wh}) * weight_{box}
+ loss_{conf} + loss_{class} + loss_{conf} + loss_{class}
$$ $$
While :attr:`use_label_smooth` is set to be :attr:`True`, the classification
target will be smoothed when calculating classification loss, target of
positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of
negetive samples will be smoothed to :math:`1.0 / class\_num`.
While :attr:`GTScore` is given, which means the mixup score of ground truth
boxes, all losses incured by a ground truth box will be multiplied by its
mixup score.
)DOC"); )DOC");
} }
}; };
...@@ -240,6 +271,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { ...@@ -240,6 +271,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
op->SetInput("X", Input("X")); op->SetInput("X", Input("X"));
op->SetInput("GTBox", Input("GTBox")); op->SetInput("GTBox", Input("GTBox"));
op->SetInput("GTLabel", Input("GTLabel")); op->SetInput("GTLabel", Input("GTLabel"));
op->SetInput("GTScore", Input("GTScore"));
op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
op->SetInput("ObjectnessMask", Output("ObjectnessMask")); op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
op->SetInput("GTMatchMask", Output("GTMatchMask")); op->SetInput("GTMatchMask", Output("GTMatchMask"));
...@@ -249,6 +281,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { ...@@ -249,6 +281,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("GTBox"), {}); op->SetOutput(framework::GradVarName("GTBox"), {});
op->SetOutput(framework::GradVarName("GTLabel"), {}); op->SetOutput(framework::GradVarName("GTLabel"), {});
op->SetOutput(framework::GradVarName("GTScore"), {});
return std::unique_ptr<framework::OpDesc>(op); return std::unique_ptr<framework::OpDesc>(op);
} }
}; };
......
...@@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) { ...@@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) {
} }
template <typename T> template <typename T>
static T L2Loss(T x, T y) { static T L1Loss(T x, T y) {
return 0.5 * (y - x) * (y - x); return std::abs(y - x);
} }
template <typename T> template <typename T>
...@@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) { ...@@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) {
} }
template <typename T> template <typename T>
static T L2LossGrad(T x, T y) { static T L1LossGrad(T x, T y) {
return x - y; return x > y ? 1.0 : -1.0;
} }
static int GetMaskIndex(std::vector<int> mask, int val) { static int GetMaskIndex(std::vector<int> mask, int val) {
...@@ -121,47 +121,49 @@ template <typename T> ...@@ -121,47 +121,49 @@ template <typename T>
static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt, static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
std::vector<int> anchors, int an_idx, std::vector<int> anchors, int an_idx,
int box_idx, int gi, int gj, int grid_size, int box_idx, int gi, int gj, int grid_size,
int input_size, int stride) { int input_size, int stride, T score) {
T tx = gt.x * grid_size - gi; T tx = gt.x * grid_size - gi;
T ty = gt.y * grid_size - gj; T ty = gt.y * grid_size - gj;
T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
T scale = (2.0 - gt.w * gt.h); T scale = (2.0 - gt.w * gt.h) * score;
loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale; loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale; loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
loss[0] += L2Loss<T>(input[box_idx + 2 * stride], tw) * scale; loss[0] += L1Loss<T>(input[box_idx + 2 * stride], tw) * scale;
loss[0] += L2Loss<T>(input[box_idx + 3 * stride], th) * scale; loss[0] += L1Loss<T>(input[box_idx + 3 * stride], th) * scale;
} }
template <typename T> template <typename T>
static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input, static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input,
Box<T> gt, std::vector<int> anchors, Box<T> gt, std::vector<int> anchors,
int an_idx, int box_idx, int gi, int gj, int an_idx, int box_idx, int gi, int gj,
int grid_size, int input_size, int stride) { int grid_size, int input_size, int stride,
T score) {
T tx = gt.x * grid_size - gi; T tx = gt.x * grid_size - gi;
T ty = gt.y * grid_size - gj; T ty = gt.y * grid_size - gj;
T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
T scale = (2.0 - gt.w * gt.h); T scale = (2.0 - gt.w * gt.h) * score;
input_grad[box_idx] = input_grad[box_idx] =
SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss; SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
input_grad[box_idx + stride] = input_grad[box_idx + stride] =
SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss; SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
input_grad[box_idx + 2 * stride] = input_grad[box_idx + 2 * stride] =
L2LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss; L1LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
input_grad[box_idx + 3 * stride] = input_grad[box_idx + 3 * stride] =
L2LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss; L1LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
} }
template <typename T> template <typename T>
static inline void CalcLabelLoss(T* loss, const T* input, const int index, static inline void CalcLabelLoss(T* loss, const T* input, const int index,
const int label, const int class_num, const int label, const int class_num,
const int stride) { const int stride, const T pos, const T neg,
T score) {
for (int i = 0; i < class_num; i++) { for (int i = 0; i < class_num; i++) {
T pred = input[index + i * stride]; T pred = input[index + i * stride];
loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? 1.0 : 0.0); loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? pos : neg) * score;
} }
} }
...@@ -169,11 +171,13 @@ template <typename T> ...@@ -169,11 +171,13 @@ template <typename T>
static inline void CalcLabelLossGrad(T* input_grad, const T loss, static inline void CalcLabelLossGrad(T* input_grad, const T loss,
const T* input, const int index, const T* input, const int index,
const int label, const int class_num, const int label, const int class_num,
const int stride) { const int stride, const T pos, const T neg,
T score) {
for (int i = 0; i < class_num; i++) { for (int i = 0; i < class_num; i++) {
T pred = input[index + i * stride]; T pred = input[index + i * stride];
input_grad[index + i * stride] = input_grad[index + i * stride] =
SigmoidCrossEntropyGrad<T>(pred, (i == label) ? 1.0 : 0.0) * loss; SigmoidCrossEntropyGrad<T>(pred, (i == label) ? pos : neg) * score *
loss;
} }
} }
...@@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness, ...@@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
for (int l = 0; l < w; l++) { for (int l = 0; l < w; l++) {
T obj = objness[k * w + l]; T obj = objness[k * w + l];
if (obj > 1e-5) { if (obj > 1e-5) {
// positive sample: obj = 1 // positive sample: obj = mixup score
loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0); loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0) * obj;
} else if (obj > -0.5) { } else if (obj > -0.5) {
// negetive sample: obj = 0 // negetive sample: obj = 0
loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0); loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
...@@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss, ...@@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
T obj = objness[k * w + l]; T obj = objness[k * w + l];
if (obj > 1e-5) { if (obj > 1e-5) {
input_grad[k * w + l] = input_grad[k * w + l] =
SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * loss[i]; SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * obj *
loss[i];
} else if (obj > -0.5) { } else if (obj > -0.5) {
input_grad[k * w + l] = input_grad[k * w + l] =
SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i]; SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
...@@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> { ...@@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox"); auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel"); auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto* gt_score = ctx.Input<Tensor>("GTScore");
auto* loss = ctx.Output<Tensor>("Loss"); auto* loss = ctx.Output<Tensor>("Loss");
auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask"); auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask"); auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
...@@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> { ...@@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
float ignore_thresh = ctx.Attr<float>("ignore_thresh"); float ignore_thresh = ctx.Attr<float>("ignore_thresh");
int downsample_ratio = ctx.Attr<int>("downsample_ratio"); int downsample_ratio = ctx.Attr<int>("downsample_ratio");
bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
const int n = input->dims()[0]; const int n = input->dims()[0];
const int h = input->dims()[2]; const int h = input->dims()[2];
...@@ -272,6 +279,13 @@ class Yolov3LossKernel : public framework::OpKernel<T> { ...@@ -272,6 +279,13 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
const int stride = h * w; const int stride = h * w;
const int an_stride = (class_num + 5) * stride; const int an_stride = (class_num + 5) * stride;
T label_pos = 1.0;
T label_neg = 0.0;
if (use_label_smooth) {
label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
label_neg = 1.0 / static_cast<T>(class_num);
}
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* gt_box_data = gt_box->data<T>(); const T* gt_box_data = gt_box->data<T>();
const int* gt_label_data = gt_label->data<int>(); const int* gt_label_data = gt_label->data<int>();
...@@ -283,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel<T> { ...@@ -283,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
int* gt_match_mask_data = int* gt_match_mask_data =
gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace()); gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());
const T* gt_score_data;
if (!gt_score) {
Tensor gtscore;
gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
math::SetConstant<platform::CPUDeviceContext, T>()(
ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
static_cast<T>(1.0));
gt_score = &gtscore;
gt_score_data = gtscore.data<T>();
} else {
gt_score_data = gt_score->data<T>();
}
// calc valid gt box mask, avoid calc duplicately in following code // calc valid gt box mask, avoid calc duplicately in following code
Tensor gt_valid_mask; Tensor gt_valid_mask;
bool* gt_valid_mask_data = bool* gt_valid_mask_data =
...@@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel<T> { ...@@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
int mask_idx = GetMaskIndex(anchor_mask, best_n); int mask_idx = GetMaskIndex(anchor_mask, best_n);
gt_match_mask_data[i * b + t] = mask_idx; gt_match_mask_data[i * b + t] = mask_idx;
if (mask_idx >= 0) { if (mask_idx >= 0) {
T score = gt_score_data[i * b + t];
int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 0); an_stride, stride, 0);
CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n, CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n,
box_idx, gi, gj, h, input_size, stride); box_idx, gi, gj, h, input_size, stride, score);
int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
obj_mask_data[obj_idx] = 1.0; obj_mask_data[obj_idx] = score;
int label = gt_label_data[i * b + t]; int label = gt_label_data[i * b + t];
int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 5); an_stride, stride, 5);
CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label, CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
class_num, stride); class_num, stride, label_pos, label_neg, score);
} }
} }
} }
...@@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { ...@@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox"); auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel"); auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto* gt_score = ctx.Input<Tensor>("GTScore");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X")); auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss")); auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask"); auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
...@@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { ...@@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask"); auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
int downsample_ratio = ctx.Attr<int>("downsample_ratio"); int downsample_ratio = ctx.Attr<int>("downsample_ratio");
bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
const int n = input_grad->dims()[0]; const int n = input_grad->dims()[0];
const int c = input_grad->dims()[1]; const int c = input_grad->dims()[1];
...@@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { ...@@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
const int stride = h * w; const int stride = h * w;
const int an_stride = (class_num + 5) * stride; const int an_stride = (class_num + 5) * stride;
T label_pos = 1.0;
T label_neg = 0.0;
if (use_label_smooth) {
label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
label_neg = 1.0 / static_cast<T>(class_num);
}
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* gt_box_data = gt_box->data<T>(); const T* gt_box_data = gt_box->data<T>();
const int* gt_label_data = gt_label->data<int>(); const int* gt_label_data = gt_label->data<int>();
...@@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> { ...@@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace()); input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
const T* gt_score_data;
if (!gt_score) {
Tensor gtscore;
gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
math::SetConstant<platform::CPUDeviceContext, T>()(
ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
static_cast<T>(1.0));
gt_score = &gtscore;
gt_score_data = gtscore.data<T>();
} else {
gt_score_data = gt_score->data<T>();
}
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
for (int t = 0; t < b; t++) { for (int t = 0; t < b; t++) {
int mask_idx = gt_match_mask_data[i * b + t]; int mask_idx = gt_match_mask_data[i * b + t];
if (mask_idx >= 0) { if (mask_idx >= 0) {
T score = gt_score_data[i * b + t];
Box<T> gt = GetGtBox(gt_box_data, i, b, t); Box<T> gt = GetGtBox(gt_box_data, i, b, t);
int gi = static_cast<int>(gt.x * w); int gi = static_cast<int>(gt.x * w);
int gj = static_cast<int>(gt.y * h); int gj = static_cast<int>(gt.y * h);
int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 0); an_stride, stride, 0);
CalcBoxLocationLossGrad<T>( CalcBoxLocationLossGrad<T>(input_grad_data, loss_grad_data[i],
input_grad_data, loss_grad_data[i], input_data, gt, anchors, input_data, gt, anchors,
anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride); anchor_mask[mask_idx], box_idx, gi, gj, h,
input_size, stride, score);
int label = gt_label_data[i * b + t]; int label = gt_label_data[i * b + t];
int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
an_stride, stride, 5); an_stride, stride, 5);
CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data, CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
label_idx, label, class_num, stride); label_idx, label, class_num, stride, label_pos,
label_neg, score);
} }
} }
} }
......
...@@ -56,8 +56,7 @@ class FakeInitOp : public framework::OperatorBase { ...@@ -56,8 +56,7 @@ class FakeInitOp : public framework::OperatorBase {
class FakeInitOpVarTypeInference : public framework::VarTypeInference { class FakeInitOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {}
framework::BlockDesc *block) const override {}
}; };
class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker { class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -114,11 +114,10 @@ class MergeIdsOp : public framework::OperatorWithKernel { ...@@ -114,11 +114,10 @@ class MergeIdsOp : public framework::OperatorWithKernel {
class MergeIdsOpInferVarType : public framework::VarTypeInference { class MergeIdsOpInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
auto *input_var = block->Var(op_desc.Input("Ids")[0]); for (auto &out_var : ctx->Output("Out")) {
for (auto &out_var : op_desc.Output("Out")) { ctx->SetType(out_var, input_type);
block->Var(out_var)->SetType(input_var->GetType());
} }
} }
}; };
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed_ops/split_ids_op.h" #include "paddle/fluid/operators/distributed_ops/split_ids_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -71,11 +73,10 @@ class SplitIdsOp : public framework::OperatorWithKernel { ...@@ -71,11 +73,10 @@ class SplitIdsOp : public framework::OperatorWithKernel {
class SplitIdsOpInferVarType : public framework::VarTypeInference { class SplitIdsOpInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
auto *input_var = block->Var(op_desc.Input("Ids")[0]); for (auto &out_var : ctx->Output("Out")) {
for (auto &out_var : op_desc.Output("Out")) { ctx->SetType(out_var, input_type);
block->Var(out_var)->SetType(input_var->GetType());
} }
} }
}; };
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/expand_op.h" #include "paddle/fluid/operators/expand_op.h"
#include <memory>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
...@@ -138,12 +139,28 @@ class ExpandGradOp : public framework::OperatorWithKernel { ...@@ -138,12 +139,28 @@ class ExpandGradOp : public framework::OperatorWithKernel {
} }
}; };
// Builds the description of the `expand_grad` op for an `expand` op.
// The grad op receives the forward input X and the gradient of Out, produces
// the gradient of X, and inherits all attributes of the forward op.
class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto grad_op =
        std::unique_ptr<framework::OpDesc>(new framework::OpDesc());
    const auto out_grad_name = framework::GradVarName("Out");
    const auto x_grad_name = framework::GradVarName("X");

    grad_op->SetType("expand_grad");
    grad_op->SetInput("X", Input("X"));
    grad_op->SetInput(out_grad_name, OutputGrad("Out"));
    grad_op->SetOutput(x_grad_name, InputGrad("X"));
    grad_op->SetAttrMap(Attrs());
    return grad_op;
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::ExpandGradOpDescMaker);
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp); REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>, expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -81,6 +81,30 @@ struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> { ...@@ -81,6 +81,30 @@ struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>; template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
// CPU implementation of the moving-average scale update:
//   state' = rate * state + 1
//   accum' = rate * accum + cur_scale
//   scale' = accum' / state'
// Only element 0 of each tensor is read or written.
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx,
                  const framework::Tensor& in_accum,
                  const framework::Tensor& in_state, const T* cur_scale,
                  const float rate, framework::Tensor* out_state,
                  framework::Tensor* out_accum, framework::Tensor* out_scale) {
    // Read every input before writing any output.
    const T prev_accum = in_accum.data<T>()[0];
    const T prev_state = in_state.data<T>()[0];
    const T observed_scale = cur_scale[0];

    const T next_state = rate * prev_state + 1;
    const T next_accum = rate * prev_accum + observed_scale;
    const T next_scale = next_accum / next_state;

    out_state->mutable_data<T>(ctx.GetPlace())[0] = next_state;
    out_accum->mutable_data<T>(ctx.GetPlace())[0] = next_accum;
    out_scale->mutable_data<T>(ctx.GetPlace())[0] = next_scale;
  }
};

// Explicit instantiation for float on CPU.
template struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext,
                                               float>;
class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel { class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
public: public:
FakeQuantizeAbsMaxOp(const std::string& type, FakeQuantizeAbsMaxOp(const std::string& type,
...@@ -255,6 +279,78 @@ $$Out = round(X/scale * range)$$ ...@@ -255,6 +279,78 @@ $$Out = round(X/scale * range)$$
} }
}; };
// Operator definition for fake quantization with a moving-average scale.
// InferShape requires X, Out and OutScale; OutState and OutAccum are optional
// one-element outputs.
class FakeQuantizeMovingAverageAbsMaxOp : public framework::OperatorWithKernel {
 public:
  FakeQuantizeMovingAverageAbsMaxOp(const std::string& type,
                                    const framework::VariableNameMap& inputs,
                                    const framework::VariableNameMap& outputs,
                                    const framework::AttributeMap& attrs)
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

  void InferShape(framework::InferShapeContext* ctx) const override {
    // Mandatory variables.
    PADDLE_ENFORCE(
        ctx->HasInput("X"),
        "Input(X) of FakeQuantizeMovingAverageAbsMaxOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("Out"),
        "Output(Out) of FakeQuantizeMovingAverageAbsMaxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
                   "Output(OutScale) of FakeQuantizeMovingAverageAbsMaxOp "
                   "should not be null");

    // Out mirrors X; the scale is a single element.
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->SetOutputDim("OutScale", {1});

    // Optional running buffers, each a single element when present.
    const char* const optional_outputs[] = {"OutState", "OutAccum"};
    for (const char* name : optional_outputs) {
      if (ctx->HasOutput(name)) {
        ctx->SetOutputDim(name, {1});
      }
    }

    ctx->ShareLoD("X", /*->*/ "Out");
  }

 protected:
  // The kernel is selected by the data type of X and the current device.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto* x = ctx.Input<framework::LoDTensor>("X");
    return framework::OpKernelType(x->type(), ctx.device_context());
  }
};
// Declares the inputs, outputs, attributes and documentation of the
// fake_quantize_moving_average_abs_max op.
class FakeQuantizeMovingAverageAbsMaxOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(Tensor) Input is float data type.");
    AddInput("InScale", "Last scale.");
    // The running buffers are optional on both the input and output side.
    AddInput("InAccum", "Last accum.").AsDispensable();
    AddInput("InState", "Last state.").AsDispensable();
    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
    AddOutput("OutScale", " Current scale");
    AddOutput("OutState", "(Tensor) state buffer.").AsDispensable();
    AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable();
    AddAttr<float>("moving_rate", "(float, default 0.9) moving rate.")
        .SetDefault(0.9);
    AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
        .SetDefault(8)
        .AddCustomChecker([](const int& bit_length) {
          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
                         "'bit_length' should be between 1 and 16.");
        });
    AddAttr<bool>("is_test",
                  "(bool, default false) Set to true for inference only, false "
                  "for training. Some layers may run faster when this is true.")
        .SetDefault(false);
    // The scale formula below mirrors FindMovingAverageAbsMaxFunctor:
    //   state = rate * state + 1; accum = rate * accum + cur_scale;
    //   scale = accum / state
    // i.e. the decay applies to the previous accum/state, not to the new
    // observation. NOTE(review): cur_scale is presumably max(abs(X)) computed
    // by the kernel, which is not visible in this file -- confirm.
    AddComment(R"DOC(
FakeQuantize operator is used in static quantization.
$$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$
$$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$
)DOC");
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -273,6 +369,12 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, ...@@ -273,6 +369,12 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>); ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxOp,
ops::FakeQuantizeMovingAverageAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max, REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxOp, ops::FakeChannelWiseQuantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeAbsMaxOpMaker, ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
......
...@@ -147,6 +147,41 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> { ...@@ -147,6 +147,41 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>; template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
// CUDA specialization: pulls the three scalars (accum, state, current abs-max)
// to the host, applies the moving-average update there, and writes the three
// results back to device memory.
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
  void operator()(const platform::CUDADeviceContext& ctx,
                  const framework::Tensor& in_accum,
                  const framework::Tensor& in_state, const T* cur_scale,
                  const float rate, framework::Tensor* out_state,
                  framework::Tensor* out_accum, framework::Tensor* out_scale) {
    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());

    // Reads a single T from device memory into a host value.
    auto load_scalar = [&gpu_place](const T* device_src) -> T {
      T host_value;
      memory::Copy(platform::CPUPlace(), &host_value, gpu_place, device_src,
                   sizeof(T), 0);
      return host_value;
    };
    // Writes a single host T into the (allocated on demand) device tensor.
    auto store_scalar = [&gpu_place](framework::Tensor* device_dst,
                                     const T& host_value) {
      memory::Copy(gpu_place, device_dst->mutable_data<T>(gpu_place),
                   platform::CPUPlace(), &host_value, sizeof(T), 0);
    };

    const T prev_accum = load_scalar(in_accum.data<T>());
    const T prev_state = load_scalar(in_state.data<T>());
    const T abs_max = load_scalar(cur_scale);

    const T next_state = rate * prev_state + 1;
    const T next_accum = rate * prev_accum + abs_max;
    const T next_scale = next_accum / next_state;

    store_scalar(out_accum, next_accum);
    store_scalar(out_state, next_state);
    store_scalar(out_scale, next_scale);
  }
};

template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
                                               float>;
template <typename T> template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> { struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx, void operator()(const platform::CUDADeviceContext& ctx,
...@@ -178,3 +213,6 @@ REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, ...@@ -178,3 +213,6 @@ REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>); ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>); ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(
fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxKernel<CUDA, float>);
...@@ -42,12 +42,20 @@ struct FindRangeAbsMaxFunctor { ...@@ -42,12 +42,20 @@ struct FindRangeAbsMaxFunctor {
framework::Tensor* scales_arr, framework::Tensor* out_scale); framework::Tensor* scales_arr, framework::Tensor* out_scale);
}; };
// Primary template for the moving-average abs-max update. Only the
// per-device specializations are defined; this declaration documents the
// call signature they share and that the kernel uses:
//   ctx        device context the update runs on
//   in_accum   previous accumulated abs-max
//   in_state   previous state counter
//   cur_scale  pointer to the abs-max of the current input
//   rate       moving rate applied to the previous accum/state
//   out_state, out_accum, out_scale  receive the updated values
template <typename DeviceContext, typename T>
struct FindMovingAverageAbsMaxFunctor {
  void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
                  const framework::Tensor& in_state, const T* cur_scale,
                  const float rate, framework::Tensor* out_state,
                  framework::Tensor* out_accum, framework::Tensor* out_scale);
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> { class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X"); auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out"); auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale"); auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_s = out_scale->mutable_data<T>(context.GetPlace()); T* out_s = out_scale->mutable_data<T>(context.GetPlace());
...@@ -138,5 +146,54 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> { ...@@ -138,5 +146,54 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
} }
}; };
// Kernel for fake_quantize_moving_average_abs_max.
// In test mode the input is clipped and quantized with the incoming InScale.
// In training mode the abs-max of X is folded into the moving average
// (InAccum / InState), the refreshed scale is written to OutScale, and X is
// quantized with that new scale.
template <typename DeviceContext, typename T>
class FakeQuantizeMovingAverageAbsMaxKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<framework::Tensor>("X");
    auto* in_scale = context.Input<framework::Tensor>("InScale");
    auto* out = context.Output<framework::Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());

    bool is_test = context.Attr<bool>("is_test");
    int bit_length = context.Attr<int>("bit_length");
    // Largest representable magnitude: 2^(bit_length - 1) - 1.
    int bin_cnt = std::pow(2, bit_length - 1) - 1;
    auto& dev_ctx = context.template device_context<DeviceContext>();

    // testing
    if (is_test) {
      ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
                                                  bin_cnt, out);
      return;
    }

    // training
    auto* in_accum = context.Input<framework::Tensor>("InAccum");
    auto* in_state = context.Input<framework::Tensor>("InState");
    auto* out_state = context.Output<framework::Tensor>("OutState");
    auto* out_accum = context.Output<framework::Tensor>("OutAccum");
    auto* out_scale = context.Output<framework::Tensor>("OutScale");

    // These slots are declared dispensable, but the training path
    // dereferences all of them; fail with a clear message instead of a null
    // dereference.
    PADDLE_ENFORCE(in_accum != nullptr,
                   "Input(InAccum) of FakeQuantizeMovingAverageAbsMaxOp "
                   "should not be null when is_test is false.");
    PADDLE_ENFORCE(in_state != nullptr,
                   "Input(InState) of FakeQuantizeMovingAverageAbsMaxOp "
                   "should not be null when is_test is false.");
    PADDLE_ENFORCE(out_state != nullptr,
                   "Output(OutState) of FakeQuantizeMovingAverageAbsMaxOp "
                   "should not be null when is_test is false.");
    PADDLE_ENFORCE(out_accum != nullptr,
                   "Output(OutAccum) of FakeQuantizeMovingAverageAbsMaxOp "
                   "should not be null when is_test is false.");

    // Scratch buffer for the abs-max of the current input.
    auto& allocator =
        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
    auto cur_scale = allocator.Allocate(1 * sizeof(T));
    T* cur_scale_data = static_cast<T*>(cur_scale->ptr());

    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
                                          cur_scale_data);

    out_state->mutable_data<T>(context.GetPlace());
    out_accum->mutable_data<T>(context.GetPlace());
    out_scale->mutable_data<T>(context.GetPlace());
    float moving_rate = context.Attr<float>("moving_rate");

    FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(
        dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state,
        out_accum, out_scale);

    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
                                                bin_cnt, out);
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -55,17 +55,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -55,17 +55,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"The input tensor Input's rank of FCOp should be larger than " "The input tensor Input's rank of FCOp should be larger than "
"in_num_col_dims."); "in_num_col_dims.");
auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
PADDLE_ENFORCE_EQ(
in_mat_dims[1], w_dims[0],
"Fully Connected input and weigth size do not match. %s, %s");
std::vector<int64_t> output_dims; std::vector<int64_t> output_dims;
output_dims.reserve(static_cast<size_t>(in_num_col_dims + 1)); FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims);
for (int i = 0; i < in_num_col_dims; ++i) {
output_dims.push_back(in_dims[i]);
}
output_dims.push_back(w_dims[1]);
ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
ctx->ShareLoD("Input", "Out"); ctx->ShareLoD("Input", "Out");
...@@ -128,6 +119,9 @@ void FCOpMaker::Make() { ...@@ -128,6 +119,9 @@ void FCOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
"Skip calling InferShape() function in the runtime.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Fully Connected Operator. Fully Connected Operator.
...@@ -142,13 +136,20 @@ class FCOpKernel : public framework::OpKernel<T> { ...@@ -142,13 +136,20 @@ class FCOpKernel : public framework::OpKernel<T> {
void Compute(const paddle::framework::ExecutionContext& ctx) const override { void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace."); "It must use CPUPlace.");
auto input = ctx.Input<Tensor>("Input"); auto input = ctx.Input<framework::LoDTensor>("Input");
auto w = ctx.Input<Tensor>("W"); auto w = ctx.Input<Tensor>("W");
auto bias = ctx.Input<Tensor>("Bias"); auto bias = ctx.Input<Tensor>("Bias");
auto output = ctx.Output<Tensor>("Out"); auto output = ctx.Output<framework::LoDTensor>("Out");
int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
auto w_dims = w->dims(); auto w_dims = w->dims();
std::vector<int64_t> output_dims;
FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims);
output->Resize(framework::make_ddim(output_dims));
output->set_lod(input->lod());
auto out_dims = output->dims(); auto out_dims = output->dims();
int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; int M = framework::product(out_dims) / w_dims[1];
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* w_data = w->data<T>(); const T* w_data = w->data<T>();
......
...@@ -48,5 +48,21 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -48,5 +48,21 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override; void Make() override;
}; };
// Computes the output shape of a fully connected layer.
//
// in_dims is flattened to 2-D at in_num_col_dims; its second dimension must
// equal w_dims[0]. The first in_num_col_dims entries of in_dims followed by
// w_dims[1] are appended to out_dims (existing entries of out_dims are kept,
// so callers are expected to pass an empty vector).
inline void FCOutputSize(const framework::DDim& in_dims,
                         const framework::DDim& w_dims,
                         std::vector<int64_t>& out_dims,  // NOLINT
                         int in_num_col_dims) {
  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
  // Supply the two shapes the "%s, %s" placeholders refer to.
  PADDLE_ENFORCE_EQ(
      in_mat_dims[1], w_dims[0],
      "Fully Connected input and weight size do not match. %s, %s",
      in_mat_dims, w_dims);

  out_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
  for (int i = 0; i < in_num_col_dims; ++i) {
    out_dims.push_back(in_dims[i]);
  }
  out_dims.push_back(w_dims[1]);
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -39,12 +39,11 @@ class FillConstantOp : public framework::OperatorWithKernel { ...@@ -39,12 +39,11 @@ class FillConstantOp : public framework::OperatorWithKernel {
class FillConstantOpVarTypeInference : public framework::VarTypeInference { class FillConstantOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override {
auto data_type = static_cast<framework::proto::VarType::Type>( auto data_type = static_cast<framework::proto::VarType::Type>(
boost::get<int>(op_desc.GetAttr("dtype"))); boost::get<int>(ctx->GetAttr("dtype")));
auto& out_var_name = op_desc.Output("Out").front(); auto& out_var_name = ctx->Output("Out").front();
block->Var(out_var_name)->SetDataType(data_type); ctx->SetDataType(out_var_name, data_type);
} }
}; };
......
...@@ -88,7 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -88,7 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) " "(boolean, default false) "
"Sparse update.") "Sparse update.")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "") AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
"Skip calling InferShape() function in the runtime.")
.SetDefault(true); .SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
FusedEmbeddingSeqPool Operator. FusedEmbeddingSeqPool Operator.
...@@ -137,22 +138,20 @@ class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { ...@@ -137,22 +138,20 @@ class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
class FusedEmbeddingSeqPoolOpGradVarTypeInference class FusedEmbeddingSeqPoolOpGradVarTypeInference
: public framework::VarTypeInference { : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override { auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); auto attr = ctx->GetAttr("is_sparse");
auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr); bool is_sparse = boost::get<bool>(attr);
if (is_sparse) { if (is_sparse) {
VLOG(3) << "fused_embedding_seq_pool_grad op " VLOG(3) << "fused_embedding_seq_pool_grad op "
<< framework::GradVarName("W") << " is set to SelectedRows"; << framework::GradVarName("W") << " is set to SelectedRows";
block->Var(out_var_name) ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
->SetType(framework::proto::VarType::SELECTED_ROWS);
} else { } else {
VLOG(3) << "fused_embedding_seq_pool_grad op " VLOG(3) << "fused_embedding_seq_pool_grad op "
<< framework::GradVarName("W") << " is set to LoDTensor"; << framework::GradVarName("W") << " is set to LoDTensor";
block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
} }
block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
} }
}; };
......
...@@ -81,15 +81,12 @@ GetTensorFromSelectedRows is used to get the tensor from SelectedRows. ...@@ -81,15 +81,12 @@ GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
class GetTensorFromSelectedRowsOpVarTypeInference class GetTensorFromSelectedRowsOpVarTypeInference
: public framework::VarTypeInference { : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const { // NOLINT
framework::BlockDesc *block) const final { auto out_var_name = ctx->Output("Out").front();
auto out_var_name = op_desc.Output("Out").front(); auto in_var_name = ctx->Input("X").front();
auto in_var_name = op_desc.Input("X").front();
ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
auto out_var = block->FindRecursiveOrCreateVar(out_var_name); ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
auto in_var = block->FindRecursiveOrCreateVar(in_var_name);
out_var.SetType(framework::proto::VarType::LOD_TENSOR);
out_var.SetDataType(in_var.GetDataType());
} }
}; };
......
...@@ -54,7 +54,8 @@ $$Out = scale * X$$ ...@@ -54,7 +54,8 @@ $$Out = scale * X$$
)DOC"); )DOC");
AddAttr<int>("num_hash", "").SetDefault(1); AddAttr<int>("num_hash", "").SetDefault(1);
AddAttr<int>("mod_by", "").SetDefault(100000); AddAttr<int>("mod_by", "").SetDefault(100000);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "") AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
"Skip calling InferShape() function in the runtime.")
.SetDefault(true); .SetDefault(true);
} }
}; };
......
...@@ -197,38 +197,32 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { ...@@ -197,38 +197,32 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
class HierarchicalSigmoidGradOpGradVarTypeInference class HierarchicalSigmoidGradOpGradVarTypeInference
: public framework::VarTypeInference { : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override { auto w_grad_var_name = ctx->Output(framework::GradVarName("W")).front();
auto w_grad_var_name = op_desc.Output(framework::GradVarName("W")).front(); auto bias_grad_var_name_vec = ctx->Output(framework::GradVarName("Bias"));
auto bias_grad_var_name_vec =
op_desc.Output(framework::GradVarName("Bias"));
std::string bias_grad_var_name; std::string bias_grad_var_name;
bool hasBias = false; bool hasBias = false;
if (bias_grad_var_name_vec.size()) { if (bias_grad_var_name_vec.size()) {
hasBias = true; hasBias = true;
bias_grad_var_name = bias_grad_var_name = ctx->Output(framework::GradVarName("Bias")).front();
op_desc.Output(framework::GradVarName("Bias")).front();
} }
auto attr = op_desc.GetAttr("is_sparse"); auto attr = ctx->GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr); bool is_sparse = boost::get<bool>(attr);
if (is_sparse) { if (is_sparse) {
VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
<< " is set to SelectedRows"; << " is set to SelectedRows";
block->Var(w_grad_var_name) ctx->SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS);
->SetType(framework::proto::VarType::SELECTED_ROWS);
} else { } else {
VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor"; << " is set to LoDTensor";
block->Var(w_grad_var_name) ctx->SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR);
->SetType(framework::proto::VarType::LOD_TENSOR);
} }
if (hasBias) { if (hasBias) {
VLOG(30) << "hierarchical_sigmoid_grad op " VLOG(30) << "hierarchical_sigmoid_grad op "
<< framework::GradVarName("Bias") << " is set to LoDTensor"; << framework::GradVarName("Bias") << " is set to LoDTensor";
block->Var(bias_grad_var_name) ctx->SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR);
->SetType(framework::proto::VarType::LOD_TENSOR);
} }
block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); ctx->SetDataType(w_grad_var_name, ctx->GetDataType(ctx->Input("W")[0]));
} }
}; };
......
...@@ -64,11 +64,9 @@ class LoDRankTableInferShape : public framework::InferShapeBase { ...@@ -64,11 +64,9 @@ class LoDRankTableInferShape : public framework::InferShapeBase {
class LoDRankTableInferVarType : public framework::VarTypeInference { class LoDRankTableInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { for (auto &o : ctx->Output("Out")) {
for (auto &o : op_desc.Output("Out")) { ctx->SetType(o, framework::proto::VarType::LOD_RANK_TABLE);
block->FindRecursiveOrCreateVar(o).SetType(
framework::proto::VarType::LOD_RANK_TABLE);
} }
} }
}; };
......
...@@ -201,10 +201,9 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { ...@@ -201,10 +201,9 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase {
class LoDTensorToArrayInferVarType : public framework::VarTypeInference { class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { for (auto &out_var : ctx->Output("Out")) {
for (auto &out_var : op_desc.Output("Out")) { ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
} }
} }
}; };
......
...@@ -147,22 +147,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { ...@@ -147,22 +147,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override { auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); auto attr = ctx->GetAttr("is_sparse");
auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr); bool is_sparse = boost::get<bool>(attr);
if (is_sparse) { if (is_sparse) {
VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
<< " is set to SelectedRows"; << " is set to SelectedRows";
block->Var(out_var_name) ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
->SetType(framework::proto::VarType::SELECTED_ROWS);
} else { } else {
VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor"; << " is set to LoDTensor";
block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
} }
block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
} }
}; };
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "math.h" // NOLINT
namespace paddle {
namespace operators {
// real_exp: exponential overloads for float16, float and double.

// float16 is evaluated in single precision and converted back.
inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) {
  const float widened = static_cast<float>(x);
  return static_cast<platform::float16>(::expf(widened));
}

// Single precision.
inline HOSTDEVICE float real_exp(float x) {
  return ::expf(x);
}

// Double precision.
inline HOSTDEVICE double real_exp(double x) {
  return ::exp(x);
}
// real_log: natural-logarithm overloads for float16, float and double.

// float16 is evaluated in single precision and converted back.
inline HOSTDEVICE platform::float16 real_log(platform::float16 x) {
  const float widened = static_cast<float>(x);
  return static_cast<platform::float16>(::logf(widened));
}

// Single precision.
inline HOSTDEVICE float real_log(float x) {
  return ::logf(x);
}

// Double precision.
inline HOSTDEVICE double real_log(double x) {
  return ::log(x);
}
} // namespace operators
} // namespace paddle
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -20,17 +21,6 @@ namespace paddle { ...@@ -20,17 +21,6 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
namespace {
__device__ __forceinline__ float real_log(float x) { return logf(x); }
__device__ __forceinline__ double real_log(double x) { return log(x); }
__device__ __forceinline__ platform::float16 real_log(
const platform::float16& val) {
return static_cast<platform::float16>(logf(static_cast<float>(val)));
}
template <typename T> template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
const int N, const int D, const int N, const int D,
...@@ -61,7 +51,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, ...@@ -61,7 +51,6 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
Y[blockIdx.x] = -val; Y[blockIdx.x] = -val;
} }
} }
} // namespace
template <typename T> template <typename T>
class CrossEntropyFunctor<platform::CUDADeviceContext, T> { class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
......
...@@ -592,6 +592,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -592,6 +592,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler, platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
&dst_memory_p); &dst_memory_p);
} else { } else {
need_s8_to_u8 = fuse_relu;
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler, platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
&dst_memory_p); &dst_memory_p);
} }
......
...@@ -123,7 +123,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -123,7 +123,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>(); auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine(); const auto& mkldnn_engine = dev_ctx.GetEngine();
auto input = ctx.Input<Tensor>("Input"); auto input = ctx.Input<framework::LoDTensor>("Input");
auto w = ctx.Input<Tensor>("W"); auto w = ctx.Input<Tensor>("W");
auto bias = ctx.Input<Tensor>("Bias"); auto bias = ctx.Input<Tensor>("Bias");
...@@ -151,7 +151,13 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -151,7 +151,13 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* w_data = w->data<T>(); const T* w_data = w->data<T>();
auto output = ctx.Output<Tensor>("Out"); auto output = ctx.Output<framework::LoDTensor>("Out");
int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
std::vector<int64_t> output_dims;
FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims);
output->Resize(framework::make_ddim(output_dims));
output->set_lod(input->lod());
T* output_data = output->mutable_data<T>(ctx.GetPlace()); T* output_data = output->mutable_data<T>(ctx.GetPlace());
auto dst_memory = mem.dst(output_data); auto dst_memory = mem.dst(output_data);
...@@ -204,19 +210,21 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -204,19 +210,21 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input")); Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("W")); Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
const Tensor* input = ctx.Input<Tensor>("Input");
const T* input_data = input->data<T>();
const Tensor* w = ctx.Input<Tensor>("W");
const T* w_data = w->data<T>();
if (input_grad) { if (input_grad) {
input_grad->Resize(input->dims());
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
} }
if (w_grad) { if (w_grad) {
w_grad->Resize(w->dims());
w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace()); w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
} }
const Tensor* input = ctx.Input<Tensor>("Input");
const T* input_data = input->data<T>();
const Tensor* w = ctx.Input<Tensor>("W");
const T* w_data = w->data<T>();
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out")); const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
const T* out_grad_data = out_grad->data<T>(); const T* out_grad_data = out_grad->data<T>();
......
...@@ -73,6 +73,29 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -73,6 +73,29 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
} }
}; };
// INT8 MKL-DNN transpose kernel. No data is moved: after validating the axis
// attribute, the output shares the input's buffer and carries the input's
// MKL-DNN format.
template <typename T>
class TransposeINT8MKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    const std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
    const std::vector<int> axis_int8 = {0, 2, 3, 1};
    // NOTE(review): a single-element axis skips validation entirely;
    // presumably that is an intentional pass-through case -- confirm.
    if (axis.size() != 1) {
      PADDLE_ENFORCE_EQ(axis.size(), axis_int8.size(),
                        "Current INT8 MKLDNN Transpose kernel only supports "
                        "an axis with 4 elements.");
      for (size_t i = 0; i < axis.size(); i++) {
        PADDLE_ENFORCE_EQ(axis[i], axis_int8[i],
                          "Current INT8 MKLDNN Transpose kernel only support "
                          "axis with [0, 2, 3, 1] due to MKL-DNN kernel "
                          "implementation.");
      }
    }
    auto* input = ctx.Input<Tensor>("X");
    auto* output = ctx.Output<Tensor>("Out");
    // Alias the input buffer rather than copying it.
    output->ShareDataWith(*input);
    output->set_layout(DataLayout::kMKLDNN);
    output->set_format(input->format());
  }
};
template <typename T> template <typename T>
class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public: public:
...@@ -140,7 +163,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -140,7 +163,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
ops::TransposeMKLDNNOpKernel<float>); ops::TransposeMKLDNNOpKernel<float>,
ops::TransposeINT8MKLDNNOpKernel<uint8_t>,
ops::TransposeINT8MKLDNNOpKernel<int8_t>);
REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
ops::TransposeMKLDNNOpKernel<float>); ops::TransposeMKLDNNOpKernel<float>);
......
...@@ -60,12 +60,9 @@ class NCCLInitOp : public framework::OperatorBase { ...@@ -60,12 +60,9 @@ class NCCLInitOp : public framework::OperatorBase {
class NCCLInitOpVarTypeInference : public framework::VarTypeInference { class NCCLInitOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto out_var_name = ctx->Output("Communicator").front();
auto out_var_name = op_desc.Output("Communicator").front(); ctx->SetType(out_var_name, framework::proto::VarType::RAW);
auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
} }
}; };
......
...@@ -237,23 +237,21 @@ class NCEOpGrad : public framework::OperatorWithKernel { ...@@ -237,23 +237,21 @@ class NCEOpGrad : public framework::OperatorWithKernel {
class NCEOpGradVarTypeInference : public framework::VarTypeInference { class NCEOpGradVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto weight_grad = ctx->Output(framework::GradVarName("Weight")).front();
auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front();
auto attr = op_desc.GetAttr("is_sparse"); auto attr = ctx->GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr); bool is_sparse = boost::get<bool>(attr);
if (is_sparse) { if (is_sparse) {
VLOG(3) << "nce_op_grad op " << weight_grad << " and " VLOG(3) << "nce_op_grad op " << weight_grad << " and "
<< " is set to SelectedRows"; << " is set to SelectedRows";
block->Var(weight_grad) ctx->SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS);
->SetType(framework::proto::VarType::SELECTED_ROWS);
} else { } else {
VLOG(3) << "nce_op_grad op " << weight_grad << " and " VLOG(3) << "nce_op_grad op " << weight_grad << " and "
<< " is set to LoDTensor"; << " is set to LoDTensor";
block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); ctx->SetType(weight_grad, framework::proto::VarType::LOD_TENSOR);
} }
block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); ctx->SetDataType(weight_grad, ctx->GetDataType(ctx->Input("Input")[0]));
} }
}; };
......
...@@ -37,8 +37,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -37,8 +37,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
class NgraphEngineInferVarType : public framework::VarTypeInference { class NgraphEngineInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {}
framework::BlockDesc *block) const override {}
}; };
} // namespace operators } // namespace operators
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <math.h> // for sqrt in CPU and CUDA #include <math.h> // for sqrt in CPU and CUDA
#include <Eigen/Dense> #include <Eigen/Dense>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
...@@ -311,17 +312,17 @@ struct SparseAdamFunctor<T, CPUAdam> { ...@@ -311,17 +312,17 @@ struct SparseAdamFunctor<T, CPUAdam> {
T beta1_pow = *beta1_pow_; T beta1_pow = *beta1_pow_;
T beta2_pow = *beta2_pow_; T beta2_pow = *beta2_pow_;
lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
size_t row_count = numel / row_numel_; int64_t row_count = static_cast<int64_t>(numel / row_numel_);
for (size_t i = 0U, j = 0U; i != row_count; ++i) { for (int64_t i = 0, j = 0; i != row_count; ++i) {
if (i == *(rows_ + j)) { if (i == *(rows_ + j)) {
for (size_t k = 0U; k != row_numel_; ++k) { for (int64_t k = 0; k != row_numel_; ++k) {
T g = grad_[j * row_numel_ + k]; T g = grad_[j * row_numel_ + k];
adam_update(i * row_numel_ + k, g); adam_update(i * row_numel_ + k, g);
} }
++j; ++j;
} else { } else {
for (size_t k = 0U; k != row_numel_; ++k) { for (int64_t k = 0; k != row_numel_; ++k) {
T mom1 = moment1_[i * row_numel_ + k]; T mom1 = moment1_[i * row_numel_ + k];
T mom2 = moment2_[i * row_numel_ + k]; T mom2 = moment2_[i * row_numel_ + k];
T p = param_[i * row_numel_ + k]; T p = param_[i * row_numel_ + k];
...@@ -427,43 +428,23 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -427,43 +428,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
} }
} }
framework::SelectedRows cpu_grad_merge; framework::SelectedRows tmp_grad_merge;
const framework::SelectedRows* grad_merge_ptr; const framework::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) { if (is_strict_sorted) {
grad_merge_ptr = &grad; grad_merge_ptr = &grad;
} else { } else {
// merge duplicated rows if any. // merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor // The rows of grad_merge have been sorted inside MergeAdd functor
framework::SelectedRows* grad_merge_var;
scatter::MergeAdd<DeviceContext, T> merge_func; scatter::MergeAdd<DeviceContext, T> merge_func;
if (platform::is_cpu_place(ctx.GetPlace())) {
grad_merge_var = &cpu_grad_merge;
} else {
// FIXME(qiao): GPU also need to fix this
grad_merge_var = const_cast<framework::Scope&>(ctx.scope())
.Var()
->GetMutable<framework::SelectedRows>();
}
merge_func(ctx.template device_context<DeviceContext>(), grad, merge_func(ctx.template device_context<DeviceContext>(), grad,
grad_merge_var, true); &tmp_grad_merge, true);
grad_merge_ptr = grad_merge_var; grad_merge_ptr = &tmp_grad_merge;
} }
auto& grad_merge = *grad_merge_ptr; auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value(); auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>(); const T* grad_data = grad_tensor.template data<T>();
const int64_t* rows = nullptr; const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
// When compiled without CUDA, the CUDAData() interface should not be
// provided.
#if defined(PADDLE_WITH_CUDA)
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.rows().CUDAData(ctx.GetPlace());
} else {
#endif
rows = grad_merge.rows().data();
#if defined(PADDLE_WITH_CUDA)
}
#endif
auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
if (platform::is_cpu_place(ctx.GetPlace())) { if (platform::is_cpu_place(ctx.GetPlace())) {
...@@ -488,7 +469,7 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -488,7 +469,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
} }
} }
#ifndef _WIN32 #ifndef _WIN32
else if (FLAGS_inner_op_parallelism > 1 && else if (FLAGS_inner_op_parallelism > 1 && // NOLINT
min_row_size_to_use_multithread > 0 && min_row_size_to_use_multithread > 0 &&
param.dims()[0] > min_row_size_to_use_multithread) { param.dims()[0] > min_row_size_to_use_multithread) {
VLOG(3) << "use multi thread, inner_op_parallelism=" VLOG(3) << "use multi thread, inner_op_parallelism="
...@@ -516,11 +497,11 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -516,11 +497,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) { for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
int64_t start = i * line_in_each_thread; int64_t start = i * line_in_each_thread;
int64_t end = (i + 1) * line_in_each_thread; int64_t end = (i + 1) * line_in_each_thread;
if (start >= param_row_count) { if (start >= static_cast<int64_t>(param_row_count)) {
break; break;
} }
if (end > param_row_count) { if (end > static_cast<int64_t>(param_row_count)) {
end = param_row_count; end = static_cast<int64_t>(param_row_count);
} }
fs.push_back( fs.push_back(
framework::Async([&functor, &row_id_to_grad_row_offset, framework::Async([&functor, &row_id_to_grad_row_offset,
...@@ -545,8 +526,8 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -545,8 +526,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
} }
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
} }
#endif // !_WIN32 #endif // !_WIN32
else { else { // NOLINT
functor(param.numel()); functor(param.numel());
} }
} else if (platform::is_gpu_place(ctx.GetPlace())) { } else if (platform::is_gpu_place(ctx.GetPlace())) {
......
...@@ -56,9 +56,9 @@ This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each ...@@ -56,9 +56,9 @@ This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each
weight using a local learning rate: weight using a local learning rate:
$$ $$
local\_lr = \eta * local\_lr = \eta *
\frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\ \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
velocity = mu * velocity + velocity = mu * velocity +
local\_lr * (grad + \beta * param) \\ local\_lr * (grad + \beta * param) \\
param = param - velocity. \\ param = param - velocity. \\
$$ $$
...@@ -72,8 +72,7 @@ use L2 regularizers in case of using LARS. ...@@ -72,8 +72,7 @@ use L2 regularizers in case of using LARS.
class LarsMomentumOpVarTypeInference : public framework::VarTypeInference { class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {}
framework::BlockDesc *block) const override {}
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
...@@ -21,18 +21,14 @@ using Tensor = framework::Tensor; ...@@ -21,18 +21,14 @@ using Tensor = framework::Tensor;
class MomentumOpInferVarType : public framework::VarTypeInference { class MomentumOpInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override { auto& input_var = ctx->Input("Param")[0];
auto input_var = op_desc.Input("Param")[0]; for (auto& out_var : ctx->Output("ParamOut")) {
for (auto& out_var : op_desc.Output("ParamOut")) { if (ctx->GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) {
if (block->FindRecursiveOrCreateVar(input_var).GetType() == ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
framework::proto::VarType::SELECTED_ROWS) { } else if (ctx->GetType(input_var) ==
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::SELECTED_ROWS);
} else if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
framework::proto::VarType::LOD_TENSOR) { framework::proto::VarType::LOD_TENSOR) {
block->FindRecursiveOrCreateVar(out_var).SetType( ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR);
framework::proto::VarType::LOD_TENSOR);
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"Only support LodTensor and SelectedRows, Unexpected Input Type."); "Only support LodTensor and SelectedRows, Unexpected Input Type.");
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -69,6 +70,7 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -69,6 +70,7 @@ class MomentumOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("VelocityOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
...@@ -351,23 +353,14 @@ class MomentumOpKernel : public framework::OpKernel<T> { ...@@ -351,23 +353,14 @@ class MomentumOpKernel : public framework::OpKernel<T> {
VLOG(3) << "Grad SelectedRows contains no data!"; VLOG(3) << "Grad SelectedRows contains no data!";
return; return;
} }
auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
.Var() framework::SelectedRows tmp_merged_grad;
->GetMutable<framework::SelectedRows>(); framework::SelectedRows* merged_grad = &tmp_merged_grad;
math::scatter::MergeAdd<DeviceContext, T> merge_func; math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(ctx.template device_context<DeviceContext>(), *grad, merge_func(ctx.template device_context<DeviceContext>(), *grad,
merged_grad); merged_grad);
const int64_t* rows = nullptr; const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = merged_grad->rows().CUDAData(ctx.GetPlace());
} else {
#endif
rows = merged_grad->rows().data();
#ifdef PADDLE_WITH_CUDA
}
#endif
int64_t row_numel = int64_t row_numel =
merged_grad->value().numel() / merged_grad->rows().size(); merged_grad->value().numel() / merged_grad->rows().size();
platform::ForRange<DeviceContext> for_range( platform::ForRange<DeviceContext> for_range(
......
...@@ -216,24 +216,14 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -216,24 +216,14 @@ class RmspropOpKernel : public framework::OpKernel<T> {
} }
} else if (grad_var->IsType<framework::SelectedRows>()) { } else if (grad_var->IsType<framework::SelectedRows>()) {
auto &grad = grad_var->Get<framework::SelectedRows>(); auto &grad = grad_var->Get<framework::SelectedRows>();
auto *merged_grad = const_cast<framework::Scope &>(ctx.scope()) framework::SelectedRows tmp_merged_grad;
.Var() framework::SelectedRows *merged_grad = &tmp_merged_grad;
->GetMutable<framework::SelectedRows>();
math::scatter::MergeAdd<DeviceContext, T> merge_func; math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(dev_ctx, grad, merged_grad); merge_func(dev_ctx, grad, merged_grad);
platform::ForRange<DeviceContext> for_range(dev_ctx, limit); platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
const int64_t *rows; const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = merged_grad->rows().CUDAData(ctx.GetPlace());
} else {
#endif
rows = merged_grad->rows().data();
#ifdef PADDLE_WITH_CUDA
}
#endif
auto &merged_tensor = merged_grad->value(); auto &merged_tensor = merged_grad->value();
int64_t row_count = merged_grad->rows().size(); int64_t row_count = merged_grad->rows().size();
int64_t row_numel = merged_tensor.numel() / row_count; int64_t row_numel = merged_tensor.numel() / row_count;
......
...@@ -50,20 +50,18 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -50,20 +50,18 @@ class SGDOp : public framework::OperatorWithKernel {
class SGDOpInferVarType : public framework::VarTypeInference { class SGDOpInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto &input_var_n = ctx->Input("Param")[0];
auto input_var_n = op_desc.Input("Param")[0]; auto in_var_type = ctx->GetType(input_var_n);
auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType();
PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS || PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
in_var_type == framework::proto::VarType::LOD_TENSOR, in_var_type == framework::proto::VarType::LOD_TENSOR,
"The input Var's type should be LoDtensor or SelectedRows," "The input Var's type should be LoDtensor or SelectedRows,"
" but the received var(%s)'s type is %s", " but the received var(%s)'s type is %s",
input_var_n, in_var_type); input_var_n, in_var_type);
for (auto &out_var_n : op_desc.Output("ParamOut")) { for (auto &out_var_n : ctx->Output("ParamOut")) {
auto &out_var = block->FindRecursiveOrCreateVar(out_var_n); if (ctx->GetType(out_var_n) != in_var_type) {
if (out_var.GetType() != in_var_type) { ctx->SetType(out_var_n, in_var_type);
out_var.SetType(in_var_type);
} }
} }
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/operators/pool_op.h"
#include <unordered_map>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#endif #endif
...@@ -212,6 +213,12 @@ void Pool2dOpMaker::Make() { ...@@ -212,6 +213,12 @@ void Pool2dOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_quantizer",
"(bool, default false) "
"Set to true for operators that should be quantized and use "
"int8 kernel. "
"Only used on CPU.")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
......
...@@ -14,8 +14,11 @@ ...@@ -14,8 +14,11 @@
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include <memory>
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -91,15 +94,12 @@ static void CallPythonFunc(py::object *callable, ...@@ -91,15 +94,12 @@ static void CallPythonFunc(py::object *callable,
} }
} }
class PyFuncOpVarTypInference : public framework::VarTypeInference { class PyFuncOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { bool has_out = (ctx->HasOutput("Out") && !ctx->Output("Out").empty());
auto &outs = op.Outputs();
bool has_out = (outs.count("Out") > 0 && !outs.at("Out").empty());
auto &ins = op.Inputs(); bool has_in = (ctx->HasInput("X") && !ctx->Input("X").empty());
bool has_in = (ins.count("X") > 0 && !ins.at("X").empty());
/** /**
* X or Out can be empty, so that py_func can be more flexible * X or Out can be empty, so that py_func can be more flexible
...@@ -107,8 +107,8 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { ...@@ -107,8 +107,8 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
*/ */
PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist"); PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist");
PADDLE_ENFORCE_GE(boost::get<int>(op.GetAttr(kForwardPythonCallableId)), 0, PADDLE_ENFORCE_GE(boost::get<int>(ctx->GetAttr(kForwardPythonCallableId)),
"Function id cannot be less than 0"); 0, "Function id cannot be less than 0");
if (!has_out) return; if (!has_out) return;
...@@ -118,7 +118,7 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { ...@@ -118,7 +118,7 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
* the corresponding forward variable * the corresponding forward variable
*/ */
const std::string kGradVarSuffix = framework::kGradVarSuffix; const std::string kGradVarSuffix = framework::kGradVarSuffix;
auto &out_var_names = outs.at("Out"); auto &out_var_names = ctx->Output("Out");
for (auto &out_var_name : out_var_names) { for (auto &out_var_name : out_var_names) {
if (out_var_name == framework::kEmptyVarName || if (out_var_name == framework::kEmptyVarName ||
out_var_name.size() < kGradVarSuffix.size()) { out_var_name.size() < kGradVarSuffix.size()) {
...@@ -128,18 +128,17 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { ...@@ -128,18 +128,17 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
size_t len = out_var_name.size() - kGradVarSuffix.size(); size_t len = out_var_name.size() - kGradVarSuffix.size();
if (out_var_name.substr(len) == kGradVarSuffix) { if (out_var_name.substr(len) == kGradVarSuffix) {
auto fwd_var_name = out_var_name.substr(0, len); auto fwd_var_name = out_var_name.substr(0, len);
auto *out_var_desc = block->FindVarRecursive(out_var_name); PADDLE_ENFORCE(ctx->HasVar(out_var_name),
auto *fwd_var_desc = block->FindVarRecursive(fwd_var_name); "Backward variable %s not found", out_var_name);
PADDLE_ENFORCE_NOT_NULL(out_var_desc, "Backward variable %s not found", PADDLE_ENFORCE(ctx->HasVar(fwd_var_name),
out_var_name); "Backward variable %s not found", fwd_var_name);
PADDLE_ENFORCE_NOT_NULL(fwd_var_desc, "Forward variable %s not found",
fwd_var_name);
VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input(" VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input("
<< fwd_var_name << ")"; << fwd_var_name << ")";
out_var_desc->SetShape(fwd_var_desc->GetShape());
out_var_desc->SetDataType(fwd_var_desc->GetDataType()); ctx->SetShape(out_var_name, ctx->GetShape(fwd_var_name));
out_var_desc->SetLoDLevel(fwd_var_desc->GetLoDLevel()); ctx->SetDataType(out_var_name, ctx->GetDataType(fwd_var_name));
out_var_desc->SetType(fwd_var_desc->GetType()); ctx->SetLoDLevel(out_var_name, ctx->GetLoDLevel(fwd_var_name));
ctx->SetType(out_var_name, ctx->GetType(fwd_var_name));
} }
} }
} }
...@@ -309,5 +308,5 @@ class PyFuncOp : public framework::OperatorBase { ...@@ -309,5 +308,5 @@ class PyFuncOp : public framework::OperatorBase {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker, REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker,
ops::PyFuncOpVarTypInference, ops::PyFuncOpShapeInference, ops::PyFuncOpVarTypeInference, ops::PyFuncOpShapeInference,
ops::PyFuncOpGradDescMaker); ops::PyFuncOpGradDescMaker);
...@@ -85,10 +85,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase { ...@@ -85,10 +85,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
AddComment(R"DOC( AddComment(R"DOC(
CreateCustomReader Operator CreateCustomReader Operator
A custom reader can be used for input data preprocessing. A custom reader can be used for input data preprocessing.
A custom reader holds its own sub-block, which will be executed in CPU A custom reader holds its own sub-block, which will be executed in CPU
in its 'ReadNext()' function. Users can configurate their own in its 'ReadNext()' function. Users can configurate their own
preprocessing pipelines by inserting operators into custom reader's preprocessing pipelines by inserting operators into custom reader's
sub-block. sub-block.
)DOC"); )DOC");
} }
...@@ -123,23 +123,22 @@ class CustomReaderInferShape : public framework::InferShapeBase { ...@@ -123,23 +123,22 @@ class CustomReaderInferShape : public framework::InferShapeBase {
class CustomReaderInferVarType : public framework::VarTypeInference { class CustomReaderInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override { auto& out_var_name = ctx->Output("Out")[0];
framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]); PADDLE_ENFORCE(ctx->HasVar(out_var_name));
PADDLE_ENFORCE_NOT_NULL(out_reader); ctx->SetType(out_var_name, framework::proto::VarType::READER);
out_reader->SetType(framework::proto::VarType::READER);
auto sink_var_names = auto sink_var_names =
boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names")); boost::get<std::vector<std::string>>(ctx->GetAttr("sink_var_names"));
const auto* sub_block = const auto* sub_block =
boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block")); boost::get<framework::BlockDesc*>(ctx->GetAttr("sub_block"));
std::vector<framework::proto::VarType::Type> res_data_types; std::vector<framework::proto::VarType::Type> res_data_types;
for (const std::string& var_name : sink_var_names) { for (const std::string& var_name : sink_var_names) {
framework::VarDesc* var = sub_block->FindVar(var_name); framework::VarDesc* var = sub_block->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var);
res_data_types.emplace_back(var->GetDataType()); res_data_types.emplace_back(var->GetDataType());
} }
out_reader->SetDataTypes(res_data_types); ctx->SetDataTypes(out_var_name, res_data_types);
} }
}; };
......
...@@ -51,19 +51,16 @@ class ReadInferShape : public framework::InferShapeBase { ...@@ -51,19 +51,16 @@ class ReadInferShape : public framework::InferShapeBase {
class ReadInferVarType : public framework::VarTypeInference { class ReadInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override { bool infer_out = boost::get<bool>(ctx->GetAttr("infer_out"));
bool infer_out = boost::get<bool>(op_desc.GetAttr("infer_out"));
if (infer_out) { if (infer_out) {
std::string reader_name = op_desc.Input("Reader")[0]; std::string reader_name = ctx->Input("Reader")[0];
std::vector<std::string> out_names = op_desc.Output("Out"); std::vector<std::string> out_names = ctx->Output("Out");
framework::VarDesc* reader = block->FindVarRecursive(reader_name); auto dtypes = ctx->GetDataTypes(reader_name);
auto dtypes = reader->GetDataTypes();
PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
for (size_t i = 0; i < dtypes.size(); ++i) { for (size_t i = 0; i < dtypes.size(); ++i) {
framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); ctx->SetType(out_names[i], framework::proto::VarType::LOD_TENSOR);
out.SetType(framework::proto::VarType::LOD_TENSOR); ctx->SetDataType(out_names[i], dtypes[i]);
out.SetDataType(dtypes[i]);
} }
} }
} }
......
...@@ -98,11 +98,10 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { ...@@ -98,11 +98,10 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
} }
} }
void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, void FileReaderInferVarType::operator()(
framework::BlockDesc* block) const { framework::InferVarTypeContext* ctx) const {
std::string reader_name = op_desc.Output("Out")[0]; std::string reader_name = ctx->Output("Out")[0];
framework::VarDesc* reader = block->FindVarRecursive(reader_name); ctx->SetType(reader_name, framework::proto::VarType::READER);
reader->SetType(framework::proto::VarType::READER);
} }
void DecoratedReaderInferShape::operator()( void DecoratedReaderInferShape::operator()(
...@@ -125,13 +124,11 @@ void DecoratedReaderInferShape::operator()( ...@@ -125,13 +124,11 @@ void DecoratedReaderInferShape::operator()(
} }
void DecoratedReaderInferVarType::operator()( void DecoratedReaderInferVarType::operator()(
const framework::OpDesc& op_desc, framework::BlockDesc* block) const { framework::InferVarTypeContext* ctx) const {
std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; const std::string& in_reader_name = ctx->Input("UnderlyingReader")[0];
framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); const std::string& out_reader_name = ctx->Output("Out")[0];
std::string out_reader_name = op_desc.Output("Out")[0]; ctx->SetType(out_reader_name, framework::proto::VarType::READER);
framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); ctx->SetDataTypes(out_reader_name, ctx->GetDataTypes(in_reader_name));
out_reader->SetType(framework::proto::VarType::READER);
out_reader->SetDataTypes(in_reader->GetDataTypes());
} }
void DecoratedReaderMakerBase::Make() { void DecoratedReaderMakerBase::Make() {
......
...@@ -14,7 +14,9 @@ ...@@ -14,7 +14,9 @@
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
...@@ -59,8 +61,7 @@ class FileReaderInferShape : public framework::InferShapeBase { ...@@ -59,8 +61,7 @@ class FileReaderInferShape : public framework::InferShapeBase {
class FileReaderInferVarType : public framework::VarTypeInference { class FileReaderInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override;
framework::BlockDesc* block) const override;
}; };
// general infershape for decorated reader // general infershape for decorated reader
...@@ -72,8 +73,7 @@ class DecoratedReaderInferShape : public framework::InferShapeBase { ...@@ -72,8 +73,7 @@ class DecoratedReaderInferShape : public framework::InferShapeBase {
// general var type inference for decorated reader // general var type inference for decorated reader
class DecoratedReaderInferVarType : public framework::VarTypeInference { class DecoratedReaderInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override;
framework::BlockDesc* block) const override;
}; };
class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker { class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
......
...@@ -219,14 +219,6 @@ class ReshapeKernel { ...@@ -219,14 +219,6 @@ class ReshapeKernel {
std::vector<int>(shape_data, shape_data + shape_tensor->numel()); std::vector<int>(shape_data, shape_data + shape_tensor->numel());
out_dims = ReshapeOp::ValidateShape(shape, in->dims()); out_dims = ReshapeOp::ValidateShape(shape, in->dims());
} }
if (!in->lod().empty()) {
PADDLE_ENFORCE_EQ(
out_dims[0], in->dims()[0],
"Reshape operator cannot reshape an input sequence batch "
"into an output sequence batch that has a different "
"number of time steps. Please consider using "
"sequence_reshape op.");
}
out->mutable_data(ctx.GetPlace(), in->type()); out->mutable_data(ctx.GetPlace(), in->type());
framework::TensorCopy( framework::TensorCopy(
......
...@@ -159,12 +159,9 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file ...@@ -159,12 +159,9 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file
class SaveOpVarTypeInference : public framework::VarTypeInference { class SaveOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto out_var_name = ctx->Output(LOOKUP_TABLE_PATH).front();
auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front(); ctx->SetType(out_var_name, framework::proto::VarType::RAW);
auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
} }
}; };
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/scale_op.h" #include "paddle/fluid/operators/scale_op.h"
#include <memory>
#include <string> #include <string>
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
...@@ -69,17 +70,13 @@ $$Out = scale*(X + bias)$$ ...@@ -69,17 +70,13 @@ $$Out = scale*(X + bias)$$
class ScaleOpVarTypeInference : public framework::VarTypeInference { class ScaleOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto &in_var_name = ctx->Input("X").front();
auto &in_var_name = op_desc.Input("X").front(); auto out_var_name = ctx->Output("Out").front();
auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name));
auto out_var_name = op_desc.Output("Out").front();
auto *out_var = block->FindVarRecursive(out_var_name);
if (in_var_name != out_var_name) { if (in_var_name != out_var_name) {
out_var->SetType(in_var.GetType()); ctx->SetType(out_var_name, ctx->GetType(in_var_name));
out_var->SetDataType(in_var.GetDataType()); ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
} }
} }
}; };
......
...@@ -15,13 +15,12 @@ limitations under the License. */ ...@@ -15,13 +15,12 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static HOSTDEVICE float real_exp(float x) { return expf(x); }
static HOSTDEVICE float real_exp(double x) { return exp(x); }
template <typename T> template <typename T>
struct SeluFunctor { struct SeluFunctor {
SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr)
......
...@@ -30,13 +30,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { ...@@ -30,13 +30,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
"Output(X) of SequenceEnumerate operator should not be null."); "Output(X) of SequenceEnumerate operator should not be null.");
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
x_dims.size(), 2,
"Input(X) of SequenceEnumerate operator's rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[1], 1,
"Input(X) of SequenceEnumerate operator's 2nd "
"dimension should be 1.");
const auto win_size = ctx->Attrs().Get<int>("win_size"); const auto win_size = ctx->Attrs().Get<int>("win_size");
ctx->SetOutputDim("Out", {x_dims[0], win_size}); ctx->SetOutputDim("Out", {x_dims[0], win_size});
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
...@@ -59,7 +52,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -59,7 +52,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
}); });
AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.") AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
.SetDefault(0); .SetDefault(0);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "") AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
"Skip calling InferShape() function in the runtime.")
.SetDefault(true); .SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Sequence Enumerate Operator. Sequence Enumerate Operator.
......
...@@ -27,30 +27,47 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> { ...@@ -27,30 +27,47 @@ class SequenceEnumerateKernel : public framework::OpKernel<T> {
auto* in = context.Input<LoDTensor>("X"); auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out"); auto* out = context.Output<LoDTensor>("Out");
int win_size = context.Attr<int>("win_size"); int win_size = context.Attr<int>("win_size");
int pad_value = context.Attr<int>("pad_value"); auto pad_value = static_cast<T>(context.Attr<int>("pad_value"));
auto in_dims = in->dims(); auto in_dims = in->dims();
auto in_lod = in->lod(); auto lod0 = in->lod()[0];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
static_cast<uint64_t>(in_dims[0]), in_lod[0].back(), static_cast<uint64_t>(in_dims[0]), lod0.back(),
"The actual input data's size mismatched with LoD information."); "The actual input data's size mismatched with LoD information.");
PADDLE_ENFORCE_EQ(
in_dims.size(), 2UL,
"Input(X) of SequenceEnumerate operator's rank should be 2.");
PADDLE_ENFORCE_EQ(in_dims[1], 1,
"Input(X) of SequenceEnumerate operator's 2nd "
"dimension should be 1.");
// Generate enumerate sequence set // Generate enumerate sequence set
auto lod0 = in_lod[0];
auto in_data = in->data<T>(); auto in_data = in->data<T>();
out->Resize({in_dims[0], win_size}); out->Resize({in_dims[0], win_size});
out->set_lod(in->lod());
auto out_data = out->mutable_data<T>(context.GetPlace()); auto out_data = out->mutable_data<T>(context.GetPlace());
for (size_t i = 0; i < lod0.size() - 1; ++i) { for (size_t i = 0; i < lod0.size() - 1; ++i) {
for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) { int start = lod0[i];
for (int word_idx = 0; word_idx < win_size; ++word_idx) { int end = lod0[i + 1];
size_t word_pos = idx + word_idx; int copy_size = win_size < end - start + 1 ? win_size : end - start + 1;
out_data[win_size * idx + word_idx] = int mid = end + 1 - copy_size;
word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value; int pad_num = win_size - copy_size;
copy_size *= sizeof(T);
for (int idx = start; idx < mid; ++idx) {
std::memcpy(out_data, in_data + idx, copy_size);
out_data += win_size;
}
for (int idx = mid; idx < end; ++idx) {
copy_size -= sizeof(T);
pad_num++;
std::memcpy(out_data, in_data + idx, copy_size);
T* pdata = out_data + copy_size / sizeof(T);
for (int i = 0; i < pad_num; ++i) {
pdata[i] = pad_value;
} }
out_data += win_size;
} }
} }
out->set_lod(in->lod());
} }
}; };
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <cub/cub.cuh> // NOLINT #include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h" #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h"
namespace paddle { namespace paddle {
...@@ -21,9 +22,6 @@ namespace operators { ...@@ -21,9 +22,6 @@ namespace operators {
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
__device__ __forceinline__ float real_exp(float x) { return expf(x); }
__device__ __forceinline__ double real_exp(double x) { return exp(x); }
template <typename T, int BlockDim> template <typename T, int BlockDim>
using BlockReduce = cub::BlockReduce<T, BlockDim>; using BlockReduce = cub::BlockReduce<T, BlockDim>;
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "cub/cub.cuh" #include "cub/cub.cuh"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
...@@ -21,11 +22,6 @@ namespace operators { ...@@ -21,11 +22,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
static HOSTDEVICE float real_exp(float x) { return expf(x); }
static HOSTDEVICE float real_exp(double x) { return exp(x); }
static HOSTDEVICE float real_log(float x) { return logf(x); }
static HOSTDEVICE float real_log(double x) { return log(x); }
static constexpr int kNumCUDAThreads = 512; static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096; static constexpr int kNumMaxinumNumBlocks = 4096;
......
...@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <thrust/device_vector.h>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
// Scatters the `n` elements of `d_out` into `d_in`, one element per thread.
//
// Each thread takes its flat index into `d_out`, unravels it into D
// coordinates using `out_dims` (last axis varies fastest), shifts every
// coordinate by the matching entry of `offsets`, and ravels the shifted
// coordinates back into a flat index using `in_dims`. Elements of `d_in` that
// no thread maps to are not written by this kernel.
//
// `out_dims`, `in_dims` and `offsets` are each indexed in [0, D).
template <size_t D>
__global__ void Padding(const paddle::platform::float16* d_out,
                        const int* out_dims, const int* in_dims,
                        const int* offsets, int64_t n,
                        paddle::platform::float16* d_in) {
  const int64_t flat_out = threadIdx.x + blockDim.x * blockIdx.x;
  // Threads beyond the last element have nothing to copy.
  if (flat_out >= n) {
    return;
  }

  // Unravel the flat output index and shift it into the input's frame.
  int position[D] = {0};
  int64_t remainder = flat_out;
  for (int axis = D - 1; axis >= 0; --axis) {
    position[axis] = remainder % out_dims[axis];
    remainder /= out_dims[axis];
    position[axis] += offsets[axis];
  }

  // Ravel the shifted coordinates against the input's extents.
  int64_t flat_in = 0;
  for (int axis = 0; axis < D; ++axis) {
    flat_in = flat_in * in_dims[axis] + position[axis];
  }

  d_in[flat_in] = d_out[flat_out];
}
// float16 CUDA specialization of the slice gradient kernel.
//
// The gradient of the input is first filled with zeros; the gradient of the
// output is then copied into it at the position given by the `starts`
// attribute along each axis listed in `axes` (the Padding kernel does the
// copy). Only ranks 1 through 6 are supported.
template <>
class SliceGradKernel<paddle::platform::CUDADeviceContext,
                      paddle::platform::float16>
    : public framework::OpKernel<paddle::platform::float16> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
    d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());

    auto out_dims = d_out->dims();
    auto in_dims = d_in->dims();
    int rank = out_dims.size();
    // Padding<D> is only instantiated for D in [1, 6]. Fail loudly for any
    // other rank instead of silently returning an all-zero gradient.
    PADDLE_ENFORCE(rank >= 1 && rank <= 6,
                   "The rank of the float16 slice gradient should be between "
                   "1 and 6, but received %d.",
                   rank);

    // Per-axis start offset of the slice inside the input; axes that were
    // not sliced keep offset 0.
    std::vector<int> offsets(rank, 0);
    auto axes = ctx.Attr<std::vector<int>>("axes");
    auto starts = ctx.Attr<std::vector<int>>("starts");

    for (size_t i = 0; i < starts.size(); ++i) {
      // A negative start counts from the end of the axis.
      if (starts[i] < 0) {
        starts[i] += in_dims[axes[i]];
      }
      offsets[axes[i]] = std::max(starts[i], 0);
    }

    math::SetConstant<paddle::platform::CUDADeviceContext,
                      paddle::platform::float16>
        set_zero;
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::CUDADeviceContext>();
    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));

    // One thread per element of the output gradient.
    int64_t numel = d_out->numel();
    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
    dim3 threads(PADDLE_CUDA_NUM_THREADS);
    auto stream = ctx.cuda_device_context().stream();

    // The kernel reads the shapes and offsets from device memory.
    // NOTE(review): these device vectors are destroyed when Compute returns,
    // while the kernel is launched asynchronously on `stream`; this presumably
    // relies on the deallocation synchronizing with the device - confirm.
    auto out_shape = framework::vectorize2int(out_dims);
    thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
    auto in_shape = framework::vectorize2int(in_dims);
    thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
    thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
    const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
    const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
    const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());

    const paddle::platform::float16* d_out_data =
        d_out->data<paddle::platform::float16>();
    paddle::platform::float16* d_in_data =
        d_in->data<paddle::platform::float16>();

    switch (rank) {
      case 1:
        Padding<1><<<blocks, threads, 0, stream>>>(
            d_out_data, out_dims_ptr, in_dims_ptr, offsets_ptr, numel,
            d_in_data);
        break;
      case 2:
        Padding<2><<<blocks, threads, 0, stream>>>(
            d_out_data, out_dims_ptr, in_dims_ptr, offsets_ptr, numel,
            d_in_data);
        break;
      case 3:
        Padding<3><<<blocks, threads, 0, stream>>>(
            d_out_data, out_dims_ptr, in_dims_ptr, offsets_ptr, numel,
            d_in_data);
        break;
      case 4:
        Padding<4><<<blocks, threads, 0, stream>>>(
            d_out_data, out_dims_ptr, in_dims_ptr, offsets_ptr, numel,
            d_in_data);
        break;
      case 5:
        Padding<5><<<blocks, threads, 0, stream>>>(
            d_out_data, out_dims_ptr, in_dims_ptr, offsets_ptr, numel,
            d_in_data);
        break;
      case 6:
        Padding<6><<<blocks, threads, 0, stream>>>(
            d_out_data, out_dims_ptr, in_dims_ptr, offsets_ptr, numel,
            d_in_data);
        break;
      default:
        // Unreachable: the rank was validated above.
        break;
    }
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>, slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, double>, ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int>, ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
slice_grad, slice_grad,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>, ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
...@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>(); context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
Tensor* logit_grad = Tensor* logit_grad =
context.Output<Tensor>(framework::GradVarName("Logits")); context.Output<Tensor>(framework::GradVarName("Logits"));
logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax")); framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
context.device_context(), logit_grad);
T* logit_grad_data = logit_grad->data<T>(); T* logit_grad_data = logit_grad->data<T>();
const int batch_size = logit_grad->dims()[0]; const int batch_size = logit_grad->dims()[0];
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/split_selected_rows_op.h" #include "paddle/fluid/operators/split_selected_rows_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -60,10 +62,9 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel { ...@@ -60,10 +62,9 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference { class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { for (auto &out_var : ctx->Output("Out")) {
for (auto &out_var : op_desc.Output("Out")) { ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
} }
} }
}; };
......
...@@ -94,6 +94,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase { ...@@ -94,6 +94,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
} }
}; };
// TODO(paddle-dev): Should use OpKernel.
class SqueezeOp : public framework::OperatorBase { class SqueezeOp : public framework::OperatorBase {
public: public:
using OperatorBase::OperatorBase; using OperatorBase::OperatorBase;
......
...@@ -12,6 +12,7 @@ limitations under the License. */ ...@@ -12,6 +12,7 @@ limitations under the License. */
#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/operators/sum_op.h"
#include <algorithm> #include <algorithm>
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -159,24 +160,20 @@ the LoD information with the first input. ...@@ -159,24 +160,20 @@ the LoD information with the first input.
class SumOpVarTypeInference : public framework::VarTypeInference { class SumOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(framework::InferVarTypeContext* ctx) const override {
framework::BlockDesc* block) const override { auto& inputs = ctx->Input("X");
auto& inputs = op_desc.Input("X");
auto var_type = framework::proto::VarType::SELECTED_ROWS; auto var_type = framework::proto::VarType::SELECTED_ROWS;
for (auto& name : op_desc.Input("X")) { for (auto& name : ctx->Input("X")) {
VLOG(10) << name << " " VLOG(10) << name << " " << ctx->GetType(name);
<< block->FindRecursiveOrCreateVar(name).GetType();
} }
bool any_input_is_lod_tensor = std::any_of( bool any_input_is_lod_tensor = std::any_of(
inputs.begin(), inputs.end(), [block](const std::string& name) { inputs.begin(), inputs.end(), [ctx](const std::string& name) {
return block->FindRecursiveOrCreateVar(name).GetType() == return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR;
framework::proto::VarType::LOD_TENSOR;
}); });
auto is_tensor_array = [block](const std::string& name) { auto is_tensor_array = [ctx](const std::string& name) {
return block->FindRecursiveOrCreateVar(name).GetType() == return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY;
framework::proto::VarType::LOD_TENSOR_ARRAY;
}; };
bool any_input_is_tensor_array = bool any_input_is_tensor_array =
...@@ -188,8 +185,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -188,8 +185,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
if (!all_inputs_are_tensor_array) { if (!all_inputs_are_tensor_array) {
std::ostringstream os; std::ostringstream os;
for (auto& each : inputs) { for (auto& each : inputs) {
os << " " << each << " type is " os << " " << each << " type is " << ctx->GetType(each) << "\n";
<< block->FindRecursiveOrCreateVar(each).GetType() << "\n";
} }
PADDLE_ENFORCE(all_inputs_are_tensor_array, PADDLE_ENFORCE(all_inputs_are_tensor_array,
"Not all inputs are tensor array:\n%s", os.str()); "Not all inputs are tensor array:\n%s", os.str());
...@@ -199,11 +195,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -199,11 +195,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
var_type = framework::proto::VarType::LOD_TENSOR; var_type = framework::proto::VarType::LOD_TENSOR;
} }
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = ctx->Output("Out").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); ctx->SetType(out_var_name, var_type);
out_var.SetType(var_type); ctx->SetDataType(out_var_name, ctx->GetDataType(inputs.front()));
auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
out_var.SetDataType(in_var.GetDataType());
} }
}; };
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
namespace ops = paddle::operators;
// Register the forward operator `sync_batch_norm`. It reuses the BatchNorm
// operator, proto maker, var-type inference and grad maker classes (the only
// header this file includes is batch_norm_op.h); no sync-specific operator
// class is defined in this file.
REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
// Register the backward operator `sync_batch_norm_grad`, likewise reusing the
// BatchNorm gradient operator class.
REGISTER_OPERATOR(sync_batch_norm_grad, ops::BatchNormGradOp);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/nccl_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
// Computes per-channel first and second moments of `x` on the local device.
//
// Layout of `mean_var` (2 * C + 1 elements):
//   [0, C)    sum(x)     / (N * M) for each channel
//   [C, 2C)   sum(x * x) / (N * M) for each channel
//   [2C]      the constant 1
//
// Each block processes channels blockIdx.x, blockIdx.x + gridDim.x, ...; the
// threads of the block split the N * M elements of a channel and combine their
// partial sums with cub::BlockReduce. The per-thread loop strides by the
// BlockDim template argument, so the launch block size is expected to match.
template <typename T, int BlockDim, framework::DataLayout layout>
__global__ void KeLocalStats(const T *x, int N, int M, int C, T *mean_var) {
  using Reducer = cub::BlockReduce<T, BlockDim>;
  __shared__ typename Reducer::TempStorage reduce_buf;

  const int per_channel = N * M;
  for (int ch = blockIdx.x; ch < C; ch += gridDim.x) {
    T partial_sum = 0;
    T partial_sq_sum = 0;
    for (int e = threadIdx.x; e < per_channel; e += BlockDim) {
      // Flat offset of element `e` of channel `ch` for the given layout.
      int offset;
      if (layout == framework::DataLayout::kNCHW) {
        offset = (e / M) * C * M + ch * M + e % M;
      } else {
        offset = e * C + ch;
      }
      const T value = x[offset];
      partial_sum += value;
      partial_sq_sum += value * value;
    }

    __syncthreads();
    T total = Reducer(reduce_buf).Reduce(partial_sum, cub::Sum());
    // The same shared scratch buffer is reused by the next reduction.
    __syncthreads();
    if (threadIdx.x == 0) {
      mean_var[ch] = total / per_channel;
    }

    total = Reducer(reduce_buf).Reduce(partial_sq_sum, cub::Sum());
    __syncthreads();
    if (threadIdx.x == 0) {
      mean_var[ch + C] = total / per_channel;
    }
  }

  // A single thread of the whole grid writes the trailing constant.
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    mean_var[2 * C] = static_cast<T>(1.0);
  }
}
// Turns summed per-channel moments into statistics and updates moving
// averages.
//
// For every channel i in [0, C):
//   mean = means[i] / *num_dev
//   var  = variances[i] / *num_dev - mean * mean
// and then
//   sv_mean_data[i]     = mean
//   sv_inv_var_data[i]  = 1 / sqrt(var + epsilon)
//   variances[i]        = var            (overwritten in place)
//   moving_means[i]     = moving_means[i] * momentum + mean * (1 - momentum)
//   moving_variances[i] = moving_variances[i] * momentum + var * (1 - momentum)
//
// Channels are distributed over all threads of the grid.
template <typename T>
__global__ void KeSyncAndMovingStats(T *means, T *variances, T *num_dev,
                                     const int C, const T momentum,
                                     const double epsilon, T *sv_mean_data,
                                     T *sv_inv_var_data, T *moving_means,
                                     T *moving_variances) {
  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int step = blockDim.x * gridDim.x;

  for (int ch = first; ch < C; ch += step) {
    // Normalize the summed moments by the divisor stored at *num_dev.
    T ch_mean = means[ch] / (*num_dev);
    T ch_var = variances[ch] / (*num_dev);
    ch_var = ch_var - ch_mean * ch_mean;

    // Statistics saved for this step.
    sv_mean_data[ch] = ch_mean;
    sv_inv_var_data[ch] = 1.0 / sqrt(ch_var + epsilon);
    variances[ch] = ch_var;

    // Blend into the moving statistics.
    moving_means[ch] = moving_means[ch] * momentum + ch_mean * (1. - momentum);
    moving_variances[ch] =
        moving_variances[ch] * momentum + ch_var * (1. - momentum);
  }
}
// Applies normalization followed by a per-channel affine transform:
//   y[i] = (x[i] - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]
// where c is the channel of flat element i: (i / M) % C for kNCHW, i % C
// otherwise. The `num` elements are distributed over all threads of the grid.
template <typename T, framework::DataLayout layout>
static __global__ void KeNormAffine(const T *x, const T *scale, const T *bias,
                                    const T *mean, const T *variance,
                                    const double epsilon, const int C,
                                    const int M, const int num, T *y) {
  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int step = blockDim.x * gridDim.x;

  for (int idx = first; idx < num; idx += step) {
    int ch;
    if (layout == framework::DataLayout::kNCHW) {
      ch = (idx / M) % C;
    } else {
      ch = idx % C;
    }
    y[idx] = (x[idx] - mean[ch]) / sqrt(variance[ch] + epsilon) * scale[ch] +
             bias[ch];
  }
}
// Forward kernel of sync_batch_norm.
//
// In test mode the op normalizes X with the provided Mean/Variance inputs.
// Otherwise it computes per-channel sums of x and x^2 on this device,
// all-reduces them over the NCCL communicator of the device context (when one
// is available), derives the mean/variance, updates the moving statistics and
// the saved statistics, and finally normalizes X.
template <typename DeviceContext, typename T>
class SyncBatchNormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
    const std::string layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout layout = framework::StringToDataLayout(layout_str);
    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
    // The two literals are concatenated into one message. Passing them as
    // separate arguments would treat the second one as a format argument of
    // a format string without any specifier, so it would never be printed.
    PADDLE_ENFORCE(
        !use_global_stats,
        "sync_batch_norm doesn't support to set use_global_stats True. "
        "Please use batch_norm in this case.");

    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "The Input dim size should be between 2 and 5");
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
    int x_numel = x->numel();

    const T *x_d = x->data<T>();
    const T *s_d = ctx.Input<Tensor>("Scale")->data<T>();
    const T *b_d = ctx.Input<Tensor>("Bias")->data<T>();

    auto *y = ctx.Output<Tensor>("Y");
    T *y_d = y->mutable_data<T>(ctx.GetPlace());

    // Statistics used by the final normalization; set by one of the two
    // branches below.
    const T *mean_data = nullptr;
    const T *var_data = nullptr;

    auto &dev_ctx = ctx.cuda_device_context();
    auto stream = dev_ctx.stream();
    auto *comm = dev_ctx.nccl_comm();
    const int block = 512;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();

    // Declared here so the temporary buffer outlives the normalization
    // kernel launch below, which reads from it in training mode.
    paddle::memory::AllocationPtr alloc_ptr{nullptr};

    if (is_test) {
      const auto *est_mean = ctx.Input<Tensor>("Mean");
      const auto *est_var = ctx.Input<Tensor>("Variance");
      mean_data = est_mean->data<T>();
      var_data = est_var->data<T>();
    } else {
      auto &allocator =
          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
      // x, x^2, 1, here 1 is used to calc device num
      // device num also can be got from platform::DeviceContextPool
      const int bytes = (C * 2 + 1) * sizeof(T);
      alloc_ptr = allocator.Allocate(bytes);

      T *stats = reinterpret_cast<T *>(alloc_ptr->ptr());
      const int threads = 256;
      int grid = std::min(C, (max_threads + threads - 1) / threads);
      if (layout == framework::DataLayout::kNCHW) {
        KeLocalStats<
            T, threads,
            framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
            x_d, N, H * W * D, C, stats);
      } else {
        KeLocalStats<
            T, threads,
            framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
            x_d, N, H * W * D, C, stats);
      }

      // NOTE(review): the host copy below is never read afterwards; it looks
      // like leftover debugging code. It is kept because the copy may act as
      // an implicit synchronization point - confirm before removing.
      Tensor c_g_st;
      T *c_g_st_d = c_g_st.mutable_data<T>({2 * C + 1}, platform::CPUPlace());
      auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
      memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0);

      // Without a communicator there is nothing to synchronize with: the
      // local sums are used as they are and the trailing counter stays 1, so
      // the statistics below reduce to those of this device alone.
      if (comm) {
        int dtype = platform::ToNCCLDataType(x->type());
        // In-place operation
        PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
            stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype),
            ncclSum, comm, stream));
      }

      // moving mean/variance
      auto *mean_out = ctx.Output<Tensor>("MeanOut");
      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
      T *est_mean_data = mean_out->mutable_data<T>(ctx.GetPlace());
      T *est_var_data = variance_out->mutable_data<T>(ctx.GetPlace());

      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
      auto *saved_inv_variance = ctx.Output<Tensor>("SavedVariance");
      T *sv_mean_data = saved_mean->mutable_data<T>(ctx.GetPlace());
      T *sv_inv_var_data = saved_inv_variance->mutable_data<T>(ctx.GetPlace());

      // Note, Input('Mean')/Input('Variance') share variable with
      // Output('MeanOut')/Output('VarianceOut')
      KeSyncAndMovingStats<T><<<(C + block - 1) / block, block, 0, stream>>>(
          stats, stats + C, stats + 2 * C, C, momentum, epsilon, sv_mean_data,
          sv_inv_var_data, est_mean_data, est_var_data);

      // KeSyncAndMovingStats rewrites stats[C, 2C) with the variance, which
      // is what the normalization kernel expects.
      mean_data = sv_mean_data;
      var_data = stats + C;
    }

    int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;
    if (layout == framework::DataLayout::kNCHW) {
      KeNormAffine<T,
                   framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
          x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel,
          y_d);
    } else {
      KeNormAffine<T,
                   framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
          x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel,
          y_d);
    }
  }
};
// Computes per-channel gradient sums on the local device.
//
// Layout of `sum_dy_prod` (2 * C + 1 elements):
//   [0, C)    sum(dy) for each channel
//   [C, 2C)   sum(dy * (x - means[channel])) for each channel
//   [2C]      the constant 1
//
// Each block processes channels blockIdx.x, blockIdx.x + gridDim.x, ...; the
// threads of the block split the N * M elements of a channel and combine their
// partial sums with a cub::BlockReduce over double.
template <typename T, const int BlockDim, framework::DataLayout layout>
__global__ void KeBackwardLocalStats(const T *dy, const T *x, const T *means,
                                     int N, int M, int C, T *sum_dy_prod) {
  using Reducer = cub::BlockReduce<double, BlockDim>;
  __shared__ typename Reducer::TempStorage reduce_buf;

  for (int ch = blockIdx.x; ch < C; ch += gridDim.x) {
    T grad_sum = 0;
    T grad_centered_sum = 0;
    T ch_mean = means[ch];
    for (int e = threadIdx.x; e < N * M; e += blockDim.x) {
      // Flat offset of element `e` of channel `ch` for the given layout.
      int offset;
      if (layout == framework::DataLayout::kNCHW) {
        offset = (e / M) * C * M + ch * M + e % M;
      } else {
        offset = e * C + ch;
      }
      T grad = dy[offset];
      grad_sum += grad;
      grad_centered_sum += grad * (x[offset] - ch_mean);
    }

    __syncthreads();
    T total = Reducer(reduce_buf).Reduce(grad_sum, cub::Sum());
    // The same shared scratch buffer is reused by the next reduction.
    __syncthreads();
    if (threadIdx.x == 0) {
      sum_dy_prod[ch] = total;
    }

    total = Reducer(reduce_buf).Reduce(grad_centered_sum, cub::Sum());
    __syncthreads();
    if (threadIdx.x == 0) {
      sum_dy_prod[ch + C] = total;
    }
  }

  // A single thread of the whole grid writes the trailing constant.
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    sum_dy_prod[2 * C] = static_cast<T>(1.0);
  }
}
// Computes the per-channel gradients of the scale and bias parameters:
//   dscale[c] = inv_variance[c] * sum(dy * (x - mean[c]))
//   dbias[c]  = sum(dy)
// with the sums taken over the N * HxW elements of channel c.
//
// Each block processes channels blockIdx.x, blockIdx.x + gridDim.x, ...; the
// per-thread partial sums are combined with a cub::BlockReduce over double.
// The `epsilon` parameter is not used by this kernel.
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ void KeBNBackwardScaleBias(const T *dy, const T *x,
                                             const T *mean,
                                             const T *inv_variance,
                                             const double epsilon, const int N,
                                             const int C, const int HxW,
                                             T *dscale, T *dbias) {
  using Reducer = cub::BlockReduce<double, BlockDim>;
  __shared__ typename Reducer::TempStorage reduce_buf;

  const int num_channels = C;
  const int per_channel = N * HxW;

  for (int ch = blockIdx.x; ch < num_channels; ch += gridDim.x) {
    T scale_acc = static_cast<T>(0);
    T bias_acc = static_cast<T>(0);
    T ch_inv_var = inv_variance[ch];
    T ch_mean = mean[ch];

    for (int e = threadIdx.x; e < per_channel; e += blockDim.x) {
      // Flat offset of element `e` of channel `ch` for the given layout.
      int offset;
      if (layout == framework::DataLayout::kNCHW) {
        offset = ((e / HxW) * C + ch) * HxW + (e % HxW);
      } else {
        offset = e * num_channels + ch;
      }
      scale_acc += dy[offset] * (x[offset] - ch_mean);
      bias_acc += dy[offset];
    }

    __syncthreads();
    double scale_total = Reducer(reduce_buf)
                             .Reduce(static_cast<double>(scale_acc), cub::Sum());
    // The same shared scratch buffer is reused by the next reduction.
    __syncthreads();
    double bias_total = Reducer(reduce_buf)
                            .Reduce(static_cast<double>(bias_acc), cub::Sum());
    __syncthreads();

    if (threadIdx.x == 0) {
      dscale[ch] = static_cast<T>(scale_total * ch_inv_var);
      dbias[ch] = static_cast<T>(bias_total);
    }
    __syncthreads();
  }
}
// Computes the gradient with respect to the input, element by element:
//   dx[i] = dy[i] * beta[c] * inv_variance[c]
//           + gmean * (C / num)
//           + gvar * (C / num) * (x[i] - mean[c])
// where c is the channel of flat element i (i / HxW % C for kNCHW, i % C
// otherwise) and
//   gmean = -(g_sum_dy[c] / num_dev[0]) * beta[c] * inv_variance[c]
//   gvar  = -(g_sum_dy_prod[c] / num_dev[0]) * beta[c] * inv_variance[c]^3
//
// The `num` elements are distributed over all threads of the grid. The
// `epsilon` parameter is not used by this kernel.
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy, const T *x, const T *beta,
                                        const T *mean, const T *inv_variance,
                                        const T *g_sum_dy,
                                        const T *g_sum_dy_prod,
                                        const T *num_dev, const double epsilon,
                                        const int C, const int HxW,
                                        const int num, T *dx) {
  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int step = blockDim.x * gridDim.x;
  // C / num: reciprocal of the number of elements per channel.
  T per_channel_inv = static_cast<T>(C) / num;
  T device_count = num_dev[0];

  for (int idx = first; idx < num; idx += step) {
    int ch;
    if (layout == framework::DataLayout::kNCHW) {
      ch = idx / HxW % C;
    } else {
      ch = idx % C;
    }
    T ch_inv_var = inv_variance[ch];
    T ch_scale = beta[ch];
    T grad_var = -1.0 * (g_sum_dy_prod[ch] / device_count) * ch_scale *
                 ch_inv_var * (ch_inv_var * ch_inv_var);
    T grad_mean = -1.0 * (g_sum_dy[ch] / device_count) * ch_scale * ch_inv_var;
    dx[idx] = dy[idx] * ch_scale * ch_inv_var + grad_mean * per_channel_inv +
              grad_var * per_channel_inv * (x[idx] - mean[ch]);
  }
}
// Deriving the Gradient for the Backward Pass of Batch Normalization
// https://kevinzakka.github.io/2016/09/14/batch_normalization/
// Backward CUDA kernel of the sync_batch_norm operator.
//
// Compute() enqueues three stages on the device context's stream:
//   1. KeBackwardLocalStats writes this device's statistics into a temporary
//      buffer `stats` holding 2 * C + 1 elements.
//   2. ncclAllReduce sums `stats` in place over the context's NCCL
//      communicator (skipped when the context has none).
//   3. KeBNBackwardScaleBias fills dScale/dBias (only when both outputs
//      exist) and KeBNBackwardData fills dX (only when it exists).
template <typename DeviceContext, typename T>
class SyncBatchNormGradKernel : public framework::OpKernel<T> {
 public:
  // Reads inputs X, Y@GRAD, Scale, SavedMean and SavedVariance and the
  // attributes "epsilon" and "data_layout"; writes X@GRAD, Scale@GRAD and
  // Bias@GRAD. Enforces a CUDA place, an input rank between 2 and 5, and a
  // 1-D Scale of length C.
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace.");
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const std::string layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout layout = framework::StringToDataLayout(layout_str);

    const auto *x = ctx.Input<Tensor>("X");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");

    const auto &x_dims = x->dims();
    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "The Input dim size should be between 2 and 5");
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);

    // init output
    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
    // d_x is treated as optional by the launches below, so it must not be
    // dereferenced unconditionally here either.
    if (d_x) {
      d_x->mutable_data<T>(ctx.GetPlace());
    }
    if (d_scale && d_bias) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      d_bias->mutable_data<T>(ctx.GetPlace());
    }
    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
    PADDLE_ENFORCE_EQ(scale->dims()[0], C);

    const T *x_d = x->data<T>();
    const T *dy_d = d_y->data<T>();

    auto &dev_ctx = ctx.cuda_device_context();
    auto stream = dev_ctx.stream();
    auto *comm = dev_ctx.nccl_comm();

    const T *saved_mean = ctx.Input<Tensor>("SavedMean")->data<T>();
    const T *saved_inv_var = ctx.Input<Tensor>("SavedVariance")->data<T>();

    // Scratch buffer of 2 * C + 1 elements. The launches below hand it to
    // KeBNBackwardData as g_sum_dy (offset 0), g_sum_dy_prod (offset C) and
    // num_dev (offset 2 * C).
    auto &allocator =
        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
    const size_t bytes = (C * 2 + 1) * sizeof(T);
    auto alloc_ptr = allocator.Allocate(bytes);
    T *stats = reinterpret_cast<T *>(alloc_ptr->ptr());

    const int threads = 256;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    int grid = std::min(C, (max_threads + threads - 1) / threads);
    int x_numel = x->numel();
    int fsize = H * W * D;

    if (layout == framework::DataLayout::kNCHW) {
      KeBackwardLocalStats<
          T, threads,
          framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
          dy_d, x_d, saved_mean, N, fsize, C, stats);
    } else {
      KeBackwardLocalStats<
          T, threads,
          framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
          dy_d, x_d, saved_mean, N, fsize, C, stats);
    }

    // The communicator is nullptr until set_nccl_comm() has been called on
    // the device context; issuing a collective on it would fail.
    // NOTE(review): without a communicator `stats` keeps this device's local
    // values, which is presumed to be the correct single-device result --
    // confirm that KeBackwardLocalStats fills the num_dev slot accordingly.
    if (comm) {
      int dtype = platform::ToNCCLDataType(x->type());
      // In-place operation
      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
          stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
          comm, stream));
    }

    const int block = 512;
    int grid2 = (std::min(x_numel, max_threads) + block - 1) / block;

    if (layout == framework::DataLayout::kNCHW) {
      if (d_scale && d_bias) {
        KeBNBackwardScaleBias<
            T, threads,
            framework::DataLayout::kNCHW><<<grid, threads, 0, stream>>>(
            dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
            d_scale->data<T>(), d_bias->data<T>());
      }
      if (d_x) {
        KeBNBackwardData<
            T, framework::DataLayout::kNCHW><<<grid2, block, 0, stream>>>(
            dy_d, x_d, scale->data<T>(), saved_mean, saved_inv_var, stats,
            stats + C, stats + 2 * C, epsilon, C, fsize, x_numel,
            d_x->data<T>());
      }
    } else {
      if (d_scale && d_bias) {
        KeBNBackwardScaleBias<
            T, threads,
            framework::DataLayout::kNHWC><<<grid, threads, 0, stream>>>(
            dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize,
            d_scale->data<T>(), d_bias->data<T>());
      }
      if (d_x) {
        KeBNBackwardData<
            T, framework::DataLayout::kNHWC><<<grid2, block, 0, stream>>>(
            dy_d, x_d, scale->data<T>(), saved_mean, saved_inv_var, stats,
            stats + C, stats + 2 * C, epsilon, C, fsize, x_numel,
            d_x->data<T>());
      }
    }
  }
};
} // namespace operators
} // namespace paddle
// Short aliases used by the kernel registrations below.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
// Forward op: register the float and double CUDA kernels of sync_batch_norm.
REGISTER_OP_CUDA_KERNEL(
sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>);
// Backward op: register the float and double CUDA kernels of
// sync_batch_norm_grad. No other element type is registered here.
REGISTER_OP_CUDA_KERNEL(
sync_batch_norm_grad,
ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>);
...@@ -177,10 +177,9 @@ class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase { ...@@ -177,10 +177,9 @@ class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
class LoDTensorArray2TensorGradInferVarType class LoDTensorArray2TensorGradInferVarType
: public framework::VarTypeInference { : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { for (auto &out_var : ctx->Output(framework::GradVarName("X"))) {
for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) { ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
} }
} }
}; };
......
...@@ -46,8 +46,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -46,8 +46,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
class TensorRTEngineInferVarType : public framework::VarTypeInference { class TensorRTEngineInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {}
framework::BlockDesc *block) const override {}
}; };
} // namespace operators } // namespace operators
......
...@@ -112,17 +112,16 @@ uniform distribution. The random result is in set [min, max]. ...@@ -112,17 +112,16 @@ uniform distribution. The random result is in set [min, max].
class UniformRandomOpVarTypeInference : public framework::VarTypeInference { class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(framework::InferVarTypeContext *ctx) const override {
framework::BlockDesc *block) const override { auto out_var_name = ctx->Output("Out").front();
auto out_var_name = op_desc.Output("Out").front();
auto var_data_type = static_cast<framework::proto::VarType::Type>( auto var_data_type = static_cast<framework::proto::VarType::Type>(
boost::get<int>(op_desc.GetAttr("dtype"))); boost::get<int>(ctx->GetAttr("dtype")));
auto out_var = block->FindRecursiveOrCreateVar(out_var_name); if (ctx->GetType(out_var_name) !=
if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) { framework::proto::VarType::SELECTED_ROWS) {
out_var.SetType(framework::proto::VarType::LOD_TENSOR); ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
} }
out_var.SetDataType(var_data_type); ctx->SetDataType(out_var_name, var_data_type);
} }
}; };
......
...@@ -57,7 +57,6 @@ DeviceContextPool::DeviceContextPool( ...@@ -57,7 +57,6 @@ DeviceContextPool::DeviceContextPool(
for (auto& p : places) { for (auto& p : places) {
set.insert(p); set.insert(p);
} }
for (auto& p : set) { for (auto& p : set) {
if (platform::is_cpu_place(p)) { if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
...@@ -317,6 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() { ...@@ -317,6 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() {
eigen_stream_.reset(); eigen_stream_.reset();
eigen_device_.reset(); eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_));
#if !defined(_WIN32)
PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
#endif
} }
Place CUDADeviceContext::GetPlace() const { return place_; } Place CUDADeviceContext::GetPlace() const { return place_; }
......
...@@ -265,6 +265,14 @@ class CUDADeviceContext : public DeviceContext { ...@@ -265,6 +265,14 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return cuda stream in the device context. */ /*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const; cudaStream_t stream() const;
#if !defined(_WIN32)
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
/*! \brief Set nccl communicators. */
void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; }
#endif
template <typename Callback> template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) { void RecordEvent(cudaEvent_t ev, Callback callback) {
callback(); callback();
...@@ -289,6 +297,15 @@ class CUDADeviceContext : public DeviceContext { ...@@ -289,6 +297,15 @@ class CUDADeviceContext : public DeviceContext {
std::unique_ptr<CublasHandleHolder> cublas_handle_; std::unique_ptr<CublasHandleHolder> cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_; std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
#if !defined(_WIN32)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provides fast collectives over multiple GPUs
// both within and across nodes.
// But, this collectives is used for collectives over multiple GPUs within
// nodes.
ncclComm_t nccl_comm_{nullptr};
#endif
int compute_capability_; int compute_capability_;
int runtime_version_; int runtime_version_;
int driver_version_; int driver_version_;
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <string.h> // for strdup #include <string.h> // for strdup
#include <algorithm> #include <algorithm>
#include <memory>
#include <set>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
...@@ -140,6 +142,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -140,6 +142,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
platform::DeviceTemporaryAllocator::Init(); platform::DeviceTemporaryAllocator::Init();
#ifndef PADDLE_WITH_MKLDNN #ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(FLAGS_paddle_num_threads); platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif #endif
......
...@@ -16,9 +16,11 @@ ...@@ -16,9 +16,11 @@
#pragma once #pragma once
#include <stdio.h> #include <stdio.h>
#include <memory>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <typeindex> #include <typeindex>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
...@@ -78,6 +80,8 @@ struct NCCLContext { ...@@ -78,6 +80,8 @@ struct NCCLContext {
cudaStream_t stream() const { return ctx_->stream(); } cudaStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; }
int device_id() const { int device_id() const {
return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device; return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
} }
......
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor) tracer analysis_predictor imperative_profiler)
if(WITH_PYTHON) if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op) list(APPEND PYBIND_DEPS py_func_op)
......
...@@ -38,20 +38,22 @@ void BindTracer(pybind11::module* m) { ...@@ -38,20 +38,22 @@ void BindTracer(pybind11::module* m) {
.def("trace", .def("trace",
[](imperative::Tracer& self, imperative::OpBase* op, [](imperative::Tracer& self, imperative::OpBase* op,
const imperative::VarBasePtrMap& inputs, const imperative::VarBasePtrMap& inputs,
const imperative::VarBasePtrMap& outputs, imperative::VarBasePtrMap* outputs,
framework::AttributeMap attrs_map, framework::AttributeMap attrs_map,
const platform::CPUPlace expected_place, const platform::CPUPlace expected_place,
const bool stop_gradient = false) { const bool stop_gradient = false) {
pybind11::gil_scoped_release release;
return self.Trace(op, inputs, outputs, attrs_map, expected_place, return self.Trace(op, inputs, outputs, attrs_map, expected_place,
stop_gradient); stop_gradient);
}) })
.def("trace", .def("trace",
[](imperative::Tracer& self, imperative::OpBase* op, [](imperative::Tracer& self, imperative::OpBase* op,
const imperative::VarBasePtrMap& inputs, const imperative::VarBasePtrMap& inputs,
const imperative::VarBasePtrMap& outputs, imperative::VarBasePtrMap* outputs,
framework::AttributeMap attrs_map, framework::AttributeMap attrs_map,
const platform::CUDAPlace expected_place, const platform::CUDAPlace expected_place,
const bool stop_gradient = false) { const bool stop_gradient = false) {
pybind11::gil_scoped_release release;
return self.Trace(op, inputs, outputs, attrs_map, expected_place, return self.Trace(op, inputs, outputs, attrs_map, expected_place,
stop_gradient); stop_gradient);
}) })
......
...@@ -242,6 +242,10 @@ void BindAnalysisConfig(py::module *m) { ...@@ -242,6 +242,10 @@ void BindAnalysisConfig(py::module *m) {
.def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp)
.def("set_model_buffer", &AnalysisConfig::SetModelBuffer) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer)
.def("model_from_memory", &AnalysisConfig::model_from_memory) .def("model_from_memory", &AnalysisConfig::model_from_memory)
.def("runtime_context_cache_enabled",
&AnalysisConfig::runtime_context_cache_enabled)
.def("switch_runtime_context_cache",
&AnalysisConfig::SwitchRuntimeContextCache, py::arg("x") = true)
.def("pass_builder", &AnalysisConfig::pass_builder, .def("pass_builder", &AnalysisConfig::pass_builder,
py::return_value_policy::reference); py::return_value_policy::reference);
} }
......
...@@ -36,6 +36,7 @@ limitations under the License. */ ...@@ -36,6 +36,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h" #include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
...@@ -156,6 +157,11 @@ PYBIND11_MODULE(core, m) { ...@@ -156,6 +157,11 @@ PYBIND11_MODULE(core, m) {
m.def("print_mem_usage", m.def("print_mem_usage",
[]() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
m.def("start_imperative_gperf_profiler",
[]() { imperative::StartProfile(); });
m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); });
py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC") py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
.def( .def(
py::init<const std::string &, paddle::framework::proto::VarType::Type, py::init<const std::string &, paddle::framework::proto::VarType::Type,
...@@ -194,7 +200,7 @@ PYBIND11_MODULE(core, m) { ...@@ -194,7 +200,7 @@ PYBIND11_MODULE(core, m) {
.def_property("name", &imperative::VarBase::Name, .def_property("name", &imperative::VarBase::Name,
&imperative::VarBase::SetName) &imperative::VarBase::SetName)
.def_property_readonly("shape", &imperative::VarBase::Shape) .def_property_readonly("shape", &imperative::VarBase::Shape)
.def_property_readonly("dtype", &imperative::VarBase::DType) .def_property_readonly("dtype", &imperative::VarBase::DataType)
.def_property("persistable", &imperative::VarBase::IsPersistable, .def_property("persistable", &imperative::VarBase::IsPersistable,
&imperative::VarBase::SetPersistable) &imperative::VarBase::SetPersistable)
.def_property("stop_gradient", &imperative::VarBase::IsStopGradient, .def_property("stop_gradient", &imperative::VarBase::IsStopGradient,
...@@ -1231,6 +1237,21 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1231,6 +1237,21 @@ All parameter, weight, gradient are variables in Paddle.
it will save GPU memory and may make the execution faster. it will save GPU memory and may make the execution faster.
This options is only available in GPU devices. This options is only available in GPU devices.
Default False)DOC") Default False)DOC")
.def_property(
"sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; },
[](BuildStrategy &self, bool b) {
  // The flag may only be changed while the strategy is not finalized.
  PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized.");
  self.sync_batch_norm_ = b;
},
R"DOC(The type is BOOL, sync_batch_norm indicates whether to use
synchronous batch normalization which synchronizes the mean
and variance through multi-devices in training phase.
Current implementation doesn't support FP16 training and CPU.
And only synchronous on one machine, not all machines.
Default False)DOC")
.def_property( .def_property(
"memory_optimize", "memory_optimize",
[](const BuildStrategy &self) { return self.memory_optimize_; }, [](const BuildStrategy &self) { return self.memory_optimize_; },
......
...@@ -129,11 +129,12 @@ def __bootstrap__(): ...@@ -129,11 +129,12 @@ def __bootstrap__():
'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', 'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
'allocator_strategy', 'enable_buffered_allocator', 'tolerant_times', 'allocator_strategy', 'enable_buffered_allocator',
'reader_queue_speed_test_mode', 'print_sub_graph_dir', 'buffered_allocator_excess_times', 'reader_queue_speed_test_mode',
'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'enable_parallel_graph', 'multiple_of_cupti_buffer_size', 'inner_op_parallelism', 'enable_parallel_graph',
'enable_subgraph_optimize' 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
'tracer_profile_fname'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
......
...@@ -223,6 +223,9 @@ class CompiledProgram(object): ...@@ -223,6 +223,9 @@ class CompiledProgram(object):
tps), "num_trainers == len(end_points)" tps), "num_trainers == len(end_points)"
self._build_strategy.trainers_endpoints = tps self._build_strategy.trainers_endpoints = tps
if self._build_strategy.sync_batch_norm:
self._build_strategy.enable_sequential_execution = True
self._persistable_vars = [] self._persistable_vars = []
for node in self._graph.nodes(): for node in self._graph.nodes():
if node.is_var() and node.var() is not None and node.var().persistable() and \ if node.is_var() and node.var() is not None and node.var().persistable() and \
......
...@@ -84,7 +84,8 @@ class QuantizeTranspiler(object): ...@@ -84,7 +84,8 @@ class QuantizeTranspiler(object):
activation_bits=8, activation_bits=8,
activation_quantize_type='abs_max', activation_quantize_type='abs_max',
weight_quantize_type='abs_max', weight_quantize_type='abs_max',
window_size=10000): window_size=10000,
moving_rate=0.9):
""" """
Convert and rewrite the fluid Program according to weight and Convert and rewrite the fluid Program according to weight and
activation quantization type. activation quantization type.
...@@ -117,23 +118,27 @@ class QuantizeTranspiler(object): ...@@ -117,23 +118,27 @@ class QuantizeTranspiler(object):
""" """
self.weight_bits = weight_bits self.weight_bits = weight_bits
self.activation_bits = activation_bits self.activation_bits = activation_bits
quant_type = ['abs_max', 'range_abs_max'] quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max']
if weight_quantize_type not in quant_type: if weight_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be ", "Unknown weight_quantize_type: '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(weight_quantize_type))
if activation_quantize_type not in quant_type: if activation_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be ", "Unknown activation_quantize_type : '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(activation_quantize_type))
self.weight_quantize_type = weight_quantize_type self.weight_quantize_type = weight_quantize_type
self.activation_quantize_type = activation_quantize_type self.activation_quantize_type = activation_quantize_type
self.window_size = window_size self.window_size = window_size
self.moving_rate = moving_rate
self.helper = LayerHelper(self.__class__.__name__) self.helper = LayerHelper(self.__class__.__name__)
self.fake_quant_op_types = [ self.fake_quant_op_types = [
'fake_quantize_abs_max', 'fake_quantize_range_abs_max' 'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
'fake_quantize_moving_average_abs_max'
] ]
self.fake_dequant_op_types = ['fake_dequantize_max_abs'] self.fake_dequant_op_types = ['fake_dequantize_max_abs']
self.is_test = None self.is_test = None
...@@ -168,6 +173,7 @@ class QuantizeTranspiler(object): ...@@ -168,6 +173,7 @@ class QuantizeTranspiler(object):
block_id = block.idx block_id = block.idx
# insert quant op and dequant op # insert quant op and dequant op
for name in op.input_arg_names: for name in op.input_arg_names:
#if share input between ops
if name in dequanted_vars[block_id]: if name in dequanted_vars[block_id]:
dequant_var = dequanted_vars[block_id][name] dequant_var = dequanted_vars[block_id][name]
else: else:
...@@ -261,6 +267,7 @@ class QuantizeTranspiler(object): ...@@ -261,6 +267,7 @@ class QuantizeTranspiler(object):
max_range = None max_range = None
scale_var = None scale_var = None
for name in op.input_arg_names: for name in op.input_arg_names:
#rename input name of the op to the input name of last op which has be removed
if name in op_in_rename_map[block_id]: if name in op_in_rename_map[block_id]:
op._rename_input(name, op_in_rename_map[block_id][name]) op._rename_input(name, op_in_rename_map[block_id][name])
...@@ -272,8 +279,7 @@ class QuantizeTranspiler(object): ...@@ -272,8 +279,7 @@ class QuantizeTranspiler(object):
max_range = param_range * act_range / scale_v max_range = param_range * act_range / scale_v
else: else:
assert isinstance(scale_v, Variable) assert isinstance(scale_v, Variable)
scale_var = var_scale_map[block_id][_original_var_name( scale_var = scale_v
name)]
if len(op.output_arg_names) != 1: if len(op.output_arg_names) != 1:
raise ValueError("Only support one output, but op %s has" raise ValueError("Only support one output, but op %s has"
...@@ -309,7 +315,7 @@ class QuantizeTranspiler(object): ...@@ -309,7 +315,7 @@ class QuantizeTranspiler(object):
op_type = op.type op_type = op.type
# insert dequant_op after fc/conv, need to rename # insert dequant_op after fc/conv, need to rename
# input of the followed ops # input of the followed ops(of fc/conv) to the dquant_op
for name in op.input_arg_names: for name in op.input_arg_names:
if name in op_out_rename_map[block_id]: if name in op_out_rename_map[block_id]:
op._rename_input(name, op._rename_input(name,
...@@ -389,8 +395,8 @@ class QuantizeTranspiler(object): ...@@ -389,8 +395,8 @@ class QuantizeTranspiler(object):
for op in block.ops: for op in block.ops:
args += op.input_arg_names args += op.input_arg_names
args += op.output_arg_names args += op.output_arg_names
args = list(set(args)) args = list(set(args)) #vals of all left ops
var_names = block.vars.keys() var_names = block.vars.keys() # all vals
sub_block_remove_vars = [] sub_block_remove_vars = []
for var in var_names: for var in var_names:
if var not in args: if var not in args:
...@@ -471,6 +477,61 @@ class QuantizeTranspiler(object): ...@@ -471,6 +477,61 @@ class QuantizeTranspiler(object):
return quant_var, scale return quant_var, scale
def _insert_quant_moving_average_abs_max_op(self, block, idx, var,
                                            quant_bits):
    """Insert a fake_quantize_moving_average_abs_max op into ``block``.

    Args:
        block: block that receives the quantized output variable and the
            new op.
        idx: op index passed to ``block._insert_op``.
        var: variable to quantize; its name, type, shape and dtype are
            reused for the variables created here.
        quant_bits: value of the op's ``bit_length`` attribute.

    Returns:
        tuple: ``(quant_var, scale)`` -- the quantized output variable and
        the non-trainable scale parameter, which the op both reads
        (``InScale``) and writes (``OutScale``).
    """
    quant_var = block.create_var(
        name=_quantized_var_name(var.name),
        type=var.type,
        shape=var.shape,
        dtype=var.dtype)

    def _create_stat(prefix):
        # Persistable one-element global variable initialized to 1.
        stat = self.helper.create_global_variable(
            name=unique_name.generate(prefix),
            persistable=True,
            dtype=var.dtype,
            shape=[1])
        self.helper.set_variable_initializer(
            stat, initializer=Constant(value=1))
        return stat

    # The state/accum variables are only wired to the op when training, so
    # they are only created then; a test-mode program no longer carries
    # unused persistable variables.
    training = not self.is_test
    if training:
        state = _create_stat('state')
        accum = _create_stat('accum')

    scale = self.helper.create_parameter(
        attr=ParamAttr(
            name=_quantized_scale_name(var.name),
            initializer=Constant(0.001),
            trainable=False),
        shape=[1],
        dtype=var.dtype)
    scale.stop_gradient = True

    ins = {'X': var, 'InScale': scale}
    outs = {'Out': quant_var, 'OutScale': scale}
    if training:
        # The same variables serve as inputs and outputs of the op.
        ins['InState'] = state
        ins['InAccum'] = accum
        outs['OutState'] = state
        outs['OutAccum'] = accum

    attrs = {
        'bit_length': quant_bits,
        'moving_rate': self.moving_rate,
        'is_test': self.is_test
    }

    block._insert_op(
        idx,
        type='fake_quantize_moving_average_abs_max',
        attrs=attrs,
        inputs=ins,
        outputs=outs)

    return quant_var, scale
def _insert_quant_op(self, block, idx, var, quant_bits, quant_type): def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
""" """
Insert fake_quantize_op Insert fake_quantize_op
...@@ -480,6 +541,9 @@ class QuantizeTranspiler(object): ...@@ -480,6 +541,9 @@ class QuantizeTranspiler(object):
elif quant_type == 'range_abs_max': elif quant_type == 'range_abs_max':
return self._insert_quant_range_abs_max_op(block, idx, var, return self._insert_quant_range_abs_max_op(block, idx, var,
quant_bits) quant_bits)
elif quant_type == 'moving_average_abs_max':
return self._insert_quant_moving_average_abs_max_op(block, idx, var,
quant_bits)
def _insert_dequant_op(self, block, idx, var, scale, quant_bits): def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
""" """
......
...@@ -38,7 +38,8 @@ class QuantizationTransformPass(object): ...@@ -38,7 +38,8 @@ class QuantizationTransformPass(object):
activation_bits=8, activation_bits=8,
activation_quantize_type='abs_max', activation_quantize_type='abs_max',
weight_quantize_type='abs_max', weight_quantize_type='abs_max',
window_size=10000): window_size=10000,
moving_rate=0.9):
""" """
Convert and rewrite the IrGraph according to weight and Convert and rewrite the IrGraph according to weight and
activation quantization type. activation quantization type.
...@@ -83,19 +84,22 @@ class QuantizationTransformPass(object): ...@@ -83,19 +84,22 @@ class QuantizationTransformPass(object):
self._weight_bits = weight_bits self._weight_bits = weight_bits
self._activation_bits = activation_bits self._activation_bits = activation_bits
quant_type = ['abs_max', 'range_abs_max'] quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max']
if activation_quantize_type not in quant_type: if activation_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be ", "Unknown activation_quantize_type : '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(activation_quantize_type))
if weight_quantize_type not in quant_type: if weight_quantize_type not in quant_type:
raise ValueError( raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be ", "Unknown weight_quantize_type: '%s'. It can only be ",
"'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(weight_quantize_type))
self._activation_quantize_type = activation_quantize_type self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type self._weight_quantize_type = weight_quantize_type
self._window_size = window_size self._window_size = window_size
self._moving_rate = moving_rate
self._need_initialized = collections.OrderedDict() self._need_initialized = collections.OrderedDict()
self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
...@@ -222,6 +226,9 @@ class QuantizationTransformPass(object): ...@@ -222,6 +226,9 @@ class QuantizationTransformPass(object):
elif quant_type == 'range_abs_max': elif quant_type == 'range_abs_max':
return self._insert_quant_range_abs_max_op(graph, var_node, return self._insert_quant_range_abs_max_op(graph, var_node,
quant_bits) quant_bits)
elif quant_type == 'moving_average_abs_max':
return self._insert_quant_moving_average_abs_max_op(graph, var_node,
quant_bits)
def _insert_quant_abs_max_op(self, graph, var_node, quant_bits): def _insert_quant_abs_max_op(self, graph, var_node, quant_bits):
""" """
...@@ -309,6 +316,74 @@ class QuantizationTransformPass(object): ...@@ -309,6 +316,74 @@ class QuantizationTransformPass(object):
return quant_var_node, scale_out_node return quant_var_node, scale_out_node
def _insert_quant_moving_average_abs_max_op(self, graph, var_node,
                                            quant_bits):
    """Add a fake_quantize_moving_average_abs_max op node to ``graph``.

    Args:
        graph: graph that receives the new variable and op nodes.
        var_node: variable node to quantize.
        quant_bits: value of the op's ``bit_length`` attribute.

    Returns:
        tuple: ``(quant_var_node, scale_out_node)`` -- the quantized output
        node and the output scale node.
    """
    dtype = var_node.dtype()
    lod_tensor = core.VarDesc.VarType.LOD_TENSOR

    quantized = graph.create_var_node(
        name=self._quantized_var_name(var_node.name()),
        var_type=var_node.type(),
        shape=var_node.shape(),
        var_dtype=dtype)
    scale_in = graph.create_persistable_node(
        name=self._quantized_scale_name(var_node.name()),
        var_type=lod_tensor,
        shape=[1],
        var_dtype=dtype)
    # Scale starts at 0.001; the initializer is recorded for later use.
    self._need_initialized[scale_in.var()] = Constant(value=0.001)
    scale_out = graph.create_var_node_from_desc(scale_in.var())

    inputs = {'X': var_node, 'InScale': scale_in}
    outputs = {'Out': quantized, 'OutScale': scale_out}

    training = not self._is_test
    stat_inputs = []
    stat_outputs = []
    if training:
        # Persistable state/accum nodes, each initialized to 1, exist only
        # when training. Inputs are created first, then the output nodes
        # that share their descs.
        for prefix in ('state', 'accum'):
            stat_node = graph.create_persistable_node(
                name=unique_name.generate(prefix),
                var_type=lod_tensor,
                var_dtype=dtype,
                shape=[1])
            self._need_initialized[stat_node.var()] = Constant(value=1)
            stat_inputs.append(stat_node)
        for stat_node in stat_inputs:
            stat_outputs.append(
                graph.create_var_node_from_desc(stat_node.var()))

        inputs['InState'], inputs['InAccum'] = stat_inputs
        outputs['OutState'], outputs['OutAccum'] = stat_outputs

    op_node = graph.create_op_node(
        op_type='fake_quantize_moving_average_abs_max',
        attrs={
            'bit_length': quant_bits,
            'moving_rate': self._moving_rate,
            'is_test': self._is_test,
            'op_role': core.op_proto_and_checker_maker.OpRole.Forward
        },
        inputs=inputs,
        outputs=outputs)

    # Wire the op into the graph: inputs -> op -> outputs.
    graph.link_to(var_node, op_node)
    graph.link_to(scale_in, op_node)
    graph.link_to(op_node, quantized)
    graph.link_to(op_node, scale_out)

    if training:
        for stat_node in stat_inputs:
            graph.link_to(stat_node, op_node)
        for stat_node in stat_outputs:
            graph.link_to(op_node, stat_node)

    return quantized, scale_out
def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits): def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
""" """
Insert fake_dequantize_op in the graph. Insert fake_dequantize_op in the graph.
...@@ -389,7 +464,8 @@ class QuantizationFreezePass(object): ...@@ -389,7 +464,8 @@ class QuantizationFreezePass(object):
self._weight_quantize_type = weight_quantize_type self._weight_quantize_type = weight_quantize_type
self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
self._fake_quant_op_names = [ self._fake_quant_op_names = [
'fake_quantize_abs_max', 'fake_quantize_range_abs_max' 'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
'fake_quantize_moving_average_abs_max'
] ]
self._fake_dequant_op_names = ['fake_dequantize_max_abs'] self._fake_dequant_op_names = ['fake_dequantize_max_abs']
self._op_input_rename_map = collections.OrderedDict() self._op_input_rename_map = collections.OrderedDict()
......
...@@ -164,6 +164,9 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -164,6 +164,9 @@ class TestQuantizationTransformPass(unittest.TestCase):
def test_linear_fc_quant_range_abs_max(self): def test_linear_fc_quant_range_abs_max(self):
self.linear_fc_quant('range_abs_max', for_ci=True) self.linear_fc_quant('range_abs_max', for_ci=True)
def test_linear_fc_quant_moving_average_abs_max(self):
    """Run the linear FC quantization case with the 'moving_average_abs_max' type in CI mode."""
    quant_type = 'moving_average_abs_max'
    self.linear_fc_quant(quant_type, for_ci=True)
def residual_block_quant(self, quant_type, for_ci=False): def residual_block_quant(self, quant_type, for_ci=False):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
...@@ -201,6 +204,9 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -201,6 +204,9 @@ class TestQuantizationTransformPass(unittest.TestCase):
def test_residual_block_range_abs_max(self): def test_residual_block_range_abs_max(self):
self.residual_block_quant('range_abs_max', for_ci=True) self.residual_block_quant('range_abs_max', for_ci=True)
def test_residual_block_moving_average_abs_max(self):
    """Run the residual block quantization case with the 'moving_average_abs_max' type in CI mode."""
    quant_type = 'moving_average_abs_max'
    self.residual_block_quant(quant_type, for_ci=True)
class TestQuantizationFreezePass(unittest.TestCase): class TestQuantizationFreezePass(unittest.TestCase):
def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False): def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False):
...@@ -380,11 +386,18 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -380,11 +386,18 @@ class TestQuantizationFreezePass(unittest.TestCase):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph( self.freeze_graph(
True, seed=1, quant_type='range_abs_max', for_ci=True) True, seed=1, quant_type='range_abs_max', for_ci=True)
self.freeze_graph(
True,
seed=1,
quant_type='moving_average_abs_max',
for_ci=True)
def test_freeze_graph_cpu_static(self): def test_freeze_graph_cpu_static(self):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph( self.freeze_graph(
False, seed=2, quant_type='range_abs_max', for_ci=True) False, seed=2, quant_type='range_abs_max', for_ci=True)
self.freeze_graph(
False, seed=2, quant_type='moving_average_abs_max', for_ci=True)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -18,6 +18,7 @@ import os ...@@ -18,6 +18,7 @@ import os
import time import time
import logging import logging
import paddle
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid import io from paddle.fluid import io
from paddle.fluid import Program from paddle.fluid import Program
...@@ -84,8 +85,9 @@ def convert_dist_to_sparse_program(program): ...@@ -84,8 +85,9 @@ def convert_dist_to_sparse_program(program):
when we train model with distributed lookup table but want to do the local inference, we can use when we train model with distributed lookup table but want to do the local inference, we can use
this function to convert the train program with distributed lookup table to sparse lookup table. this function to convert the train program with distributed lookup table to sparse lookup table.
:param program(Program): the program must be the trainer program, which will be get by the distribute transpiler. Args:
:return: program(Program): the program must be the trainer program, which will be get by the distribute transpiler.
Returns:
program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table. program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table.
""" """
if not program._distributed_lookup_table: if not program._distributed_lookup_table:
...@@ -128,68 +130,92 @@ def convert_dist_to_sparse_program(program): ...@@ -128,68 +130,92 @@ def convert_dist_to_sparse_program(program):
return program return program
def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
    """
    Load the checkpoint-eligible variables of `program` from `dirname`.

    Variables named in `lookup_table_vars` are left out of the load; the
    selection itself is delegated to the predicate built below and passed to
    `io.load_vars`.
    """

    def _is_checkpoint_var(exclude_fluid_vars=None):
        """
        Build the predicate deciding whether a variable belongs to a checkpoint.

        A variable is rejected when its type is FEED_MINIBATCH/FETCH_LIST/RAW,
        when its name contains "@GRAD", ".trainer_", ".block" or "tmp_", or
        when its name is listed in `exclude_fluid_vars`. Any other variable is
        accepted only if it is persistable.
        """
        if exclude_fluid_vars is None:
            exclude_fluid_vars = []

        def is_valid(var):
            var_type = var.desc.type()
            skipped_types = (core.VarDesc.VarType.FEED_MINIBATCH,
                             core.VarDesc.VarType.FETCH_LIST,
                             core.VarDesc.VarType.RAW)
            if any(var_type == skipped for skipped in skipped_types):
                return False

            # Name fragments that mark variables a checkpoint never keeps:
            #   "@GRAD"     - gradient variables
            #   ".trainer_" - distributed-training variables
            #   ".block"    - distributed-training variables
            #   "tmp_"      - (no reason is given in the original code)
            skipped_fragments = ("@GRAD", ".trainer_", ".block", "tmp_")
            if any(fragment in var.name for fragment in skipped_fragments):
                return False

            if var.name in exclude_fluid_vars:
                return False

            return var.persistable

        return is_valid

    predicate = _is_checkpoint_var(lookup_table_vars)
    io.load_vars(
        executor,
        dirname=dirname,
        main_program=program,
        predicate=predicate,
        filename=None)
def load_persistables_for_increment(dirname, executor, program, def load_persistables_for_increment(dirname, executor, program,
lookup_table_var, lookup_table_var_path): lookup_table_var, lookup_table_var_path):
""" """
WARNING: this function will only be used for distributed training with distributed lookup table. WARNING: this function will only be used for distributed training with distributed lookup table.
for increment trainning, the pserver will not only load dense variables, for increment trainning, the pserver will not only load dense variables,
but also load the suitable lookup table var. Because of slice lookup table but also load the suitable lookup table var. Because of sliced lookup table
var with HASH, we must load the correct slice var. var with HASH, we must load the correct sliced var.
Args:
dirname(str): The directory path
executor(Executor): The executor to run for loading inference model.
program(Program): The parameter server program, which will run on Pserver.
lookup_table_var: the distributed lookup tables var name.
lookup_table_var_path: the distributed lookup tables var location.
Returns:
None
"""
def _load_persistable_vars(executor, dirname, need_load_vars):
load_prog = Program()
load_block = load_prog.global_block()
need_delete_vars = []
for param in need_load_vars:
origin_var = param.origin
slice_var = param.slice
is_slice = param.is_slice
offset = param.offset
if is_slice:
origin = load_block.create_var(
name="{}.load".format(origin_var.name),
type=origin_var.type,
shape=origin_var.shape,
dtype=origin_var.dtype,
persistable=True)
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [origin]},
attrs={
'file_path': os.path.join(dirname, origin_var.name)
})
slice = load_block.create_var(
name=slice_var.name,
type=slice_var.type,
shape=slice_var.shape,
dtype=slice_var.dtype,
persistable=True)
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
start = int(offset / dim1_flatten)
end = int(offset / dim1_flatten + slice.shape[0])
load_block.append_op(
type="slice",
inputs={'Input': origin},
outputs={'Out': slice},
attrs={'axes': [0],
'starts': [start],
'ends': [end]})
need_delete_vars.append(origin)
else:
origin = load_block.create_var(
name="{}".format(origin_var.name),
type=origin_var.type,
shape=origin_var.shape,
dtype=origin_var.dtype,
persistable=True)
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [origin]},
attrs={
'file_path': os.path.join(dirname, origin_var.name)
})
:param dirname(str): The directory path load_block.append_op(
:param executor(Executor): The executor to run for loading inference model. type='delete_var',
:param program(Program): The parameter server program, which will run on Pserver. inputs={'X': need_delete_vars}, )
:param lookup_table_var: the distributed lookup tables var name.
:param lookup_table_var_path: the the distributed lookup tables var location. executor.run(load_prog)
:return: None
"""
def __load_lookup_table_vars(executor, main_program, lookup_table_var, def __load_lookup_table_vars(executor, main_program, lookup_table_var,
lookup_table_var_path): lookup_table_var_path):
...@@ -217,7 +243,9 @@ def load_persistables_for_increment(dirname, executor, program, ...@@ -217,7 +243,9 @@ def load_persistables_for_increment(dirname, executor, program,
"Distributed Lookup Table Vars from {}, time = {}".format( "Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime())) dirname, time.ctime()))
_load_persistable_vars(executor, dirname, program, [lookup_table_var]) need_load_vars = program._parameters_on_pservers.get_distributed_vars_by_ep(
program._ps_endpoint)
_load_persistable_vars(executor, dirname, need_load_vars)
__load_lookup_table_vars(executor, program, lookup_table_var, __load_lookup_table_vars(executor, program, lookup_table_var,
lookup_table_var_path) lookup_table_var_path)
...@@ -233,15 +261,62 @@ def load_persistables_for_inference(dirname, executor, program, ...@@ -233,15 +261,62 @@ def load_persistables_for_inference(dirname, executor, program,
Inference with distributed lookup table is a little funky, this function will load distributed Inference with distributed lookup table is a little funky, this function will load distributed
lookup table vars into sparse var, can be used in local inference mode. lookup table vars into sparse var, can be used in local inference mode.
:param dirname(str): The directory path Args:
:param executor(Executor): The executor to run for loading inference model. dirname(str): The directory path
:param program(Program): The parameter server program, which will run on Pserver. executor(Executor): The executor to run for loading inference model.
:param lookup_table_var_name: the distributed lookup tables var name. program(Program): The parameter server program, which will run on Pserver.
:return: None lookup_table_var_name: the distributed lookup tables var name.
Returns:
None
""" """
def __load_lookup_table_vars(executor, dirname, main_program, def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
lookup_table_vars): def _is_checkpoint_var(exclude_fluid_vars=None):
"""
the checkpoint will not save or load all the variables.
var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
: param var(Variable)
"""
if exclude_fluid_vars is None:
exclude_fluid_vars = []
def is_valid(var):
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.RAW:
return False
# @GRAD are named for gradient variables, checkpoint will not save it.
if "@GRAD" in var.name:
return False
# .trainer_ are named for distribute train variables, checkpoint will not save it.
if ".trainer_" in var.name:
return False
# .block is named for distribute train variables, checkpoint will not save it.
if ".block" in var.name:
return False
if "tmp_" in var.name:
return False
if var.name in exclude_fluid_vars:
return False
return var.persistable
return is_valid
io.load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var(lookup_table_vars),
filename=None)
def _load_lookup_table_vars(executor, dirname, main_program,
lookup_table_vars):
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname) raise ValueError("There is no directory named '%s'", dirname)
...@@ -313,11 +388,96 @@ def load_persistables_for_inference(dirname, executor, program, ...@@ -313,11 +388,96 @@ def load_persistables_for_inference(dirname, executor, program,
dirname, time.ctime())) dirname, time.ctime()))
_load_persistable_vars(executor, dirname, program, [lookup_table_var_name]) _load_persistable_vars(executor, dirname, program, [lookup_table_var_name])
__load_lookup_table_vars(executor, dirname, program, _load_lookup_table_vars(executor, dirname, program, [lookup_table_var_name])
[lookup_table_var_name])
_logger.info("Finish Load Sparse Program With " _logger.info("Finish Load Sparse Program With "
"Distributed Lookup Table Vars from {}, time = {}".format( "Distributed Lookup Table Vars from {}, time = {}".format(
dirname, time.ctime())) dirname, time.ctime()))
return program return program
def get_inference_model(main_program, feeded_var_names, target_vars):
    """
    Prune the given `main_program` to build a new program especially for
    inference with distributed lookup table, and then add feed ops for
    `feeded_var_names` and fetch ops for `target_vars` to this program.

    All changes are applied to a clone of `main_program`.

    Args:
        main_program(Program|None): The original program, which will be pruned
                                    to build the inference model. If it is
                                    None, the default main program is used.
        feeded_var_names(list[str]): Names of variables that need to be fed
                                     data during inference.
        target_vars(list[Variable]): Variables from which we can get inference
                                     results.

    Returns:
        Program: the pruned program with feed and fetch ops added.

    Note:
        The types of `feeded_var_names` and `target_vars` are not validated
        here; both are simply iterated.
    """

    def prepend_feed_ops(inference_program,
                         feed_target_names,
                         feed_holder_name='feed'):
        # Nothing to feed: leave the program without a feed holder variable.
        if len(feed_target_names) == 0:
            return

        global_block = inference_program.global_block()
        feed_var = global_block.create_var(
            name=feed_holder_name,
            type=core.VarDesc.VarType.FEED_MINIBATCH,
            persistable=True)

        # One feed op per target; 'col' is the position of the name in the list.
        for i, name in enumerate(feed_target_names):
            out = global_block.var(name)
            global_block._prepend_op(
                type='feed',
                inputs={'X': [feed_var]},
                outputs={'Out': [out]},
                attrs={'col': i})

    def append_fetch_ops(inference_program,
                         fetch_target_names,
                         fetch_holder_name='fetch'):
        global_block = inference_program.global_block()
        fetch_var = global_block.create_var(
            name=fetch_holder_name,
            type=core.VarDesc.VarType.FETCH_LIST,
            persistable=True)

        # One fetch op per target; 'col' is the position of the name in the list.
        for i, name in enumerate(fetch_target_names):
            global_block.append_op(
                type='fetch',
                inputs={'X': [name]},
                outputs={'Out': [fetch_var]},
                attrs={'col': i})

    if main_program is None:
        # Imported here so the fallback does not depend on the file's
        # top-level import list.
        from paddle.fluid.framework import default_main_program
        main_program = default_main_program()

    main_program = main_program.clone()
    global_block = main_program.global_block()

    # Clear the target flag on every op and collect existing feed/fetch ops,
    # which are re-created below for the requested feeds and targets.
    need_to_remove_op_index = []
    for i, op in enumerate(global_block.ops):
        op.desc.set_is_target(False)
        if op.type == "feed" or op.type == "fetch":
            need_to_remove_op_index.append(i)

    # Remove from the back so the remaining indices stay valid.
    for index in need_to_remove_op_index[::-1]:
        global_block._remove_op(index)

    main_program.desc.flush()

    main_program = main_program._prune(targets=target_vars)
    main_program = main_program._inference_optimize(prune_read_op=True)

    fetch_var_names = [v.name for v in target_vars]

    prepend_feed_ops(main_program, feeded_var_names)
    append_fetch_ops(main_program, fetch_var_names)

    return main_program
...@@ -268,8 +268,8 @@ class DataFeeder(object): ...@@ -268,8 +268,8 @@ class DataFeeder(object):
Args: Args:
reader(function): the reader is the function which can generate data. reader(function): the reader is the function which can generate data.
multi_devices(bool): whether to use multiple devices or not. multi_devices(bool): whether to use multiple devices or not.
num_places(int): if the multi_devices is True, you can specify the number num_places(int): if multi_devices is True, you can specify the number
of GPU to use, if 'num_places' is None, the function will use all the of GPU to use, if multi_devices is None, the function will use all the
GPU of the current machine. Default None. GPU of the current machine. Default None.
drop_last(bool): whether to drop the last batch if the drop_last(bool): whether to drop the last batch if the
size of the last batch is less than batch_size. Default True. size of the last batch is less than batch_size. Default True.
...@@ -278,7 +278,7 @@ class DataFeeder(object): ...@@ -278,7 +278,7 @@ class DataFeeder(object):
dict: the result of conversion. dict: the result of conversion.
Raises: Raises:
ValueError: If drop_last is False and the data batch which cannot fit for devices. ValueError: If drop_last is False and the data batch cannot fit for devices.
""" """
def __reader_creator__(): def __reader_creator__():
......
...@@ -470,13 +470,21 @@ class Executor(object): ...@@ -470,13 +470,21 @@ class Executor(object):
program(Program|CompiledProgram): the program that need to run, program(Program|CompiledProgram): the program that need to run,
if not provided, then default_main_program (not compiled) will be used. if not provided, then default_main_program (not compiled) will be used.
feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData} feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData}
fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list. fetch_list(list): a list of variable or variable names that user
feed_var_name(str): the name for the input variable of feed Operator. wants to get, this method will return them according to this list.
fetch_var_name(str): the name for the output variable of fetch Operator. feed_var_name(str): the name for the input variable of
scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope feed Operator.
fetch_var_name(str): the name for the output variable of
fetch Operator.
scope(Scope): the scope used to run this program, you can switch
it to different scope. default is global_scope
return_numpy(bool): if convert the fetched tensor to numpy return_numpy(bool): if convert the fetched tensor to numpy
use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step. use_program_cache(bool): whether to use the cached program
settings across batches. Setting it to true would be faster
only when (1) the program is not compiled with data parallel,
and (2) program, feed variable names and fetch_list variable
names have not changed compared to the last step.
Returns: Returns:
list(numpy.array): fetch result according to fetch_list. list(numpy.array): fetch result according to fetch_list.
......
...@@ -430,6 +430,11 @@ class Variable(object): ...@@ -430,6 +430,11 @@ class Variable(object):
Returns: Returns:
str: The debug string. str: The debug string.
""" """
if _in_imperative_mode():
# TODO(panyx0718): add more imperative debug info.
return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype,
self.shape)
assert isinstance(throw_on_error, bool) and isinstance(with_details, assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool) bool)
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
......
...@@ -26,8 +26,12 @@ from .nn import * ...@@ -26,8 +26,12 @@ from .nn import *
from . import tracer from . import tracer
from .tracer import * from .tracer import *
from . import profiler
from .profiler import *
__all__ = [] __all__ = []
__all__ += layers.__all__ __all__ += layers.__all__
__all__ += base.__all__ __all__ += base.__all__
__all__ += nn.__all__ __all__ += nn.__all__
__all__ += tracer.__all__ __all__ += tracer.__all__
__all__ += profiler.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from .. import core
__all__ = [
'start_gperf_profiler',
'stop_gperf_profiler',
]
def start_gperf_profiler():
    """Start the imperative-mode gperf profiler by delegating to the core binding."""
    core.start_imperative_gperf_profiler()
def stop_gperf_profiler():
    """Stop the imperative-mode gperf profiler by delegating to the core binding."""
    core.stop_imperative_gperf_profiler()
...@@ -49,6 +49,7 @@ __all__ = [ ...@@ -49,6 +49,7 @@ __all__ = [
'box_coder', 'box_coder',
'polygon_box_transform', 'polygon_box_transform',
'yolov3_loss', 'yolov3_loss',
'yolo_box',
'box_clip', 'box_clip',
'multiclass_nms', 'multiclass_nms',
'distribute_fpn_proposals', 'distribute_fpn_proposals',
...@@ -515,6 +516,8 @@ def yolov3_loss(x, ...@@ -515,6 +516,8 @@ def yolov3_loss(x,
class_num, class_num,
ignore_thresh, ignore_thresh,
downsample_ratio, downsample_ratio,
gtscore=None,
use_label_smooth=True,
name=None): name=None):
""" """
${comment} ${comment}
...@@ -533,28 +536,35 @@ def yolov3_loss(x, ...@@ -533,28 +536,35 @@ def yolov3_loss(x,
class_num (int): ${class_num_comment} class_num (int): ${class_num_comment}
ignore_thresh (float): ${ignore_thresh_comment} ignore_thresh (float): ${ignore_thresh_comment}
downsample_ratio (int): ${downsample_ratio_comment} downsample_ratio (int): ${downsample_ratio_comment}
name (string): the name of yolov3 loss name (string): the name of yolov3 loss. Default None.
gtscore (Variable): mixup score of ground truth boxes, should be in shape
of [N, B]. Default None.
use_label_smooth (bool): ${use_label_smooth_comment}
Returns: Returns:
Variable: A 1-D tensor with shape [1], the value of yolov3 loss Variable: A 1-D tensor with shape [N], the value of yolov3 loss
Raises: Raises:
TypeError: Input x of yolov3_loss must be Variable TypeError: Input x of yolov3_loss must be Variable
TypeError: Input gtbox of yolov3_loss must be Variable" TypeError: Input gtbox of yolov3_loss must be Variable
TypeError: Input gtlabel of yolov3_loss must be Variable" TypeError: Input gtlabel of yolov3_loss must be Variable
TypeError: Input gtscore of yolov3_loss must be None or Variable
TypeError: Attr anchors of yolov3_loss must be list or tuple TypeError: Attr anchors of yolov3_loss must be list or tuple
TypeError: Attr class_num of yolov3_loss must be an integer TypeError: Attr class_num of yolov3_loss must be an integer
TypeError: Attr ignore_thresh of yolov3_loss must be a float number TypeError: Attr ignore_thresh of yolov3_loss must be a float number
TypeError: Attr use_label_smooth of yolov3_loss must be a bool value
Examples: Examples:
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32')
gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchor_mask = [0, 1, 2] anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors, loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel,
gtscore=gtscore, anchors=anchors,
anchor_mask=anchor_mask, class_num=80, anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32) ignore_thresh=0.7, downsample_ratio=32)
""" """
...@@ -566,6 +576,8 @@ def yolov3_loss(x, ...@@ -566,6 +576,8 @@ def yolov3_loss(x,
raise TypeError("Input gtbox of yolov3_loss must be Variable") raise TypeError("Input gtbox of yolov3_loss must be Variable")
if not isinstance(gtlabel, Variable): if not isinstance(gtlabel, Variable):
raise TypeError("Input gtlabel of yolov3_loss must be Variable") raise TypeError("Input gtlabel of yolov3_loss must be Variable")
if gtscore is not None and not isinstance(gtscore, Variable):
raise TypeError("Input gtscore of yolov3_loss must be Variable")
if not isinstance(anchors, list) and not isinstance(anchors, tuple): if not isinstance(anchors, list) and not isinstance(anchors, tuple):
raise TypeError("Attr anchors of yolov3_loss must be list or tuple") raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple):
...@@ -575,6 +587,9 @@ def yolov3_loss(x, ...@@ -575,6 +587,9 @@ def yolov3_loss(x,
if not isinstance(ignore_thresh, float): if not isinstance(ignore_thresh, float):
raise TypeError( raise TypeError(
"Attr ignore_thresh of yolov3_loss must be a float number") "Attr ignore_thresh of yolov3_loss must be a float number")
if not isinstance(use_label_smooth, bool):
raise TypeError(
"Attr use_label_smooth of yolov3_loss must be a bool value")
if name is None: if name is None:
loss = helper.create_variable_for_type_inference(dtype=x.dtype) loss = helper.create_variable_for_type_inference(dtype=x.dtype)
...@@ -585,21 +600,26 @@ def yolov3_loss(x, ...@@ -585,21 +600,26 @@ def yolov3_loss(x,
objectness_mask = helper.create_variable_for_type_inference(dtype='int32') objectness_mask = helper.create_variable_for_type_inference(dtype='int32')
gt_match_mask = helper.create_variable_for_type_inference(dtype='int32') gt_match_mask = helper.create_variable_for_type_inference(dtype='int32')
inputs = {
"X": x,
"GTBox": gtbox,
"GTLabel": gtlabel,
}
if gtscore:
inputs["GTScore"] = gtscore
attrs = { attrs = {
"anchors": anchors, "anchors": anchors,
"anchor_mask": anchor_mask, "anchor_mask": anchor_mask,
"class_num": class_num, "class_num": class_num,
"ignore_thresh": ignore_thresh, "ignore_thresh": ignore_thresh,
"downsample_ratio": downsample_ratio, "downsample_ratio": downsample_ratio,
"use_label_smooth": use_label_smooth,
} }
helper.append_op( helper.append_op(
type='yolov3_loss', type='yolov3_loss',
inputs={ inputs=inputs,
"X": x,
"GTBox": gtbox,
"GTLabel": gtlabel,
},
outputs={ outputs={
'Loss': loss, 'Loss': loss,
'ObjectnessMask': objectness_mask, 'ObjectnessMask': objectness_mask,
...@@ -609,6 +629,83 @@ def yolov3_loss(x, ...@@ -609,6 +629,83 @@ def yolov3_loss(x,
return loss return loss
@templatedoc(op_type="yolo_box")
def yolo_box(x,
             img_size,
             anchors,
             class_num,
             conf_thresh,
             downsample_ratio,
             name=None):
    """
    ${comment}

    Args:
        x (Variable): ${x_comment}
        img_size (Variable): ${img_size_comment}
        anchors (list|tuple): ${anchors_comment}
        class_num (int): ${class_num_comment}
        conf_thresh (float): ${conf_thresh_comment}
        downsample_ratio (int): ${downsample_ratio_comment}
        name (string): the name of yolo box layer. Default None.

    Returns:
        Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes,
        and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification
        scores of boxes.

    Raises:
        TypeError: Input x of yolo_box must be Variable
        TypeError: Input img_size of yolo_box must be Variable
        TypeError: Attr anchors of yolo_box must be list or tuple
        TypeError: Attr class_num of yolo_box must be an integer
        TypeError: Attr conf_thresh of yolo_box must be a float number

    Examples:

    .. code-block:: python

        x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
        img_size = fluid.layers.data(name='img_size', shape=[2], dtype='int32')
        anchors = [10, 13, 16, 30, 33, 23]
        boxes, scores = fluid.layers.yolo_box(x=x, img_size=img_size,
                                              class_num=80, anchors=anchors,
                                              conf_thresh=0.01,
                                              downsample_ratio=32)
    """
    # Must stay the first statement: locals() forwards exactly the call
    # arguments to LayerHelper.
    helper = LayerHelper('yolo_box', **locals())

    if not isinstance(x, Variable):
        raise TypeError("Input x of yolo_box must be Variable")
    if not isinstance(img_size, Variable):
        raise TypeError("Input img_size of yolo_box must be Variable")
    if not isinstance(anchors, list) and not isinstance(anchors, tuple):
        raise TypeError("Attr anchors of yolo_box must be list or tuple")
    if not isinstance(class_num, int):
        raise TypeError("Attr class_num of yolo_box must be an integer")
    if not isinstance(conf_thresh, float):
        # The message names the attribute that is actually being checked.
        raise TypeError("Attr conf_thresh of yolo_box must be a float number")

    # Both outputs take the dtype of the input feature map.
    boxes = helper.create_variable_for_type_inference(dtype=x.dtype)
    scores = helper.create_variable_for_type_inference(dtype=x.dtype)

    attrs = {
        "anchors": anchors,
        "class_num": class_num,
        "conf_thresh": conf_thresh,
        "downsample_ratio": downsample_ratio,
    }

    helper.append_op(
        type='yolo_box',
        inputs={
            "X": x,
            "ImgSize": img_size,
        },
        outputs={
            'Boxes': boxes,
            'Scores': scores,
        },
        attrs=attrs)
    return boxes, scores
@templatedoc() @templatedoc()
def detection_map(detect_res, def detection_map(detect_res,
label, label,
......
...@@ -23,7 +23,8 @@ import os ...@@ -23,7 +23,8 @@ import os
import inspect import inspect
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant, NumpyArrayInitializer from ..initializer import Normal, Constant, NumpyArrayInitializer
from ..framework import Variable, OpProtoHolder from ..framework import Variable, OpProtoHolder, _in_imperative_mode
from ..imperative import base
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
from .tensor import concat, assign from .tensor import concat, assign
...@@ -205,16 +206,23 @@ def fc(input, ...@@ -205,16 +206,23 @@ def fc(input,
**Fully Connected Layer** **Fully Connected Layer**
This function creates a fully connected layer in the network. It can take This function creates a fully connected layer in the network. It can take
multiple tensors as its inputs. It creates a variable called weights for one or multiple tensors as its inputs(input can be a list of Variable, see
each input tensor, which represents a fully connected weight matrix from Args in detail). It creates a variable called weights for each input tensor,
each input unit to each output unit. The fully connected layer multiplies which represents a fully connected weight matrix from each input unit to
each input tensor with its coresponding weight to produce an output Tensor. each output unit. The fully connected layer multiplies each input tensor
If multiple input tensors are given, the results of multiple multiplications with its corresponding weight to produce an output Tensor with shape [M, `size`],
will be sumed up. If bias_attr is not None, a bias variable will be created where M is batch size. If multiple input tensors are given, the results of
and added to the output. Finally, if activation is not None, it will be applied multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
to the output as well. is not None, a bias variable will be created and added to the output.
Finally, if activation is not None, it will be applied to the output as well.
When the input is single tensor:
This process can be formulated as follows: .. math::
Out = Act({XW + b})
When the input are multiple tensors:
.. math:: .. math::
...@@ -222,13 +230,31 @@ def fc(input, ...@@ -222,13 +230,31 @@ def fc(input,
In the above equation: In the above equation:
* :math:`N`: Number of the input. * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
* :math:`X_i`: The input tensor. * :math:`X_i`: The i-th input tensor.
* :math:`W`: The weights created by this layer. * :math:`W_i`: The i-th weight matrix, corresponding to the i-th input tensor.
* :math:`b`: The bias parameter created by this layer (if needed). * :math:`b`: The bias parameter created by this layer (if needed).
* :math:`Act`: The activation function. * :math:`Act`: The activation function.
* :math:`Out`: The output tensor. * :math:`Out`: The output tensor.
See below for an example.
.. code-block:: text
Given:
data_1.data = [[[0.1, 0.2],
[0.3, 0.4]]]
data_1.shape = (1, 2, 2) # 1 is batch_size
data_2 = [[[0.1, 0.2, 0.3]]]
data_2.shape = (1, 1, 3)
out = fluid.layers.fc(input=[data_1, data_2], size=2)
Then:
out.data = [[0.18669507, 0.1893476]]
out.shape = (1, 2)
Args: Args:
input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
the input tensor(s) is at least 2. the input tensor(s) is at least 2.
...@@ -260,8 +286,14 @@ def fc(input, ...@@ -260,8 +286,14 @@ def fc(input,
Examples: Examples:
.. code-block:: python .. code-block:: python
# when input is single tensor
data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
fc = fluid.layers.fc(input=data, size=1000, act="tanh") fc = fluid.layers.fc(input=data, size=1000, act="tanh")
# when input are multiple tensors
data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh")
""" """
helper = LayerHelper("fc", **locals()) helper = LayerHelper("fc", **locals())
...@@ -1432,6 +1464,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): ...@@ -1432,6 +1464,8 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
predict = fluid.layers.fc(input=net, size=classdim, act='softmax') predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label)
""" """
if not soft_label:
return cross_entropy2(input, label, ignore_index)
helper = LayerHelper('cross_entropy', **locals()) helper = LayerHelper('cross_entropy', **locals())
out = helper.create_variable_for_type_inference(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op( helper.append_op(
...@@ -1444,6 +1478,22 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): ...@@ -1444,6 +1478,22 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
return out return out
def cross_entropy2(input, label, ignore_index=kIgnoreIndex):
    """
    Append a ``cross_entropy2`` operator through ``LayerHelper``.

    Args:
        input: Passed to the operator as ``X``; its ``dtype`` is used for
            every output variable created here.
        label: Passed to the operator as ``Label``.
        ignore_index: Forwarded unchanged as the operator's ``ignore_index``
            attribute. Defaults to ``kIgnoreIndex``.

    Returns:
        The variable bound to the operator's ``Y`` output. The ``MatchX`` and
        ``XShape`` outputs are created as well but are not returned.
    """
    # Must stay the first statement: locals() is expected to hold only the
    # three arguments when it is expanded into LayerHelper.
    helper = LayerHelper('cross_entropy2', **locals())

    in_dtype = input.dtype
    # Creation order (out, xshape, match_x) is kept on purpose.
    out = helper.create_variable_for_type_inference(dtype=in_dtype)
    xshape = helper.create_variable_for_type_inference(dtype=in_dtype)
    match_x = helper.create_variable_for_type_inference(dtype=in_dtype)

    op_inputs = {'X': [input], 'Label': [label]}
    op_outputs = {'Y': [out], 'MatchX': [match_x], 'XShape': [xshape]}
    helper.append_op(
        type='cross_entropy2',
        inputs=op_inputs,
        outputs=op_outputs,
        attrs={'ignore_index': ignore_index})
    return out
def bpr_loss(input, label, name=None): def bpr_loss(input, label, name=None):
""" """
Bayesian Personalized Ranking Loss Operator. Bayesian Personalized Ranking Loss Operator.
...@@ -2904,11 +2954,17 @@ def batch_norm(input, ...@@ -2904,11 +2954,17 @@ def batch_norm(input,
y_i &\\gets \\gamma \\hat{x_i} + \\beta y_i &\\gets \\gamma \\hat{x_i} + \\beta
Args: Args:
input(variable): The input variable which is a LoDTensor. input(variable): The rank of input variable can be 2, 3, 4, 5.
act(string, Default None): Activation type, linear|relu|prelu|... act(string, Default None): Activation type, linear|relu|prelu|...
is_test(bool, Default False): Used for training or training. is_test (bool, Default False): A flag indicating whether it is in
momentum(float, Default 0.9): test phase or not.
epsilon(float, Default 1e-05): momentum(float, Default 0.9): The value used for the moving_mean and
moving_var computation. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
epsilon(float, Default 1e-05): A value added to the denominator for
numerical stability. Default is 1e-5.
param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as param_attr. If the Initializer of the param_attr will create ParamAttr as param_attr. If the Initializer of the param_attr
...@@ -2966,15 +3022,8 @@ def batch_norm(input, ...@@ -2966,15 +3022,8 @@ def batch_norm(input,
shape=param_shape, shape=param_shape,
dtype=dtype, dtype=dtype,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0))
# setting stop_gradient=True to reduce computation
if use_global_stats and helper.param_attr.learning_rate == 0.:
scale.stop_gradient = True
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
# setting stop_gradient=True to reduce computation
if use_global_stats and helper.bias_attr.learning_rate == 0.:
bias.stop_gradient = True
mean = helper.create_parameter( mean = helper.create_parameter(
attr=ParamAttr( attr=ParamAttr(
...@@ -4847,7 +4896,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): ...@@ -4847,7 +4896,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
if transpose_y: if transpose_y:
y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2] y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
if x_shape[-1] != y_shape[-2]: if x_shape[-1] != y_shape[-2]:
raise ValueError("Invalid inputs for matmul.") raise ValueError("Invalid inputs for matmul. x: %s, y: %s\n" %
(x_shape, y_shape))
if len(y_shape) > 2 and len(x_shape) > 2: if len(y_shape) > 2 and len(x_shape) > 2:
for i, dim_x in enumerate(x_shape[:-2]): for i, dim_x in enumerate(x_shape[:-2]):
...@@ -6350,6 +6400,8 @@ def squeeze(input, axes, name=None): ...@@ -6350,6 +6400,8 @@ def squeeze(input, axes, name=None):
x = layers.data(name='x', shape=[5, 1, 10]) x = layers.data(name='x', shape=[5, 1, 10])
y = layers.squeeze(input=x, axes=[1]) y = layers.squeeze(input=x, axes=[1])
""" """
assert not _in_imperative_mode(), (
"squeeze layer is not supported in imperative mode yet.")
helper = LayerHelper("squeeze", **locals()) helper = LayerHelper("squeeze", **locals())
out = helper.create_variable_for_type_inference(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
...@@ -9087,6 +9139,10 @@ def _elementwise_op(helper): ...@@ -9087,6 +9139,10 @@ def _elementwise_op(helper):
op_type = helper.layer_type op_type = helper.layer_type
x = helper.kwargs.get('x', None) x = helper.kwargs.get('x', None)
y = helper.kwargs.get('y', None) y = helper.kwargs.get('y', None)
if _in_imperative_mode():
x = base.to_variable(x)
y = base.to_variable(y)
assert x is not None, 'x cannot be None in {}'.format(op_type) assert x is not None, 'x cannot be None in {}'.format(op_type)
assert y is not None, 'y cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type)
axis = helper.kwargs.get('axis', -1) axis = helper.kwargs.get('axis', -1)
......
...@@ -476,11 +476,29 @@ class TestYoloDetection(unittest.TestCase): ...@@ -476,11 +476,29 @@ class TestYoloDetection(unittest.TestCase):
x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], gtscore = layers.data(name='gtscore', shape=[10], dtype='float32')
[0, 1], 10, 0.7, 32) loss = layers.yolov3_loss(
x,
gtbox,
gtlabel, [10, 13, 30, 13], [0, 1],
10,
0.7,
32,
gtscore=gtscore,
use_label_smooth=False)
self.assertIsNotNone(loss) self.assertIsNotNone(loss)
def test_yolo_box(self):
    """Build a ``yolo_box`` layer in a fresh Program and check that both
    of its returned values are not None."""
    prog = Program()
    anchors = [10, 13, 30, 13]
    with program_guard(prog):
        feature_map = layers.data(
            name='x', shape=[30, 7, 7], dtype='float32')
        image_size = layers.data(name='img_size', shape=[2], dtype='int32')
        # Positional order: x, img_size, anchors, then 10, 0.01 and 32.
        results = layers.yolo_box(feature_map, image_size, anchors, 10, 0.01,
                                  32)
        boxes, scores = results
        for produced in (boxes, scores):
            self.assertIsNotNone(produced)
class TestBoxClip(unittest.TestCase): class TestBoxClip(unittest.TestCase):
def test_box_clip(self): def test_box_clip(self):
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
from mkldnn_op_test import format_reorder
class TestTransposeOp(OpTest):
    """Output check for the ``transpose2`` op with ``use_mkldnn`` enabled on
    int8 input data.

    Subclasses customise the case through ``initTestCase`` (sets
    ``self.shape``) and ``initInputData`` (sets ``self.input_data``).
    """

    def init_op_type(self):
        self.op_type = "transpose2"

    def initTestCase(self):
        self.shape = (2, 3, 4, 5)

    def initInputData(self):
        # Integers drawn from [0, 100) and shifted by -50 before the int8 cast.
        drawn = np.random.randint(0, 100, self.shape)
        self.input_data = (drawn - 50).astype('int8')

    def setUp(self):
        self.init_op_type()
        self.initTestCase()
        self.initInputData()
        self.use_mkldnn = True
        self.axis = (0, 2, 3, 1)

        # transform data format to 'NHWC' for INT8 transpose specially.
        reordered = format_reorder(self.input_data, self.shape)
        self.inputs = {'X': reordered}
        self.attrs = {
            'axis': list(self.axis),
            'use_mkldnn': self.use_mkldnn,
        }
        # XShape is only a placeholder; test_check_output excludes it.
        xshape_placeholder = np.random.random(self.shape).astype('int8')
        self.outputs = {
            'XShape': xshape_placeholder,
            'Out': reordered.transpose(self.axis)
        }

    def test_check_output(self):
        self.check_output(no_check_set=['XShape'])
class TestINT8Case(TestTransposeOp):
    """int8 transpose case on a (2, 4, 6, 8) input."""

    def initTestCase(self):
        self.shape = (2, 4, 6, 8)

    def initInputData(self):
        # Integers drawn from [0, 100) and shifted by -50 before the int8 cast.
        drawn = np.random.randint(0, 100, self.shape)
        self.input_data = (drawn - 50).astype('int8')
class TestUINT8Case(TestTransposeOp):
    """uint8 transpose case on a (1, 3, 5, 7) input."""

    def initTestCase(self):
        self.shape = (1, 3, 5, 7)

    def initInputData(self):
        # The base setUp() calls initInputData(); the previous name
        # (initDataType) was never invoked, so this case silently ran with
        # the inherited int8 data instead of uint8.
        self.input_data = (np.random.randint(0, 100,
                                             self.shape)).astype('uint8')

    # Old method name kept as an alias for backward compatibility.
    initDataType = initInputData
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from op_test import OpTest
import unittest
import numpy as np
import six
class CrossEntropy2OpTestBase(OpTest):
    """Output and gradient check for the ``cross_entropy2`` op.

    Subclasses override ``initParameters`` to change the input shape, the
    dtype and the ignored label value.
    """

    def initParameters(self):
        """Return ``(shape, dtype, ignore_index)`` for this case."""
        return [32, 64], 'float32', -100

    def calc_output(self, logits, label, ignore_index):
        """Compute the reference outputs row by row.

        Args:
            logits: 2-D array indexed as ``logits[row][class]``.
            label: array with one class index per row.
            ignore_index: rows whose label equals this value are skipped and
                keep zeros in both results.

        Returns:
            ``(ret, match_x)``, both shaped like ``label``: ``match_x`` holds
            the logit selected by the label and ``ret`` its negative log.
        """
        ret = np.zeros(shape=label.shape, dtype=logits.dtype)
        match_x = np.zeros(shape=label.shape, dtype=logits.dtype)
        for idx in six.moves.range(label.shape[0]):
            if label[idx] == ignore_index:
                continue
            match_x[idx] = logits[idx][label[idx]]
            ret[idx] = -np.log(match_x[idx])
        return ret, match_x

    def setUp(self):
        self.shape, self.dtype, self.ignore_index = self.initParameters()
        self.op_type = 'cross_entropy2'
        feature_size = int(self.shape[-1])
        batch_size = int(np.prod(self.shape) / feature_size)
        # Values lie in [1, 2), so the log in calc_output is always defined.
        logits = (np.random.random(size=self.shape) + 1).astype(self.dtype)
        # np.random.random_integers is deprecated; randint draws from the same
        # range because its upper bound is exclusive
        # ([0, feature_size) == [0, feature_size - 1]).
        label = np.random.randint(
            low=0, high=feature_size,
            size=self.shape[0:-1] + [1]).astype('int64')
        # The reference is computed on a flattened (batch, feature) view and
        # reshaped back to the label shape below.
        outputs, match_x = self.calc_output(
            np.reshape(logits, [batch_size, feature_size]),
            np.reshape(label, [batch_size, 1]), self.ignore_index)
        self.inputs = {'X': logits, 'Label': label}
        self.outputs = {
            'Y': np.reshape(outputs, label.shape),
            'MatchX': np.reshape(match_x, label.shape),
            # Placeholder only; XShape is excluded from the output check.
            'XShape': np.zeros(
                shape=logits.shape, dtype=logits.dtype)
        }
        self.attrs = {'ignore_index': self.ignore_index}

    def test_check_output(self):
        self.check_output(no_check_set=['XShape'])

    def test_check_grad(self):
        self.check_grad(
            inputs_to_check=['X'],
            output_names=['Y'],
            no_grad_set=['XShape', 'MatchX', 'Label'])
class CrossEntropy2OpTest2(CrossEntropy2OpTestBase):
    """float64 case on a [32, 64] input with label value 3 ignored."""

    def initParameters(self):
        shape, dtype, ignore_index = [32, 64], 'float64', 3
        return shape, dtype, ignore_index
class CrossEntropy2OpTest3(CrossEntropy2OpTestBase):
    """float32 case on a [4, 8, 16, 32] input with ignore_index -100."""

    def initParameters(self):
        shape, dtype, ignore_index = [4, 8, 16, 32], 'float32', -100
        return shape, dtype, ignore_index
class CrossEntropy2OpTest4(CrossEntropy2OpTestBase):
    """float32 case on a [4, 8, 16, 32] input with label value 3 ignored."""

    def initParameters(self):
        shape, dtype, ignore_index = [4, 8, 16, 32], 'float32', 3
        return shape, dtype, ignore_index
if __name__ == '__main__':
unittest.main()
...@@ -524,8 +524,8 @@ class TestLocalLookupTable(TestDistLookupTableBase): ...@@ -524,8 +524,8 @@ class TestLocalLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad', 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad', 'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
...@@ -564,8 +564,8 @@ class TestDistLookupTable(TestDistLookupTableBase): ...@@ -564,8 +564,8 @@ class TestDistLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', 'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'split_selected_rows', 'send', 'lookup_table_grad', 'split_selected_rows', 'send',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
...@@ -612,8 +612,8 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): ...@@ -612,8 +612,8 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad', 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad', 'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
...@@ -652,8 +652,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): ...@@ -652,8 +652,8 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', 'elementwise_add', 'cross_entropy2', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mean_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'split_selected_rows', 'send', 'lookup_table_grad', 'split_selected_rows', 'send',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
...@@ -841,8 +841,8 @@ class TestRemoteLookupTable(TestDistLookupTableBase): ...@@ -841,8 +841,8 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
ops = [ ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad', 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', 'cross_entropy_grad2', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad', 'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
......
...@@ -17,6 +17,7 @@ from __future__ import print_function ...@@ -17,6 +17,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid.core as core
class TestFakeQuantizeOp(OpTest): class TestFakeQuantizeOp(OpTest):
...@@ -75,6 +76,7 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest): ...@@ -75,6 +76,7 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
'InScale': np.zeros(1).astype("float32") 'InScale': np.zeros(1).astype("float32")
} }
scale = np.max(np.abs(self.inputs['X'])).astype("float32") scale = np.max(np.abs(self.inputs['X'])).astype("float32")
out_scales = np.zeros(self.attrs['window_size']).astype("float32") out_scales = np.zeros(self.attrs['window_size']).astype("float32")
out_scales[0] = scale out_scales[0] = scale
self.outputs = { self.outputs = {
...@@ -88,6 +90,46 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest): ...@@ -88,6 +90,46 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
self.check_output() self.check_output()
class TestFakeQuantizeMovingOp(OpTest):
    """Output check for ``fake_quantize_moving_average_abs_max`` with
    ``is_test`` set to False."""

    def setUp(self):
        self.op_type = "fake_quantize_moving_average_abs_max"
        self.attrs = {
            'bit_length': 5,
            'moving_rate': 0.9,
            'is_test': False
        }
        rate = self.attrs['moving_rate']

        # Incoming accumulator and state both start at 1, the scale at 0.001.
        accum = np.ones(1, dtype="float32")
        state = np.ones(1, dtype="float32")
        scale = np.zeros(1, dtype="float32")
        scale[0] = 0.001
        x = np.random.random((8, 16, 7, 7)).astype("float32")
        self.inputs = {
            'X': x,
            'InScale': scale,
            'InAccum': accum,
            'InState': state,
        }

        # Reference update: accum <- rate * accum + max|X|,
        # state <- rate * state + 1, scale <- accum / state.
        out_accum = np.zeros(1, dtype="float32")
        out_accum[0] = rate * accum[0] + np.max(np.abs(x)).astype("float32")
        out_state = np.zeros(1, dtype="float32")
        out_state[0] = rate * state[0] + 1
        out_scale = out_accum / out_state

        # Multiplier applied to X / scale before rounding.
        max_level = (1 << (self.attrs['bit_length'] - 1)) - 1
        self.outputs = {
            'Out': np.round(x / out_scale * max_level),
            'OutAccum': out_accum,
            'OutState': out_state,
            'OutScale': out_scale,
        }

    def test_check_output(self):
        self.check_output()
class TestFakeQuantizeRangeAbsMaxOp2(OpTest): class TestFakeQuantizeRangeAbsMaxOp2(OpTest):
def setUp(self): def setUp(self):
self.op_type = "fake_quantize_range_abs_max" self.op_type = "fake_quantize_range_abs_max"
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import unittest
import numpy as np
import six
import sys
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable
def gen_data():
    """Placeholder: performs no work and returns None."""
    return None
class GraphConv(fluid.imperative.Layer):
    """Layer computing ``matmul(adj, matmul(features, weight)) + bias``.

    Args:
        name_scope: Forwarded to the base ``Layer`` constructor.
        in_features: First dimension of the weight parameter.
        out_features: Second dimension of the weight parameter and the
            length of the bias parameter.
    """

    def __init__(self, name_scope, in_features, out_features):
        super(GraphConv, self).__init__(name_scope)

        self._in_features = in_features
        self._out_features = out_features

        weight_shape = [self._in_features, self._out_features]
        bias_shape = [self._out_features]
        # The weight is created before the bias, as in the original order.
        self.weight = self.create_parameter(
            attr=None, dtype='float32', shape=weight_shape)
        self.bias = self.create_parameter(
            attr=None, dtype='float32', shape=bias_shape)

    def forward(self, features, adj):
        projected = fluid.layers.matmul(features, self.weight)
        # TODO(panyx0718): sparse matmul?
        aggregated = fluid.layers.matmul(adj, projected)
        return aggregated + self.bias
class GCN(fluid.imperative.Layer):
    """Two stacked ``GraphConv`` layers (``num_hidden`` -> 32 -> 10) with a
    relu applied after the first one."""

    def __init__(self, name_scope, num_hidden):
        super(GCN, self).__init__(name_scope)
        # Both sub-layers are scoped under this layer's full name.
        self.gc = GraphConv(self.full_name(), num_hidden, 32)
        self.gc2 = GraphConv(self.full_name(), 32, 10)

    def forward(self, x, adj):
        first = self.gc(x, adj)
        activated = fluid.layers.relu(first)
        return self.gc2(activated, adj)
class TestImperativeGNN(unittest.TestCase):
    """Compares a GCN built as a static program with the same model built
    under ``fluid.imperative.guard()``."""

    def test_gnn_float32(self):
        """Run one Adam step on all-zero inputs in both modes and compare
        the loss and the first GraphConv weight."""
        # The same seed is set on the programs of both runs.
        seed = 90

        startup = fluid.Program()
        startup.random_seed = seed
        main = fluid.Program()
        main.random_seed = seed

        # Static-graph run: build the network, minimize, execute once.
        scope = fluid.core.Scope()
        with new_program_scope(main=main, startup=startup, scope=scope):
            features = fluid.layers.data(
                name='features',
                shape=[1, 100, 50],
                dtype='float32',
                append_batch_size=False)
            # Use selected rows when it's supported.
            adj = fluid.layers.data(
                name='adj',
                shape=[1, 100, 100],
                dtype='float32',
                append_batch_size=False)
            labels = fluid.layers.data(
                name='labels',
                shape=[100, 1],
                dtype='int64',
                append_batch_size=False)

            model = GCN('test_gcn', 50)
            logits = model(features, adj)
            # Drop the leading dimension of the logits.
            logits = fluid.layers.reshape(logits, logits.shape[1:])
            # In other example, it's nll with log_softmax. However, paddle's
            # log_loss only supports binary classification now.
            loss = fluid.layers.softmax_with_cross_entropy(logits, labels)
            loss = fluid.layers.reduce_sum(loss)

            adam = AdamOptimizer(learning_rate=1e-3)
            adam.minimize(loss)
            # CUDAPlace(0) when compiled with CUDA, CPUPlace otherwise.
            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
            exe.run(startup)
            # No program is passed here; presumably the default main program
            # installed by new_program_scope is executed -- TODO confirm.
            static_loss = exe.run(feed={
                'features': np.zeros(
                    [1, 100, 50], dtype=np.float32),
                'adj': np.zeros(
                    [1, 100, 100], dtype=np.float32),
                'labels': np.zeros(
                    [100, 1], dtype=np.int64)
            },
                                  fetch_list=[loss])[0]

            # Read the first GraphConv weight out of the scope after the run.
            static_weight = np.array(
                scope.find_var(model.gc.weight.name).get_tensor())

        # Imperative run: same seed, same all-zero inputs, same model.
        with fluid.imperative.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            features = np.zeros([1, 100, 50], dtype=np.float32)
            # Use selected rows when it's supported.
            adj = np.zeros([1, 100, 100], dtype=np.float32)
            labels = np.zeros([100, 1], dtype=np.int64)

            model = GCN('test_gcn', 50)
            logits = model(to_variable(features), to_variable(adj))
            logits = fluid.layers.reshape(logits, logits.shape[1:])
            # In other example, it's nll with log_softmax. However, paddle's
            # log_loss only supports binary classification now.
            loss = fluid.layers.softmax_with_cross_entropy(logits,
                                                           to_variable(labels))
            loss = fluid.layers.reduce_sum(loss)
            adam = AdamOptimizer(learning_rate=1e-3)
            adam.minimize(loss)

        # The loss is compared exactly, the weight with np.allclose.
        self.assertEqual(static_loss, loss._numpy())
        self.assertTrue(
            np.allclose(static_weight, model.gc.weight._numpy()))
        sys.stderr.write('%s %s\n' % (static_loss, loss._numpy()))
if __name__ == '__main__':
unittest.main()
...@@ -84,6 +84,27 @@ class TestLayer(LayerTest): ...@@ -84,6 +84,27 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
def test_matmul(self):
    """``layers.matmul`` on 3x3 all-ones inputs must give matching results
    in the static-graph and dynamic-graph contexts."""
    with self.static_graph():
        lhs = layers.data(name='t', shape=[3, 3], dtype='float32')
        rhs = layers.data(name='t2', shape=[3, 3], dtype='float32')
        product = layers.matmul(lhs, rhs)
        feed = {
            't': np.ones(
                [3, 3], dtype='float32'),
            't2': np.ones(
                [3, 3], dtype='float32')
        }
        fetched = self.get_static_graph_result(
            feed=feed, fetch_list=[product])
        static_ret = fetched[0]

    with self.dynamic_graph():
        lhs_np = np.ones([3, 3], dtype='float32')
        rhs_np = np.ones([3, 3], dtype='float32')
        lhs_var = base.to_variable(lhs_np)
        rhs_var = base.to_variable(rhs_np)
        dy_ret = layers.matmul(lhs_var, rhs_var)

    matches = np.allclose(static_ret, dy_ret._numpy())
    self.assertTrue(matches)
def test_conv2d(self): def test_conv2d(self):
with self.static_graph(): with self.static_graph():
images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32') images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
...@@ -153,6 +174,60 @@ class TestLayer(LayerTest): ...@@ -153,6 +174,60 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy())) self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy()))
def test_elementwise_math(self):
    """Chain elementwise add, pow, div, sub and mul over six 3x3 operands
    and compare the static-graph result with the dynamic-graph result."""
    names = ['t', 't2', 't3', 't4', 't5', 't6']
    # First operand is all ones; the rest are ones scaled by a constant.
    arrays = [np.ones([3, 3], dtype='float32')]
    for factor in (1.1, 2, 3, 4, 5):
        arrays.append(np.ones([3, 3], dtype='float32') * factor)

    def chain(operands):
        # add(op0, op1), then pow, div, sub, mul with the remaining operands.
        followups = (layers.elementwise_pow, layers.elementwise_div,
                     layers.elementwise_sub, layers.elementwise_mul)
        acc = layers.elementwise_add(operands[0], operands[1])
        for op, rhs in zip(followups, operands[2:]):
            acc = op(acc, rhs)
        return acc

    with self.static_graph():
        placeholders = []
        for var_name in names:
            placeholders.append(
                layers.data(
                    name=var_name, shape=[3, 3], dtype='float32'))
        ret = chain(placeholders)
        static_ret = self.get_static_graph_result(
            feed=dict(zip(names, arrays)), fetch_list=[ret])[0]

    with self.dynamic_graph():
        # The numpy arrays are passed to the layers directly.
        dy_ret = chain(arrays)

    self.assertTrue(
        np.allclose(static_ret, dy_ret._numpy()),
        '%s vs %s' % (static_ret, dy_ret._numpy()))
def test_elementwise_minmax(self):
    """In dynamic-graph mode, elementwise min/max of all-ones and all-twos
    3x3 arrays must return the smaller and the larger array."""
    smaller = np.ones([3, 3], dtype='float32')
    larger = np.ones([3, 3], dtype='float32') * 2

    with self.dynamic_graph():
        min_ret = layers.elementwise_min(smaller, larger)
        max_ret = layers.elementwise_max(smaller, larger)

    for expected, actual in ((smaller, min_ret), (larger, max_ret)):
        self.assertTrue(np.allclose(expected, actual._numpy()))
class TestBook(unittest.TestCase): class TestBook(unittest.TestCase):
def test_fit_a_line(self): def test_fit_a_line(self):
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
...@@ -63,5 +64,54 @@ class TestCase2(TestSliceOp): ...@@ -63,5 +64,54 @@ class TestCase2(TestSliceOp):
self.out = self.input[-3:3, 0:100, :, 2:-1] self.out = self.input[-3:3, 0:100, :, 2:-1]
@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFP16(TestSliceOp):
    """float16 slice case on a [3, 4, 5, 6] input, sliced on axes 0, 1 and 3.

    Skipped when Paddle is not compiled with CUDA; each check is a no-op
    when ``CUDAPlace(0)`` does not support float16.
    """

    def config(self):
        self.dtype = "float16"
        data = np.random.random([3, 4, 5, 6]).astype(self.dtype)
        self.input = data
        self.starts = [-3, 0, 2]
        self.ends = [3, 100, -1]
        self.axes = [0, 1, 3]
        self.out = data[-3:3, 0:100, :, 2:-1]

    def test_check_output(self):
        place = core.CUDAPlace(0)
        if not core.is_float16_supported(place):
            return
        self.check_output_with_place(place, atol=1e-5)

    def test_check_grad_normal(self):
        place = core.CUDAPlace(0)
        if not core.is_float16_supported(place):
            return
        self.check_grad_with_place(
            place, ['Input'], 'Out', max_relative_error=0.006)
@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestFP16_2(TestSliceOp):
    """float16 slice case on a [3, 4, 5] input, sliced on axis 1 only.

    Skipped when Paddle is not compiled with CUDA; each check is a no-op
    when ``CUDAPlace(0)`` does not support float16.
    """

    def config(self):
        self.dtype = "float16"
        data = np.random.random([3, 4, 5]).astype(self.dtype)
        self.input = data
        self.starts = [0]
        self.ends = [1]
        self.axes = [1]
        self.out = data[:, 0:1, :]

    def test_check_output(self):
        place = core.CUDAPlace(0)
        if not core.is_float16_supported(place):
            return
        self.check_output_with_place(place, atol=1e-5)

    def test_check_grad_normal(self):
        place = core.CUDAPlace(0)
        if not core.is_float16_supported(place):
            return
        grad_kwargs = dict(max_relative_error=0.006, numeric_grad_delta=0.5)
        self.check_grad_with_place(place, ['Input'], 'Out', **grad_kwargs)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import os
import six
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler
class TestSyncBatchNormOpTraining(unittest.TestCase):
def setUp(self):
#self.dtype = np.float32
self.dtype = np.float64
self.N = 32
self.C = 16
self.H = 64
self.W = 32
self.dshape = [self.N, self.C, self.H, self.W]
def build_program(self,
place,
layout,
seed,
sync_bn=False,
only_forward=False):
main = fluid.Program()
startup = fluid.Program()
main.random_seed = seed
startup.random_seed = seed
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
data = fluid.layers.data(
name='input',
shape=self.dshape,
dtype=self.dtype,
append_batch_size=False)
conv = fluid.layers.conv2d(
input=data,
num_filters=32,
filter_size=1,
param_attr=fluid.ParamAttr(name='conv2d_weight'),
bias_attr=False,
use_cudnn=False)
bn = fluid.layers.batch_norm(
conv,
param_attr=fluid.ParamAttr(name='bn_scale'),
bias_attr=fluid.ParamAttr(name='bn_bias'),
moving_mean_name='bn_moving_mean',
moving_variance_name='bn_moving_variance',
data_layout=layout,
is_test=only_forward)
sigmoid = fluid.layers.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
if not sync_bn:
out = out / core.get_cuda_device_count()
if not only_forward:
sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
sgd_opt.backward(out)
return main, startup, [out, conv, bn]
def compare(self, place, layout, only_forward):
seed = 10
os.environ['FLAGS_cudnn_deterministic'] = "1"
data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
# Single-GPU, N = 32 per GPU
main, startup, outs = self.build_program(place, layout, seed, False,
only_forward)
exe = fluid.Executor(place)
exe.run(startup)
fetch_names = [v.name for v in outs] + [
'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
]
if not only_forward:
others = [
'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
]
fetch_names += others
bn_fetches = exe.run(program=main,
feed={'input': data},
fetch_list=fetch_names)
#####################################################################
# Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
main, startup, outs = self.build_program(place, layout, seed, True,
only_forward)
exe = fluid.Executor(place)
exe.run(startup)
fetch_names = [v.name for v in outs] + [
'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
]
if not only_forward:
others = [
'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
]
fetch_names += others
for nm in fetch_names:
fv = fluid.framework._get_var(str(nm), program=main)
fv.persistable = True
build_strategy = fluid.BuildStrategy()
build_strategy.sync_batch_norm = True
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
comp_prog = compiler.CompiledProgram(main).with_data_parallel(
outs[0].name if not only_forward else None,
build_strategy=build_strategy)
sync_bn_fetches = exe.run(program=comp_prog,
feed={'input': data},
fetch_list=fetch_names)
for i in six.moves.xrange(1, len(sync_bn_fetches)):
bn_val = bn_fetches[i]
sync_bn_val = sync_bn_fetches[i]
if sync_bn_val.shape != bn_val.shape:
sync_bn_val = sync_bn_val[:bn_val.shape[0]]
self.assertTrue(
np.allclose(
bn_val, sync_bn_val, atol=1e-3),
"Output (" + fetch_names[i] + ") has diff. \n" + "\nBN " +
str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
def test_train(self):
if not core.is_compiled_with_cuda():
return
places = [core.CUDAPlace(0)]
for place in places:
for layout in ["NCHW", "NHWC"]:
self.compare(place, layout, False)
def test_infer(self):
if not core.is_compiled_with_cuda():
return
places = [core.CUDAPlace(0)]
for place in places:
for layout in ["NCHW", "NHWC"]:
self.compare(place, layout, True)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import core
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``."""
    denominator = 1.0 + np.exp(-1.0 * x)
    return 1.0 / denominator
def YoloBox(x, img_size, attrs):
    """NumPy reference that decodes a YOLO head output into boxes and scores.

    Args:
        x: array whose shape unpacks as (n, c, h, w); it is reshaped to
            (n, an_num, 5 + class_num, h, w).
        img_size: per-image sizes, indexed as img_size[:, 0] and
            img_size[:, 1]. Column 1 scales the x coordinates and column 0
            the y coordinates.
        attrs: dict with the keys 'anchors', 'class_num', 'conf_thresh' and
            'downsample'.

    Returns:
        (boxes, scores): boxes reshaped to (n, -1, 4) holding the two corner
        points of each box, and scores reshaped to (n, -1, class_num).
    """
    batch, _, rows, cols = x.shape
    anchors = attrs['anchors']
    an_num = int(len(anchors) // 2)
    class_num = attrs['class_num']
    conf_thresh = attrs['conf_thresh']
    input_size = attrs['downsample'] * rows

    # Move the per-anchor channel axis last: (n, an_num, h, w, 5 + class_num).
    x = x.reshape((batch, an_num, 5 + class_num, rows, cols))
    x = x.transpose((0, 1, 3, 4, 2))

    boxes = x[:, :, :, :, :4].copy()

    # Centre = (cell index + sigmoid(offset)) / grid size. The index vectors
    # broadcast over the (h, w) grid.
    col_idx = np.arange(cols).reshape((1, cols))
    row_idx = np.arange(rows).reshape((rows, 1))
    boxes[:, :, :, :, 0] = (col_idx + sigmoid(boxes[:, :, :, :, 0])) / cols
    boxes[:, :, :, :, 1] = (row_idx + sigmoid(boxes[:, :, :, :, 1])) / rows

    # Width / height = exp(raw) * anchor size divided by input_size.
    anchor_sizes = np.array(
        [(anchors[k] / input_size, anchors[k + 1] / input_size)
         for k in range(0, len(anchors), 2)])
    anchor_w = anchor_sizes[:, 0].reshape((1, an_num, 1, 1))
    anchor_h = anchor_sizes[:, 1].reshape((1, an_num, 1, 1))
    boxes[:, :, :, :, 2] = np.exp(boxes[:, :, :, :, 2]) * anchor_w
    boxes[:, :, :, :, 3] = np.exp(boxes[:, :, :, :, 3]) * anchor_h

    # Confidences below the threshold are zeroed, which also zeroes the
    # matching scores and boxes.
    conf = sigmoid(x[:, :, :, :, 4:5])
    conf[conf < conf_thresh] = 0.
    scores = sigmoid(x[:, :, :, :, 5:]) * conf
    boxes = boxes * (conf > 0.).astype('float32')

    boxes = boxes.reshape((batch, -1, 4))

    # (centre, size) -> (centre - size / 2, centre + size / 2). Both corners
    # are computed before either slice is overwritten.
    half_extent = boxes[:, :, 2:4] / 2.
    low_corner = boxes[:, :, :2] - half_extent
    high_corner = boxes[:, :, :2] + half_extent
    boxes[:, :, :2] = low_corner
    boxes[:, :, 2:4] = high_corner

    # Scale x coordinates by img_size[:, 1] and y coordinates by
    # img_size[:, 0]. Assigning in place keeps the dtype of ``boxes``.
    scale = img_size[:, [1, 0, 1, 0]][:, np.newaxis, :]
    boxes[:, :, :] = boxes * scale

    # Clip per image: first corner at >= 0, second corner at <= size - 1.
    for i in range(len(boxes)):
        boxes[i, :, :2] = np.clip(boxes[i, :, :2], 0, np.inf)
        boxes[i, :, 2] = np.clip(boxes[i, :, 2], -np.inf, img_size[i, 1] - 1)
        boxes[i, :, 3] = np.clip(boxes[i, :, 3], -np.inf, img_size[i, 0] - 1)

    return boxes, scores.reshape((batch, -1, class_num))
class TestYoloBoxOp(OpTest):
    """Checks the 'yolo_box' op output against the NumPy ``YoloBox`` reference."""

    def setUp(self):
        """Create random inputs and compute the expected outputs."""
        self.initTestCase()
        self.op_type = 'yolo_box'

        # Draw x before img_size so the random stream is consumed in the
        # same order as before.
        feature_map = np.random.random(self.x_shape).astype('float32')
        image_sizes = np.random.randint(
            10, 20, self.imgsize_shape).astype('int32')

        self.attrs = dict(
            anchors=self.anchors,
            class_num=self.class_num,
            conf_thresh=self.conf_thresh,
            downsample=self.downsample)
        self.inputs = dict(X=feature_map, ImgSize=image_sizes)

        expected_boxes, expected_scores = YoloBox(feature_map, image_sizes,
                                                  self.attrs)
        self.outputs = dict(Boxes=expected_boxes, Scores=expected_scores)

    def test_check_output(self):
        """Run the op and compare with ``self.outputs``."""
        self.check_output()

    def initTestCase(self):
        """Define the anchors, attributes and input shapes of this case."""
        self.anchors = [10, 13, 16, 30, 33, 23]
        self.batch_size = 32
        self.class_num = 2
        self.conf_thresh = 0.5
        self.downsample = 32
        # Each anchor is a (width, height) pair and contributes
        # 5 + class_num channels to the input.
        an_num = int(len(self.anchors) // 2)
        self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13)
        self.imgsize_shape = (self.batch_size, 2)
if __name__ == "__main__":
unittest.main()
...@@ -23,8 +23,8 @@ from op_test import OpTest ...@@ -23,8 +23,8 @@ from op_test import OpTest
from paddle.fluid import core from paddle.fluid import core
def l2loss(x, y): def l1loss(x, y):
return 0.5 * (y - x) * (y - x) return abs(x - y)
def sce(x, label): def sce(x, label):
...@@ -66,7 +66,7 @@ def batch_xywh_box_iou(box1, box2): ...@@ -66,7 +66,7 @@ def batch_xywh_box_iou(box1, box2):
return inter_area / union return inter_area / union
def YOLOv3Loss(x, gtbox, gtlabel, attrs): def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs):
n, c, h, w = x.shape n, c, h, w = x.shape
b = gtbox.shape[1] b = gtbox.shape[1]
anchors = attrs['anchors'] anchors = attrs['anchors']
...@@ -75,21 +75,21 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): ...@@ -75,21 +75,21 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs):
mask_num = len(anchor_mask) mask_num = len(anchor_mask)
class_num = attrs["class_num"] class_num = attrs["class_num"]
ignore_thresh = attrs['ignore_thresh'] ignore_thresh = attrs['ignore_thresh']
downsample = attrs['downsample'] downsample_ratio = attrs['downsample_ratio']
input_size = downsample * h use_label_smooth = attrs['use_label_smooth']
input_size = downsample_ratio * h
x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
loss = np.zeros((n)).astype('float32') loss = np.zeros((n)).astype('float32')
label_pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0
label_neg = 1.0 / class_num if use_label_smooth else 0.0
pred_box = x[:, :, :, :, :4].copy() pred_box = x[:, :, :, :, :4].copy()
grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1))
grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w))
pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w
pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h
x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:],
np.ones_like(x[:, :, :, :, 5:]) * 1.0 /
class_num)
mask_anchors = [] mask_anchors = []
for m in anchor_mask: for m in anchor_mask:
mask_anchors.append((anchors[2 * m], anchors[2 * m + 1])) mask_anchors.append((anchors[2 * m], anchors[2 * m + 1]))
...@@ -138,21 +138,22 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): ...@@ -138,21 +138,22 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs):
ty = gtbox[i, j, 1] * w - gj ty = gtbox[i, j, 1] * w - gj
tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0]) tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0])
th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1]) th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1])
scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) * gtscore[i, j]
loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale
loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale
loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale loss[i] += l1loss(x[i, an_idx, gj, gi, 2], tw) * scale
loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale loss[i] += l1loss(x[i, an_idx, gj, gi, 3], th) * scale
objness[i, an_idx * h * w + gj * w + gi] = 1.0 objness[i, an_idx * h * w + gj * w + gi] = gtscore[i, j]
for label_idx in range(class_num): for label_idx in range(class_num):
loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], label_pos
float(label_idx == gtlabel[i, j])) if label_idx == gtlabel[i, j] else
label_neg) * gtscore[i, j]
for j in range(mask_num * h * w): for j in range(mask_num * h * w):
if objness[i, j] > 0: if objness[i, j] > 0:
loss[i] += sce(pred_obj[i, j], 1.0) loss[i] += sce(pred_obj[i, j], 1.0) * objness[i, j]
elif objness[i, j] == 0: elif objness[i, j] == 0:
loss[i] += sce(pred_obj[i, j], 0.0) loss[i] += sce(pred_obj[i, j], 0.0)
...@@ -176,7 +177,8 @@ class TestYolov3LossOp(OpTest): ...@@ -176,7 +177,8 @@ class TestYolov3LossOp(OpTest):
"anchor_mask": self.anchor_mask, "anchor_mask": self.anchor_mask,
"class_num": self.class_num, "class_num": self.class_num,
"ignore_thresh": self.ignore_thresh, "ignore_thresh": self.ignore_thresh,
"downsample": self.downsample, "downsample_ratio": self.downsample_ratio,
"use_label_smooth": self.use_label_smooth,
} }
self.inputs = { self.inputs = {
...@@ -184,7 +186,14 @@ class TestYolov3LossOp(OpTest): ...@@ -184,7 +186,14 @@ class TestYolov3LossOp(OpTest):
'GTBox': gtbox.astype('float32'), 'GTBox': gtbox.astype('float32'),
'GTLabel': gtlabel.astype('int32'), 'GTLabel': gtlabel.astype('int32'),
} }
loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs)
gtscore = np.ones(self.gtbox_shape[:2]).astype('float32')
if self.gtscore:
gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32')
self.inputs['GTScore'] = gtscore
loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, gtscore,
self.attrs)
self.outputs = { self.outputs = {
'Loss': loss, 'Loss': loss,
'ObjectnessMask': objness, 'ObjectnessMask': objness,
...@@ -193,24 +202,57 @@ class TestYolov3LossOp(OpTest): ...@@ -193,24 +202,57 @@ class TestYolov3LossOp(OpTest):
def test_check_output(self): def test_check_output(self):
place = core.CPUPlace() place = core.CPUPlace()
self.check_output_with_place(place, atol=1e-3) self.check_output_with_place(place, atol=2e-3)
def test_check_grad_ignore_gtbox(self): def test_check_grad_ignore_gtbox(self):
place = core.CPUPlace() place = core.CPUPlace()
self.check_grad_with_place( self.check_grad_with_place(place, ['X'], 'Loss', max_relative_error=0.2)
place, ['X'],
'Loss', def initTestCase(self):
no_grad_set=set(["GTBox", "GTLabel"]), self.anchors = [
max_relative_error=0.3) 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
373, 326
]
self.anchor_mask = [0, 1, 2]
self.class_num = 5
self.ignore_thresh = 0.7
self.downsample_ratio = 32
self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
self.gtbox_shape = (3, 5, 4)
self.gtscore = True
self.use_label_smooth = True
class TestYolov3LossWithoutLabelSmooth(TestYolov3LossOp):
def initTestCase(self):
self.anchors = [
10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
373, 326
]
self.anchor_mask = [0, 1, 2]
self.class_num = 5
self.ignore_thresh = 0.7
self.downsample_ratio = 32
self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
self.gtbox_shape = (3, 5, 4)
self.gtscore = True
self.use_label_smooth = False
class TestYolov3LossNoGTScore(TestYolov3LossOp):
def initTestCase(self): def initTestCase(self):
self.anchors = [10, 13, 16, 30, 33, 23] self.anchors = [
self.anchor_mask = [1, 2] 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198,
373, 326
]
self.anchor_mask = [0, 1, 2]
self.class_num = 5 self.class_num = 5
self.ignore_thresh = 0.5 self.ignore_thresh = 0.7
self.downsample = 32 self.downsample_ratio = 32
self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5)
self.gtbox_shape = (3, 5, 4) self.gtbox_shape = (3, 5, 4)
self.gtscore = False
self.use_label_smooth = True
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -38,9 +38,8 @@ items. It can be any function with no parameter that creates a iterable ...@@ -38,9 +38,8 @@ items. It can be any function with no parameter that creates a iterable
Element produced from the iterable should be a **single** entry of data, Element produced from the iterable should be a **single** entry of data,
**not** a mini batch. That entry of data could be a single item, or a tuple of **not** a mini batch. That entry of data could be a single item, or a tuple of
items. items.
Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider Item should be of supported type (e.g., numpy array or list/tuple of float
/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d or int).
array of float32, int, list of int)
An example implementation for single item data reader creator: An example implementation for single item data reader creator:
...@@ -62,8 +61,6 @@ An example implementation for multiple item data reader creator: ...@@ -62,8 +61,6 @@ An example implementation for multiple item data reader creator:
yield numpy.random.uniform(-1, 1, size=width*height), label yield numpy.random.uniform(-1, 1, size=width*height), label
return reader return reader
TODO(yuyang18): Should we add whole design doc here?
""" """
import paddle.reader.decorator import paddle.reader.decorator
......
...@@ -44,8 +44,11 @@ def text_file(path): ...@@ -44,8 +44,11 @@ def text_file(path):
Creates a data reader that outputs text line by line from given text file. Creates a data reader that outputs text line by line from given text file.
Trailing new line ('\\\\n') of each line will be removed. Trailing new line ('\\\\n') of each line will be removed.
:path: path of the text file. Args:
:returns: data reader of text file path (str): path of the text file.
Returns:
callable: data reader of text file.
""" """
def reader(): def reader():
...@@ -59,10 +62,15 @@ def text_file(path): ...@@ -59,10 +62,15 @@ def text_file(path):
def recordio(paths, buf_size=100): def recordio(paths, buf_size=100):
""" """
Creates a data reader from given RecordIO file paths separated by ",", Creates a data reader from given RecordIO file paths separated
glob pattern is supported. by ",", glob pattern is supported.
:path: path of recordio files, can be a string or a string list.
:returns: data reader of recordio files. Args:
paths (str|list(str)): path of recordio files.
buf_size (int): prefetched buffer size.
Returns:
callable: data reader of recordio files.
""" """
import recordio as rec import recordio as rec
......
...@@ -242,20 +242,18 @@ class XmapEndSignal(): ...@@ -242,20 +242,18 @@ class XmapEndSignal():
def xmap_readers(mapper, reader, process_num, buffer_size, order=False): def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
""" """
Use multiprocess to map samples from reader by a mapper defined by user. Use multi-threads to map samples from reader by a mapper defined by user.
And this function contains a buffered decorator.
:param mapper: a function to map sample. Args:
:type mapper: callable mapper (callable): a function to map the data from reader.
:param reader: the data reader to read from reader (callable): a data reader which yields the data.
:type reader: callable process_num (int): thread number to handle original sample.
:param process_num: process number to handle original sample buffer_size (int): size of the queue to read data in.
:type process_num: int order (bool): whether to keep the data order from original reader.
:param buffer_size: max buffer size Default False.
:type buffer_size: int
:param order: keep the order of reader Returns:
:type order: bool callable: a decorated reader with data mapping.
:return: the decarated reader
:rtype: callable
""" """
end = XmapEndSignal() end = XmapEndSignal()
...@@ -477,7 +475,7 @@ class PipeReader: ...@@ -477,7 +475,7 @@ class PipeReader:
""" """
:param cut_lines: cut buffer to lines :param cut_lines: cut buffer to lines
:type cut_lines: bool :type cut_lines: bool
:param line_break: line break of the file, like \n or \r :param line_break: line break of the file, like '\\\\n' or '\\\\r'
:type line_break: string :type line_break: string
:return: one line or a buffer of bytes :return: one line or a buffer of bytes
......
...@@ -153,3 +153,9 @@ done ...@@ -153,3 +153,9 @@ done
# Restore LD_LIBRARY_PATH # Restore LD_LIBRARY_PATH
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}" LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
# Work around a known ar issue, see
# https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html
# Build and install a newer binutils whose ar has 64-bit archive support.
wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz
tar xzf binutils-2.27.tar.gz && cd binutils-2.27
./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install
...@@ -160,6 +160,8 @@ class Timeline(object): ...@@ -160,6 +160,8 @@ class Timeline(object):
self._devices[(k, event.device_id, "GPUKernel")] = pid self._devices[(k, event.device_id, "GPUKernel")] = pid
self._chrome_trace.emit_pid("%s:gpu:%d" % self._chrome_trace.emit_pid("%s:gpu:%d" %
(k, event.device_id), pid) (k, event.device_id), pid)
if not hasattr(profile_pb, "mem_events"):
continue
for mevent in profile_pb.mem_events: for mevent in profile_pb.mem_events:
if mevent.place == profiler_pb2.MemEvent.CUDAPlace: if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
if (k, mevent.device_id, "GPU") not in self._mem_devices: if (k, mevent.device_id, "GPU") not in self._mem_devices:
...@@ -211,7 +213,7 @@ class Timeline(object): ...@@ -211,7 +213,7 @@ class Timeline(object):
args = {'name': event.name} args = {'name': event.name}
if event.memcopy.bytes > 0: if event.memcopy.bytes > 0:
args['mem_bytes'] = event.memcopy.bytes args['mem_bytes'] = event.memcopy.bytes
if event.detail_info: if hasattr(event, "detail_info") and event.detail_info:
args['detail_info'] = event.detail_info args['detail_info'] = event.detail_info
# TODO(panyx0718): Chrome tracing only handles ms. However, some # TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here. # ops takes micro-seconds. Hence, we keep the ns here.
...@@ -220,6 +222,8 @@ class Timeline(object): ...@@ -220,6 +222,8 @@ class Timeline(object):
event.sub_device_id, 'Op', event.name, args) event.sub_device_id, 'Op', event.name, args)
def _allocate_memory_event(self): def _allocate_memory_event(self):
if not hasattr(profiler_pb2, "MemEvent"):
return
place_to_str = { place_to_str = {
profiler_pb2.MemEvent.CPUPlace: "CPU", profiler_pb2.MemEvent.CPUPlace: "CPU",
profiler_pb2.MemEvent.CUDAPlace: "GPU", profiler_pb2.MemEvent.CUDAPlace: "GPU",
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册