Commit 2d92b6be authored by: S sneaxiy

merge develop

test=develop
@@ -56,7 +56,7 @@ paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_pr
paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95'))
paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2'))
paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2'))
-paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c'))
+paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d'))
paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb'))
paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -109,7 +109,7 @@ paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name
paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
-paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f'))
+paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'f1dd22f7351f7f9853212958e0d8aa7a'))
paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
@@ -205,7 +205,7 @@ paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None,
paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e'))
paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed'))
paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f'))
-paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), ('document', '2f46f1ff39a13ab00857e7b9f44b2fa7'))
+paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9'))
paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35'))
paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007'))
paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d'))
@@ -255,6 +255,7 @@ paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=
paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc'))
paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
+paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -277,7 +278,7 @@ paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywo
paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', 'b9174d4e91505b0c8ecc193eb51e248d'))
paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a'))
paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'f29ad2478b6b2ad4f413d2936a331ea0'))
-paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '169d694d2224f62b4f3afdc3dbc19e95'))
+paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655'))
paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7'))
@@ -296,7 +297,7 @@ paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non
paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
-paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
+paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb'))
paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd'))
paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
@@ -376,23 +377,9 @@ paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args',
paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958'))
paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab'))
-paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], './checkpoints', None, None)), ('document', '31ae143830c9bf6b43547dd546c5ba80'))
-paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0'))
-paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0'))
+paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9'))
-paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6'))
-paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d'))
-paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645'))
paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f'))
paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b'))
@@ -432,48 +419,59 @@ paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'poo
paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715'))
paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe'))
paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24'))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......
@@ -9,6 +9,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
+cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
@@ -22,6 +23,8 @@ endif()
if(WITH_GPU)
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
+nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+dynload_cuda variable_visitor)
if(WITH_DISTRIBUTE)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
@@ -35,6 +38,8 @@ if(WITH_GPU)
else()
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
+cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+variable_visitor)
if(WITH_DISTRIBUTE)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor sendrecvop_rpc)
@@ -46,9 +51,7 @@ else()
cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
endif()
-cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
-cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
@@ -69,7 +72,9 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
+scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
+cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU)
@@ -98,5 +103,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
-memory_optimize_pass lock_free_optimize_pass)
+memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass)
@@ -11,9 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <algorithm>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
@@ -56,6 +55,7 @@ void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
DEFINE_uint32(fuse_parameter_memory_size, 0,  // 0 KB
              "fuse_parameter_memory_size is the upper limit of the memory "
              "size of one group of parameters' gradients, which is the input "
              "of a communication call (e.g. NCCLAllReduce). "
              "The default value is 0, which means that groups are not "
              "formed according to memory_size.");
DEFINE_int32(
fuse_parameter_groups_size, 3,
"fuse_parameter_groups_size is the number of gradient groups that are "
"fused into one group. If it is 1, each parameter's gradient keeps its "
"own group, so the number of groups equals the number of parameters' "
"gradients. If it is -1, all gradients are fused into a single group. "
"The default value of 3 is an experimental choice.");
namespace paddle {
namespace framework {
namespace details {
static const char kUnKnow[] = "@UNKNOW@";
static framework::proto::VarType::Type kDefaultDtype =
framework::proto::VarType::Type::VarType_Type_BOOL;
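// Illustrative note on the two fuse_parameter_* flags defined above: they are
// plain gflags, so (assuming the trainer binary lets gflags parse its command
// line) they could be tuned like
//   ./trainer --fuse_parameter_memory_size=131072 --fuse_parameter_groups_size=5
// where the binary name and both values are hypothetical examples, not
// recommended settings.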
class AllocContinuousSpaceForGradPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
ResetAttribute<ParamsAndGrads>(kParamsAndGrads, &result);
ResetAttribute<GroupGradsAndParams>(kGroupGradsAndParams, &result);
// NOTE: The operator nodes should be in topology order.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
for (auto &node : topo_nodes) {
RecordParamsAndGrads(node, &params_grads);
}
if (params_grads.size() == 0) {
VLOG(10) << "No gradients found.";
return std::move(graph);
}
std::unordered_map<std::string, ir::Node *> vars;
for (ir::Node *node : result.Nodes()) {
if (node->IsVar() && node->Var()) {
// Note: The graph may have the same name node. For example, parameter
// is the input of operator and it also is the output of optimizer;
vars.emplace(node->Var()->Name(), node);
}
}
auto &group_grads_params =
result.Get<GroupGradsAndParams>(kGroupGradsAndParams);
// Note: the order of params_grads may be changed by SetGroupGradsAndParams.
SetGroupGradsAndParams(vars, params_grads, &group_grads_params);
params_grads.clear();
for (auto &group_p_g : group_grads_params) {
params_grads.insert(params_grads.begin(), group_p_g.begin(),
group_p_g.end());
}
for (auto &p_g : params_grads) {
std::swap(p_g.first, p_g.second);
}
// Set Gradients as Persistable to prevent this var becoming reusable.
auto dtype = kDefaultDtype;
for (auto &p_g : params_grads) {
// Get gradient var
auto iter = vars.find(p_g.second);
PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second);
iter->second->Var()->SetPersistable(true);
PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
// Get Dtype
auto ele_dtype = iter->second->Var()->GetDataType();
if (dtype == kDefaultDtype) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype);
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
}
// Create the fused variable name.
if (!result.Has(kFusedVars)) {
result.Set(kFusedVars, new FusedVars);
}
const std::string prefix(kFusedVarNamePrefix);
// The fused_var_name should be unique.
auto fused_var_name = prefix + "GRAD@" + params_grads[0].second;
auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
fused_var_set.insert(fused_var_name);
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
fused_var_name, params_grads);
return std::move(graph);
}
template <typename AttrType>
void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const {
if (graph->Has(attr_name)) {
VLOG(10) << attr_name << " is reset.";
graph->Erase(attr_name);
}
graph->Set(attr_name, new AttrType);
}
void SetGroupGradsAndParams(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const ParamsAndGrads &params_grads,
GroupGradsAndParams *group_grads_params) const {
SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params);
SetGroupAccordingToMemorySize(var_nodes, group_grads_params);
SetGroupAccordingToGroupSize(var_nodes, group_grads_params);
}
void SetGroupAccordingToLayers(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
const ParamsAndGrads &params_grads,
GroupGradsAndParams *group_grads_params) const {
std::unordered_map<std::string, std::vector<int>> layer_params;
for (size_t i = 0; i < params_grads.size(); ++i) {
auto pos = params_grads[i].first.find_first_of(".");
if (pos == std::string::npos) {
layer_params[std::string(kUnKnow)].emplace_back(i);
} else {
layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i);
}
}
group_grads_params->reserve(layer_params.size());
for (size_t i = 0; i < params_grads.size(); ++i) {
auto pos = params_grads[i].first.find_first_of(".");
std::string key = kUnKnow;
if (pos != std::string::npos) {
key = params_grads[i].first.substr(0, pos);
}
auto iter = layer_params.find(key);
if (iter == layer_params.end()) continue;
group_grads_params->emplace_back();
auto &local_group_grads_params = group_grads_params->back();
for (auto &idx : iter->second) {
local_group_grads_params.emplace_back(
std::make_pair(params_grads[idx].second, params_grads[idx].first));
}
layer_params.erase(iter);
}
VLOG(10) << "SetGroupAccordingToLayers: ";
for (size_t i = 0; i < group_grads_params->size(); ++i) {
VLOG(10) << "group " << i;
std::stringstream out;
for (auto &p_g : group_grads_params->at(i)) {
out << "(" << p_g.second << ", " << p_g.first << "), ";
}
VLOG(10) << out.str();
}
}
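// A worked example with hypothetical parameter names: given
// params_grads = {("fc_0.w_0", grad0), ("fc_0.b_0", grad1), ("fc_1.w_0", grad2)},
// the prefix before the first '.' is used as the layer key, so the result is
// two groups: {(grad0, "fc_0.w_0"), (grad1, "fc_0.b_0")} and {(grad2, "fc_1.w_0")}.
// A parameter name that contains no '.' falls into the shared "@UNKNOW@" group.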
void SetGroupAccordingToMemorySize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
GroupGradsAndParams *group_grads_params) const {
if (FLAGS_fuse_parameter_memory_size == 0) {
return;
}
size_t group_memory_size =
static_cast<size_t>(FLAGS_fuse_parameter_memory_size);
GroupGradsAndParams local_group_grads_params;
size_t j = 0;
while (j < group_grads_params->size()) {
local_group_grads_params.emplace_back();
auto &group_p_g = local_group_grads_params.back();
size_t local_group_memory_size = 0;
while (j < group_grads_params->size()) {
std::for_each(
group_grads_params->at(j).begin(), group_grads_params->at(j).end(),
[&local_group_memory_size,
&var_nodes](const std::pair<std::string, std::string> &g_p) {
auto iter = var_nodes.find(g_p.second);
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.",
g_p.second);
auto shape = iter->second->Var()->GetShape();
size_t size =
framework::SizeOfType(iter->second->Var()->GetDataType());
std::for_each(shape.begin(), shape.end(),
[&size](const int64_t &n) { size *= n; });
local_group_memory_size += size;
});
group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
group_grads_params->at(j).end());
++j;
if (local_group_memory_size >= group_memory_size) {
break;
}
}
}
std::swap(*group_grads_params, local_group_grads_params);
VLOG(10) << string::Sprintf(
"SetGroupAccordingToMemorySize(memory_size: %d):",
FLAGS_fuse_parameter_memory_size);
for (size_t i = 0; i < group_grads_params->size(); ++i) {
VLOG(10) << "group " << i;
std::stringstream out;
for (auto &g_p : group_grads_params->at(i)) {
auto iter = var_nodes.find(g_p.second);
PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second);
auto shape = iter->second->Var()->GetShape();
size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
std::for_each(shape.begin(), shape.end(),
[&size](const int64_t &n) { size *= n; });
out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first);
}
VLOG(10) << out.str();
}
}
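// For example (hypothetical sizes): with FLAGS_fuse_parameter_memory_size = N
// and three layer groups whose gradient sizes are 0.4N, 0.7N and 0.2N, the
// first two are merged into one group (0.4N + 0.7N >= N) and the third starts
// a new one, giving two groups. When the flag is 0 this step is skipped.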
void SetGroupAccordingToGroupSize(
const std::unordered_map<std::string, ir::Node *> &var_nodes,
GroupGradsAndParams *group_grads_params) const {
if (FLAGS_fuse_parameter_groups_size == 1) {
return;
}
size_t group_size = static_cast<size_t>(FLAGS_fuse_parameter_groups_size);
if (FLAGS_fuse_parameter_groups_size == -1) {
group_size = group_grads_params->size();
}
PADDLE_ENFORCE_GT(group_size, 1);
size_t groups = (group_grads_params->size() + group_size - 1) / group_size;
GroupGradsAndParams local_group_grads_params;
local_group_grads_params.reserve(groups);
size_t j = 0;
for (size_t i = 0; i < groups; ++i) {
local_group_grads_params.emplace_back();
auto &group_p_g = local_group_grads_params.back();
group_p_g.reserve(group_size);
while (j < group_grads_params->size()) {
group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
group_grads_params->at(j).end());
++j;
if (j % group_size == 0) break;
}
}
std::swap(*group_grads_params, local_group_grads_params);
VLOG(10) << "SetGroupAccordingToGroupSize(group_size: " << group_size
<< "): ";
for (size_t i = 0; i < group_grads_params->size(); ++i) {
VLOG(10) << "group " << i;
std::stringstream out;
for (auto &p_g : group_grads_params->at(i)) {
out << "(" << p_g.second << ", " << p_g.first << "), ";
}
VLOG(10) << out.str();
}
}
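// For example: with FLAGS_fuse_parameter_groups_size = 3 and 7 groups left by
// the previous steps, (7 + 3 - 1) / 3 = 3 final groups are built containing
// 3, 3 and 1 of the original groups; with -1 everything collapses into a
// single group, and with 1 this step is skipped.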
private:
bool IsSupportedVarType(const proto::VarType::Type &type) const {
// Currently only LOD_TENSOR is supported.
return type == proto::VarType::LOD_TENSOR;
}
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
op_desc->SetOutput("FusedOutput", {fused_var_name});
}
void RecordParamsAndGrads(ir::Node *node,
ParamsAndGrads *params_grads) const {
try {
bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) return;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
auto backward_vars =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
for (size_t i = 0; i < backward_vars.size(); i += 2) {
VLOG(10) << "Trainable parameter: " << backward_vars[i]
<< ", gradient: " << backward_vars[i + 1];
params_grads->emplace_back(std::make_pair(
backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
}
} catch (boost::bad_get e) {
}
}
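// The OpRoleVar attribute read above is a flat, even-length list of
// parameter/gradient names, e.g. {"w_0", "w_0@GRAD", "b_0", "b_0@GRAD"}
// (illustrative names only), which is why it is walked in steps of two.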
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::unordered_map<std::string, ir::Node *> &vars,
const std::string &fused_var_name,
const ParamsAndGrads &params_grads) const {
// Init Gradients and FusedVars
VLOG(10) << "Init FusedVars and Gradients.";
for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
auto &scope = *it;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has existed in scope.", fused_var_name);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
for (auto &p_g : params_grads) {
auto iter = vars.find(p_g.second);
PADDLE_ENFORCE(iter != vars.end());
PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
proto::VarType::LOD_TENSOR);
scope->Var(p_g.second)->GetMutable<LoDTensor>();
}
}
std::vector<std::string> grads_name;
std::vector<std::string> params_name;
grads_name.reserve(params_grads.size());
params_name.reserve(params_grads.size());
for (auto &p_g : params_grads) {
params_name.emplace_back(p_g.first);
grads_name.emplace_back(p_g.second);
}
framework::ProgramDesc program_desc;
AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
program_desc.MutableBlock(0));
// Run Only Once Programs
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]);
}
}
}
};
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(alloc_continuous_space_for_grad_pass,
paddle::framework::details::AllocContinuousSpaceForGradPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
...@@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase {
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return true; };
protected:
void RunImpl() override;
......
...@@ -46,7 +46,16 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) {
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
if (strategy_.enable_sequential_execution_) {
VLOG(10) << "Add sequential_execution_pass";
AppendPass("sequential_execution_pass");
}
...@@ -57,6 +66,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) {
VLOG(10) << "Add fuse_relu_depthwise_conv_pass";
AppendPass("fuse_relu_depthwise_conv_pass");
}
...@@ -68,29 +78,30 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Add automatically inplace.
if (strategy_.enable_inplace_) {
VLOG(10) << "Add inplace_pass";
AppendPass("inplace_pass");
}
if (strategy.fuse_elewise_add_act_ops_) {
VLOG(10) << "Add fuse_elewise_add_act_pass";
AppendPass("fuse_elewise_add_act_pass");
}
// for single card training, fuse_all_reduce_ops is unnecessary.
// alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
if (strategy.fuse_all_reduce_ops_) {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
}
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
CollectiveContext *context = CollectiveContext::GetInstance();
context->endpoints_ = strategy_.trainers_endpoints_;
context->trainer_id_ = strategy_.trainer_id_;
...@@ -108,11 +119,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// A side-effect of that, memory optimize cannot forsee the fetched vars
// , so fetchlist should be set persistable before call the Run interface.
if (strategy.memory_optimize_) {
VLOG(10) << "Add memory_optimize_pass";
AppendPass("memory_optimize_pass");
}
AppendMultiDevPass(strategy);
if (strategy.fuse_all_reduce_ops_) {
// NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
// first, if the number is zero, fuse_all_reduce_ops will do nothing.
VLOG(10) << "Add fuse_all_reduce_op_pass";
AppendPass("fuse_all_reduce_op_pass");
}
// Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) {
auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
...@@ -128,28 +147,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
if (SeqOnlyAllReduceOps(strategy)) {
VLOG(10) << "Add all_reduce_deps_pass";
AppendPass("all_reduce_deps_pass");
}
if (strategy_.remove_unnecessary_lock_) {
VLOG(10) << "Add modify_op_lock_and_record_event_pass";
AppendPass("modify_op_lock_and_record_event_pass");
}
}
// Convert graph to run on multi-devices.
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass = nullptr;
if (strategy_.is_distribution_) {
VLOG(10) << "Add dist_multi_devices_pass";
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
VLOG(10) << "Add all_reduce_mode_multi_devices_pass";
multi_devices_pass =
AppendPass("all_reduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
VLOG(10) << "Add reduce_mode_multi_devices_pass";
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else {
PADDLE_THROW("Unknown reduce strategy.");
...@@ -206,9 +231,26 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
} else if (pass->Type() == "fuse_all_reduce_op_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
} else if (pass->Type() == "sequential_execution_pass") {
LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
...@@ -239,7 +281,7 @@ USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(all_reduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
...@@ -249,4 +291,6 @@ USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(alloc_continuous_space_for_grad_pass);
USE_PASS(graph_to_program_pass);
USE_PASS(fuse_all_reduce_op_pass);
...@@ -16,6 +16,7 @@
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
...@@ -75,6 +76,8 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
bool fuse_all_reduce_ops_{false};
bool fuse_relu_depthwise_conv_{false};
bool sync_batch_norm_{false};
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
namespace paddle {
namespace framework {
namespace details {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
DataBalanceOpHandle::DataBalanceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
if (ctxs) {
for (auto &p : places_) {
this->SetDeviceContext(p, ctxs->DevCtx(p));
}
}
}
#else
DataBalanceOpHandle::DataBalanceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
std::string DataBalanceOpHandle::Name() const { return "data balance"; }
std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
const std::vector<int> &device_sizes) {
int device_num = device_sizes.size();
int total_size = 0;
int empty_num = 0;
std::vector<std::array<int, 2>> size_device_vec;
size_device_vec.reserve(device_num);
for (int i = 0; i < device_num; ++i) {
if (device_sizes[i] == 0) {
++empty_num;
}
total_size += device_sizes[i];
size_device_vec.push_back({{device_sizes[i], i}});
}
std::vector<std::array<int, 3>> res;
if (empty_num == 0) {
// No need to do data balance.
return res;
}
if (total_size < device_num) {
// Not enough data.
PADDLE_THROW_EOF();
}
std::sort(size_device_vec.begin(), size_device_vec.end(),
[](const std::array<int, 2> &a, const std::array<int, 2> &b) {
return a[0] > b[0];
});
int expected_device_size = total_size / device_num;
int src_idx = 0;
for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
if (size_device_vec[src_idx][0] <= expected_device_size) {
++src_idx;
PADDLE_ENFORCE_LT(
src_idx, device_num - empty_num,
"In the current strategy an empty tensor should not be a copy source.");
}
size_device_vec[src_idx][0] -= expected_device_size;
size_device_vec[dst_idx][0] += expected_device_size;
res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
expected_device_size}});
}
return res;
}
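// A small worked example with hypothetical sizes: device_sizes = {4, 2, 0}
// gives total_size = 6, expected_device_size = 2 and one empty device, so the
// plan holds a single move {src_dev = 0, dst_dev = 2, size = 2}, after which
// every device holds 2 instances.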
void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1UL,
"Data balance can only be enabled when the number of "
"places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
int data_num = in_var_handles.size() / places_.size();
WaitInputVarGenerated();
std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
std::vector<int> device_sizes;
for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
int place_idx = i / data_num;
int data_idx = i % data_num;
auto *local_scope =
local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
auto *tensor = tensor_var->GetMutable<LoDTensor>();
lod_tensors[data_idx].push_back(tensor);
int ins_size =
tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
if (data_idx == 0) {
device_sizes.emplace_back(ins_size);
} else {
PADDLE_ENFORCE_EQ(
ins_size, device_sizes.at(place_idx),
"All data on the same device shall have the same batch size.");
}
}
const auto &balance_plan = GetBalancePlan(device_sizes);
for (const auto &trans : balance_plan) {
for (int data_idx = 0; data_idx < data_num; ++data_idx) {
LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
int trans_ins_size = trans[2];
LoD src_lod = src_tensor->lod();
int src_ins_size =
src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
int cut_point = src_ins_size - trans_ins_size;
if (!src_lod.empty()) {
for (auto &level : src_lod) {
cut_point = level[cut_point];
}
}
TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
dst_tensor->place(), dst_tensor);
src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
if (!src_lod.empty()) {
dst_tensor->set_lod(SliceInLevel(
src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
src_tensor->set_lod(
SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
}
}
}
}
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
}
}
bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
std::string FetchOpHandle::Name() const { return "Fetch"; }
} // namespace details
......
...@@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase {
std::string Name() const override;
bool IsMultiDeviceTransfer() override;
protected:
void RunImpl() override;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
class FuseAllReduceOpPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_ctxs = &Get<platform::NCCLContextMap>(kNCCLCtxs);
#endif
std::unordered_set<std::string> grads;
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
size_t num_of_all_reduce = params_grads.size();
grads.reserve(num_of_all_reduce);
for (auto p_g : params_grads) {
grads.insert(p_g.second);
}
size_t num_place = places.size();
std::unordered_map<std::string, ir::Node *> all_reduce_ops;
all_reduce_ops.reserve(grads.size());
for (auto &node : result.Nodes()) {
if (node->IsOp()) {
PADDLE_ENFORCE(node->IsWrappedBy<OpHandleBase>());
auto *all_reduce_op_handle =
dynamic_cast<AllReduceOpHandle *>(&node->Wrapper<OpHandleBase>());
if (all_reduce_op_handle) {
auto inputs = DynamicCast<VarHandle>(all_reduce_op_handle->Inputs());
PADDLE_ENFORCE_EQ(inputs.size(), num_place);
// The inputs' name should be the same.
auto &grad_name = inputs[0]->name();
for (size_t i = 1; i < inputs.size(); ++i) {
PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
"The input name should be the same.");
}
PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
all_reduce_ops.emplace(grad_name, node);
}
}
}
VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
if (all_reduce_ops.size() == 0) {
return std::move(graph);
}
PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
"The number of all_reduce OpHandle is not equal to the "
"number of grads. Maybe some gradients are sparse type, "
"it is not supported currently.");
VLOG(10) << "Insert fused_all_reduce";
auto &group_grads_params =
graph->Get<GroupGradsAndParams>(kGroupGradsAndParams);
for (auto &group_g_p : group_grads_params) {
size_t group_size = group_g_p.size();
PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
std::vector<ir::Node *> group_all_reduce_ops;
group_all_reduce_ops.reserve(group_size);
for (auto &g_p : group_g_p) {
group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first));
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, nccl_ctxs, &result);
#else
InsertFusedAllReduce(places, local_scopes, group_size,
group_all_reduce_ops, &result);
#endif
}
return std::move(graph);
}
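// Put differently, ApplyImpl replaces the per-gradient all_reduce op handles
// with one fused_all_reduce op handle per gradient group; e.g. 100 gradients
// grouped into 4 groups end up as 4 fused_all_reduce ops (numbers are only an
// illustration).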
void InsertFusedAllReduce(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const size_t num_of_all_reduce,
const std::vector<ir::Node *> &all_reduce_ops,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs,
#endif
ir::Graph *result) const {
std::vector<VarHandleBase *> inputs;
std::vector<VarHandleBase *> outputs;
for (auto &op : all_reduce_ops) {
auto &op_handle = op->Wrapper<OpHandleBase>();
inputs.insert(inputs.end(), op_handle.Inputs().begin(),
op_handle.Inputs().end());
// Remove output
for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(),
[&op_handle](VarHandleBase *var_handle) {
var_handle->RemoveOutput(&op_handle, op_handle.Node());
});
outputs.insert(outputs.end(), op_handle.Outputs().begin(),
op_handle.Outputs().end());
// Remove Input
for_each(
op_handle.Outputs().begin(), op_handle.Outputs().end(),
[](VarHandleBase *var_handle) { var_handle->ClearGeneratedOp(); });
result->RemoveNode(op_handle.Node());
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, nccl_ctxs, result);
#else
CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
local_scopes, result);
#endif
}
private:
void CreateFusedAllReduceOp(const std::vector<VarHandleBase *> &inputs,
const std::vector<VarHandleBase *> &outputs,
const size_t num_of_all_reduce,
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs,
#endif
ir::Graph *result) const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *op_handle = new FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
local_scopes, places, num_of_all_reduce, nccl_ctxs);
#else
auto *op_handle = new FusedAllReduceOpHandle(
result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
local_scopes, places, num_of_all_reduce);
#endif
for (auto in : inputs) {
op_handle->AddInput(in);
}
for (auto out : outputs) {
op_handle->AddOutput(out);
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (!nccl_ctxs) {
SetCommunicationContext(places, op_handle);
}
#else
SetCommunicationContext(places, op_handle);
#endif
}
void SetCommunicationContext(const std::vector<platform::Place> &places,
FusedAllReduceOpHandle *op_handle) const {
for (size_t i = 0; i < places.size(); ++i) {
op_handle->SetDeviceContext(
places[i], platform::DeviceContextPool::Instance().Get(places[i]));
}
}
};
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_all_reduce_op_pass,
paddle::framework::details::FuseAllReduceOpPass);
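// Note: this pass reads the kParamsAndGrads and kGroupGradsAndParams
// attributes produced by alloc_continuous_space_for_grad_pass, so the build
// strategy above runs that pass (and the multi-devices pass that creates the
// per-gradient all_reduce op handles) before fuse_all_reduce_op_pass.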
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
void FuseVarsOpHandle::RunImpl() {
WaitInputVarGenerated(place_);
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto out_var_handle = out_var_handles[0];
auto out_var = scope->Var(out_var_handle->name());
auto out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
int64_t s = 0;
for (size_t i = 1; i < out_var_handles.size(); ++i) {
auto out_name = out_var_handles[i]->name();
auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
auto numel = this->inputs_numel_.at(out_name);
out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
s += numel;
}
this->RunAndRecordEvent([] {});
}
std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct FuseVarsOpHandle : public OpHandleBase {
public:
FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
const platform::Place &place,
const std::unordered_map<std::string, int64_t> &inputs_numel,
const proto::VarType::Type var_type)
: OpHandleBase(node),
local_scope_(local_scope),
place_(place),
inputs_numel_(inputs_numel),
type_(var_type) {
total_numel_ = 0;
for (auto in_numel : inputs_numel) {
PADDLE_ENFORCE_GT(in_numel.second, 0);
total_numel_ += in_numel.second;
}
}
std::string Name() const override;
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
private:
Scope *local_scope_;
const platform::Place place_;
const std::unordered_map<std::string, int64_t> inputs_numel_;
const proto::VarType::Type type_;
int64_t total_numel_;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(skip_fused_all_reduce_check, false,
"Whether to skip checking that the fused gradients keep the same "
"order on every device.");
namespace paddle {
namespace framework {
namespace details {
typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
const platform::NCCLContextMap *ctxs)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
num_of_all_reduce_(num_of_all_reduce),
nccl_ctxs_(ctxs) {
if (nccl_ctxs_) {
for (auto &p : places_) {
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
}
}
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#else
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#endif
void FusedAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
VLOG(4) << this->DebugString();
WaitInputVarGenerated();
// The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
// The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)...
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), place_num * num_of_all_reduce_,
"The NoDummyInputSize should be equal to the number of places times "
"the number of fused all_reduce ops.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
GradientAndLoDTensor grads_tensor;
grads_tensor.resize(place_num);
int64_t numel = -1;
auto dtype = static_cast<framework::proto::VarType::Type>(0);
for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
auto &g_tensor = grads_tensor.at(scope_idx);
g_tensor.reserve(num_of_all_reduce_);
GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor);
int64_t element_num = 0;
framework::proto::VarType::Type ele_dtype =
static_cast<framework::proto::VarType::Type>(0);
GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
if (numel == -1) {
numel = element_num;
}
if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype,
static_cast<framework::proto::VarType::Type>(0));
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
// Check whether the address space is contiguous.
std::sort(
g_tensor.begin(), g_tensor.end(),
[](const std::pair<std::string, const LoDTensor *> &grad1,
const std::pair<std::string, const LoDTensor *> &grad2) -> bool {
return grad1.second->data<void>() < grad2.second->data<void>();
});
for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data<void>();
int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = len * framework::SizeOfType(dtype);
void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data<void>();
VLOG(10) << string::Sprintf(
"Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer "
"input[%d] address: 0X%02x. The offset: %d",
k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
next_address, k, infer_next_address, offset);
PADDLE_ENFORCE_EQ(infer_next_address, next_address,
"The address is not consistent.");
}
}
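// To make the contiguity check above concrete: if one gradient chunk holds
// 1024 fp32 elements starting at address P, the next chunk is expected to
// start exactly at P + 1024 * 4 bytes; any gap or overlap fails the enforce.
// (The size and address are illustrative.)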
if (!FLAGS_skip_fused_all_reduce_check) {
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
for (size_t j = 1; j < num_of_all_reduce_; ++j) {
PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first);
}
}
}
std::vector<const void *> lod_tensor_data;
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
lod_tensor_data.emplace_back(data);
}
if (platform::is_gpu_place(places_[0])) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(nccl_dtype),
ncclSum, comm, stream));
});
}
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when manage NCCL by per thread per device
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
}
});
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {
// Special handling for the gradients of CPU-only operators, e.g. CRF.
auto grad_name = grads_tensor.at(0).at(0).first;
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(grad_name)
->GetMutable<framework::LoDTensor>();
// Reduce All data to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(grad_name);
auto *dev_ctx = dev_ctxes_.at(p);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
}
}
}
void FusedAllReduceOpHandle::GetGradLoDTensor(
const size_t &scope_idx, const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles,
std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
auto *local_scope =
local_scopes_.at(scope_idx)->FindVar(kLocalExecScopeName)->Get<Scope *>();
size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
}
}
void FusedAllReduceOpHandle::GetDTypeAndNumel(
const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
proto::VarType::Type *dtype, int64_t *numel) const {
*numel = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
*numel += len;
// Get dtype
auto ele_type = grad_tensor.at(i).second->type();
if (i == 0) {
*dtype = ele_type;
}
PADDLE_ENFORCE_EQ(ele_type, *dtype);
}
}
std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
...@@ -15,6 +15,7 @@
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
...@@ -27,31 +28,47 @@ namespace paddle {
namespace framework {
namespace details {
struct FusedAllReduceOpHandle : public OpHandleBase {
public:
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
FusedAllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const size_t num_of_all_reduce,
const platform::NCCLContextMap *ctxs);
#else
FusedAllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const size_t num_of_all_reduce);
#endif
std::string Name() const override;
// Delay and buffer nccl_all_reduce together can significantly increase
// performance. Disable this feature by returning false.
bool IsMultiDeviceTransfer() override { return true; };
protected:
void RunImpl() override;
private:
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
size_t num_of_all_reduce_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const platform::NCCLContextMap *nccl_ctxs_;
#endif
// Check the dtype of the input
void GetDTypeAndNumel(
const std::vector<std::pair<std::string, const LoDTensor *>> &g_tensor,
proto::VarType::Type *dtype, int64_t *total_num) const;
// Get gradient's name and LoDTensor
void GetGradLoDTensor(const size_t &scope_idx,
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles,
std::vector<std::pair<std::string, const LoDTensor *>>
*grad_tensor) const;
};
} // namespace details
......
...@@ -11,18 +11,19 @@ ...@@ -11,18 +11,19 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
...@@ -134,21 +135,26 @@ void AddOutputToLeafOps(ir::Graph *graph) { ...@@ -134,21 +135,26 @@ void AddOutputToLeafOps(ir::Graph *graph) {
} }
} // namespace } // namespace
void MultiDevSSAGraphBuilderBase::CheckGraph(const ir::Graph &graph) const {}
void MultiDevSSAGraphBuilderBase::Init() const { void MultiDevSSAGraphBuilderBase::Init() const {
all_vars_.clear(); all_vars_.clear();
loss_var_name_ = Get<const std::string>(kLossVarName); loss_var_name_ = Get<const std::string>(kLossVarName);
VLOG(10) << "Init MultiDevSSAGraphBuilder, loss name: " << loss_var_name_;
places_ = Get<const std::vector<platform::Place>>(kPlaces); places_ = Get<const std::vector<platform::Place>>(kPlaces);
local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes); local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
strategy_ = Get<const BuildStrategy>(kStrategy); strategy_ = Get<const BuildStrategy>(kStrategy);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs"); nccl_ctxs_ = &Get<platform::NCCLContextMap>(kNCCLCtxs);
#endif #endif
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
} }
std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
Init(); Init();
CheckGraph(*graph);
std::vector<ir::Node *> sorted_ops = SortOperations(*graph); std::vector<ir::Node *> sorted_ops = SortOperations(*graph);
auto nodes = graph->ReleaseNodes(); auto nodes = graph->ReleaseNodes();
...@@ -166,7 +172,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -166,7 +172,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
result.Set(kGraphOps, new GraphOps); result.Set(kGraphOps, new GraphOps);
bool is_forwarding = true; bool is_forwarding = true;
bool insert_collection_ops = NeedCollectiveOps();
for (ir::Node *node : sorted_ops) { for (ir::Node *node : sorted_ops) {
if (DealWithSpecialOp(&result, node)) { if (DealWithSpecialOp(&result, node)) {
...@@ -185,8 +190,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -185,8 +190,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
CreateComputationalOps(&result, node, places_.size()); CreateComputationalOps(&result, node, places_.size());
} }
// Insert collection ops // Insert collective ops if nranks > 1
if (!is_forwarding && insert_collection_ops) { if (!is_forwarding && Get<size_t>(kNRanks) > 1) {
try { try {
bool is_bk_op = bool is_bk_op =
static_cast<bool>(boost::get<int>(node->Op()->GetAttr( static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
...@@ -200,13 +205,13 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -200,13 +205,13 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr( boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName())); OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
for (size_t i = 0; i < backward_vars.size(); i += 2) { for (size_t i = 0; i < backward_vars.size(); i += 2) {
auto &p_name = backward_vars[i]; auto &p_name = backward_vars[i];
auto &g_name = backward_vars[i + 1]; auto &g_name = backward_vars[i + 1];
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
if (NeedCollectiveForGrad(g_name, sorted_ops)) {
InsertCollectiveOp(&result, p_name, g_name); InsertCollectiveOp(&result, p_name, g_name);
}
} }
} catch (boost::bad_get e) { } catch (boost::bad_get e) {
} }
...@@ -226,6 +231,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -226,6 +231,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
* Only variables should be the leaves of graph. * Only variables should be the leaves of graph.
*/ */
AddOutputToLeafOps(&result); AddOutputToLeafOps(&result);
result.Erase(kGraphOps); result.Erase(kGraphOps);
return graph; return graph;
} }
...@@ -258,6 +264,11 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( ...@@ -258,6 +264,11 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
} }
} }
bool MultiDevSSAGraphBuilderBase::DealWithSpecialOp(ir::Graph *result,
ir::Node *node) const {
return false;
}
std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations( std::vector<ir::Node *> MultiDevSSAGraphBuilderBase::SortOperations(
const ir::Graph &graph) const { const ir::Graph &graph) const {
return ir::TopologySortOperations(graph); return ir::TopologySortOperations(graph);
...@@ -271,8 +282,20 @@ bool MultiDevSSAGraphBuilderBase::UseGPU() const { ...@@ -271,8 +282,20 @@ bool MultiDevSSAGraphBuilderBase::UseGPU() const {
return use_gpu; return use_gpu;
} }
bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad(
return Get<size_t>(kNRanks) > 1; const std::string &grad_name, std::vector<ir::Node *> ops) const {
// If there is already an allreduce op for the current gradient variable in
// the graph, we do not need to add another allreduce_op_handle for it.
// NOTE: this covers the case where collective ops are added for all gradients.
for (auto *node : ops) {
if (node->Op()->Type() != "allreduce") continue;
for (auto in_name : node->Op()->InputArgumentNames()) {
if (in_name == grad_name) {
return false;
}
}
}
return true;
} }
void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result,
...@@ -496,20 +519,17 @@ VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, ...@@ -496,20 +519,17 @@ VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result,
} }
bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const {
return boost::get<int>( return !loss_var_name_.empty() && node->Op() &&
boost::get<int>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
(static_cast<int>(OpRole::kBackward) | (static_cast<int>(OpRole::kBackward) |
static_cast<int>(OpRole::kLoss)) && static_cast<int>(OpRole::kLoss));
!loss_var_name_.empty(); // If loss_var is empty. This is test mode
} }
bool MultiDevSSAGraphBuilderBase::IsSparseGradient( bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
const std::string &og) const { const std::string &og) const {
PADDLE_ENFORCE(all_vars_.count(og) != 0); PADDLE_ENFORCE(all_vars_.count(og) != 0);
if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS;
return true;
}
return false;
} }
void AllReduceSSAGraphBuilder::InsertCollectiveOp( void AllReduceSSAGraphBuilder::InsertCollectiveOp(
...@@ -995,7 +1015,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { ...@@ -995,7 +1015,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass,
paddle::framework::details::ReduceSSAGraphBuilder); paddle::framework::details::ReduceSSAGraphBuilder);
REGISTER_MULTI_DEVICES_PASS( REGISTER_MULTI_DEVICES_PASS(
allreduce_mode_multi_devices_pass, all_reduce_mode_multi_devices_pass,
paddle::framework::details::AllReduceSSAGraphBuilder); paddle::framework::details::AllReduceSSAGraphBuilder);
REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass,
paddle::framework::details::DistSSAGraphBuilder); paddle::framework::details::DistSSAGraphBuilder);
...@@ -14,7 +14,10 @@ ...@@ -14,7 +14,10 @@
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -31,12 +34,6 @@ namespace framework { ...@@ -31,12 +34,6 @@ namespace framework {
class Scope; class Scope;
namespace details { namespace details {
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass { class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
...@@ -44,18 +41,21 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { ...@@ -44,18 +41,21 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
virtual void Init() const; virtual void Init() const;
virtual void CheckGraph(const ir::Graph &graph) const;
virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const; virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const = 0; const std::string &g_name) const = 0;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const = 0; virtual void InsertPostprocessOps(ir::Graph *result) const = 0;
bool UseGPU() const; bool UseGPU() const;
bool NeedCollectiveOps() const; bool NeedCollectiveForGrad(const std::string &grad_name,
std::vector<ir::Node *> ops) const;
bool IsScaleLossOp(ir::Node *node) const; bool IsScaleLossOp(ir::Node *node) const;
...@@ -109,10 +109,6 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { ...@@ -109,10 +109,6 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const; const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
return false;
}
virtual void InsertPostprocessOps(ir::Graph *result) const {} virtual void InsertPostprocessOps(ir::Graph *result) const {}
}; };
......
...@@ -16,6 +16,9 @@ ...@@ -16,6 +16,9 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
...@@ -44,6 +47,26 @@ const char kGraphVars[] = "vars"; ...@@ -44,6 +47,26 @@ const char kGraphVars[] = "vars";
typedef std::unordered_set<VarHandleBase *> GraphDepVars; typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars"; const char kGraphDepVars[] = "dep_vars";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
typedef std::unordered_set<std::string> FusedVars;
constexpr char kFusedVars[] = "fused_vars";
typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
constexpr char kParamsAndGrads[] = "params_grads";
typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupGradsAndParams;
constexpr char kGroupGradsAndParams[] = "group_grads_params";
constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
#include <map> #include <map>
#include <unordered_set>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() { ...@@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() {
void OpHandleBase::Run(bool use_cuda) { void OpHandleBase::Run(bool use_cuda) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (events_.empty() && use_cuda) { if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) {
for (auto &p : dev_ctxes_) { for (auto &p : dev_ctxes_) {
int dev_id = boost::get<platform::CUDAPlace>(p.first).device; int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE( PADDLE_ENFORCE(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
} }
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
int dev_id =
boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
out_var_handle->SetGenerateEvent(events_[dev_id]);
}
}
} else {
PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
"%s should have only one dev_ctx.", Name());
auto &place = dev_ctxes_.begin()->first;
int dev_id = boost::get<platform::CUDAPlace>(place).device;
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
PADDLE_ENFORCE(
platform::is_same_place(place, out_var_handle->place()),
"The place of input(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name());
out_var_handle->SetGenerateEvent(events_[dev_id]);
}
}
}
} }
#else #else
PADDLE_ENFORCE(!use_cuda); PADDLE_ENFORCE(!use_cuda);
#endif #endif
...@@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) { ...@@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
void OpHandleBase::WaitInputVarGenerated() { void OpHandleBase::WaitInputVarGenerated() {
for (auto in_var : inputs_) { for (auto in_var : inputs_) {
if (NeedWait(in_var)) { if (NeedWait(in_var)) {
for (auto &pair : dev_ctxes_) { // Dummy Variable is used to represent dependencies between operators,
in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second); // so no event is recorded for it.
auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
if (in_var_handle) {
auto &place = in_var_handle->place();
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream();
PADDLE_ENFORCE(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
#endif
}
// There is nothing to do when the place is CPUPlace.
} }
} }
} }
} }
void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
for (auto *in : inputs_) { for (auto in_var : inputs_) {
if (NeedWait(in)) { if (NeedWait(in_var)) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place)); // Dummy Variable is used to represent dependencies between operators,
// so no event is recorded for it.
auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
if (in_var_handle) {
if (platform::is_gpu_place(in_var_handle->place())) {
#ifdef PADDLE_WITH_CUDA
auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place()))
->stream();
PADDLE_ENFORCE(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
#endif
}
// There is nothing to do when the place is CPUPlace.
}
} }
} }
} }
......
...@@ -53,6 +53,31 @@ struct ReduceLoDTensor { ...@@ -53,6 +53,31 @@ struct ReduceLoDTensor {
} }
}; };
struct ReduceBufferData {
const std::vector<const void *> &src_data_;
void *dst_data_;
int64_t numel_;
ReduceBufferData(const std::vector<const void *> &src, void *dst,
int64_t numel)
: src_data_(src), dst_data_(dst), numel_(numel) {}
template <typename T>
void apply() const {
T *dst_data = reinterpret_cast<T *>(dst_data_);
for (size_t i = 0; i < src_data_.size(); ++i) {
auto src_data = reinterpret_cast<const T *>(src_data_[i]);
VLOG(10) << "dst: " << dst_data_ << ", " << src_data;
if (src_data == dst_data) {
continue;
}
std::transform(src_data, src_data + numel_, dst_data, dst_data,
[](T a, T b) -> T { return a + b; });
}
}
};
inline void GatherLocalSelectedRows( inline void GatherLocalSelectedRows(
const std::vector<const SelectedRows *> &src_selecte_rows_, const std::vector<const SelectedRows *> &src_selecte_rows_,
const std::vector<platform::Place> &in_places, const std::vector<platform::Place> &in_places,
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -27,62 +26,49 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( ...@@ -27,62 +26,49 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
: graph_(graph), : graph_(graph),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr), : nullptr),
prepare_pool_(1),
local_scopes_(local_scopes), local_scopes_(local_scopes),
places_(places), places_(places),
fetch_ctxs_(places), fetch_ctxs_(places),
running_ops_(0), strategy_(strategy) {
strategy_(strategy) {} PrepareOpDeps();
CopyOpDeps();
}
FeedFetchList ThreadedSSAGraphExecutor::Run( FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) { const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event( std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare")); new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
std::unordered_map<OpHandleBase *, size_t> pending_ops; std::unique_ptr<OpDependentData> op_deps = op_deps_futures_.get();
std::unordered_set<VarHandleBase *> pending_vars; CopyOpDeps();
auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>(); VLOG(10) << "ThreadedSSAGraphExecutor::Run";
std::unordered_set<OpHandleBase *> ready_ops; std::shared_ptr<BlockingQueue<VarHandleBase *>> ready_vars(
new BlockingQueue<VarHandleBase *>);
auto &pending_ops = op_deps->pending_ops_;
auto &pending_vars = op_deps->pending_vars_;
auto &ready_ops = op_deps->ready_ops_;
// For ops (e.g. nccl_all_reduce) that need to coordinate multiple // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
// streams from multiple GPUs, it's faster to buffer them and schedule // streams from multiple GPUs, it's faster to buffer them and schedule
// together since we currently cannot overlap computation and memcpy streams. // together since we currently cannot overlap computation and memcpy streams.
// Should revisit it if overlapping is available. // Should revisit it if overlapping is available.
std::unordered_set<OpHandleBase *> delayed_ops; std::unordered_set<OpHandleBase *> delayed_ops;
// Transform SSAGraph to pending_ops & pending_vars
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
InsertPendingVar(&pending_vars, ready_vars.get(), version_pair);
}
}
}
for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
InsertPendingVar(&pending_vars, ready_vars.get(), var);
}
for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
if (op->Inputs().empty()) { // Special case, Op has no input.
ready_ops.insert(op);
} else {
InsertPendingOp(&pending_ops, op);
}
}
// Step 2. Insert FetchOps // Step 2. Insert FetchOps
std::vector<FetchOpHandle *> fetch_ops; std::vector<FetchOpHandle *> fetch_ops;
std::unordered_set<VarHandleBase *> fetch_dependencies; std::unordered_set<VarHandleBase *> fetch_dependencies;
FeedFetchList fetch_data(fetch_tensors.size()); FeedFetchList fetch_data(fetch_tensors.size());
InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops,
&pending_vars, ready_vars.get(), &fetch_data); &pending_ops, &pending_vars, &fetch_data);
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) { auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) { for (auto *op : set) {
running_ops_++;
RunOp(ready_vars, op); RunOp(ready_vars, op);
} }
set.clear(); set.clear();
}; };
auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };
// Clean run context // Clean run context
run_op_futures_.clear(); run_op_futures_.clear();
exception_holder_.Clear(); exception_holder_.Clear();
...@@ -91,19 +77,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -91,19 +77,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
while (!pending_vars.empty()) { while (!pending_vars.empty()) {
// 1. Run All Ready ops // 1. Run All Ready ops
// Keep loop until all vars are ready. // Keep loop until all vars are ready.
// run_all_ops(ready_ops);
// NOTE: DelayedOps have a lower priority. It will be scheduled after all
// ready_ops have been performed.
if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
run_all_ops(delayed_ops);
} else {
run_all_ops(ready_ops);
}
// 2. Find ready variable // 2. Find ready variable
bool timeout; bool timeout;
auto cur_ready_vars = ready_vars->PopAll(1, &timeout); auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
if (timeout) { if (timeout) {
if (exception_holder_.IsCaught()) { if (exception_holder_.IsCaught()) {
for (auto &run_op_future : run_op_futures_) { for (auto &run_op_future : run_op_futures_) {
...@@ -115,6 +93,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -115,6 +93,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
continue; continue;
} }
} }
// 3. Remove the dependency of ready_var. // 3. Remove the dependency of ready_var.
// Find the ready_ops after the ready_var. // Find the ready_ops after the ready_var.
for (auto ready_var : cur_ready_vars) { for (auto ready_var : cur_ready_vars) {
...@@ -123,11 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -123,11 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto &deps = pending_ops[op]; auto &deps = pending_ops[op];
--deps; --deps;
if (deps == 0) { if (deps == 0) {
if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) { run_all_op(op);
delayed_ops.insert(op);
} else {
ready_ops.insert(op);
}
} }
} }
} }
...@@ -143,16 +118,17 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( ...@@ -143,16 +118,17 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
const std::vector<std::string> &fetch_tensors, const std::vector<std::string> &fetch_tensors,
std::vector<FetchOpHandle *> *fetch_ops, std::vector<FetchOpHandle *> *fetch_ops,
std::unordered_set<VarHandleBase *> *fetch_dependencies, std::unordered_set<VarHandleBase *> *fetch_dependencies,
std::unordered_set<OpHandleBase *> *ready_ops,
std::unordered_map<OpHandleBase *, size_t> *pending_ops, std::unordered_map<OpHandleBase *, size_t> *pending_ops,
std::unordered_set<VarHandleBase *> *pending_vars, std::unordered_set<VarHandleBase *> *pending_vars,
BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) { FeedFetchList *fetch_data) {
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars; std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
std::unordered_set<VarHandleBase *> local_ready_vars;
for (auto &fetch_var_name : fetch_tensors) { for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) { for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name); auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) { if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin());
} }
} }
} }
...@@ -161,8 +137,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( ...@@ -161,8 +137,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
auto &var_name = fetch_tensors[i]; auto &var_name = fetch_tensors[i];
auto fetched_var_it = fetched_vars.find(var_name); auto fetched_var_it = fetched_vars.find(var_name);
PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
"Cannot find fetched variable.(Perhaps the main_program " "Cannot find fetched variable(%s).(Perhaps the main_program "
"is not set to ParallelExecutor)"); "is not set to ParallelExecutor)",
var_name);
auto &vars = fetched_var_it->second; auto &vars = fetched_var_it->second;
...@@ -184,9 +161,23 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( ...@@ -184,9 +161,23 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
auto *fetch_dummy = new DummyVarHandle(fetch_var); auto *fetch_dummy = new DummyVarHandle(fetch_var);
op->AddOutput(fetch_dummy); op->AddOutput(fetch_dummy);
fetch_dependencies->emplace(fetch_dummy); fetch_dependencies->emplace(fetch_dummy);
this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
this->InsertPendingOp(pending_ops, op); this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy);
size_t wait_input_num = 0;
std::unordered_set<VarHandleBase *> input_set(vars.begin(), vars.end());
for (auto *var : input_set) {
if (pending_vars->count(var)) {
++wait_input_num;
}
}
if (wait_input_num) {
pending_ops->insert({op, wait_input_num});
} else {
ready_ops->insert(static_cast<OpHandleBase *>(op));
}
} }
PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
} }
void ThreadedSSAGraphExecutor::InsertPendingOp( void ThreadedSSAGraphExecutor::InsertPendingOp(
...@@ -197,11 +188,63 @@ void ThreadedSSAGraphExecutor::InsertPendingOp( ...@@ -197,11 +188,63 @@ void ThreadedSSAGraphExecutor::InsertPendingOp(
void ThreadedSSAGraphExecutor::InsertPendingVar( void ThreadedSSAGraphExecutor::InsertPendingVar(
std::unordered_set<VarHandleBase *> *pending_vars, std::unordered_set<VarHandleBase *> *pending_vars,
BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const { std::unordered_set<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
pending_vars->insert(var); pending_vars->insert(var);
if (var->GeneratedOp() == nullptr) { if (var->GeneratedOp() == nullptr) {
ready_vars->Push(var); ready_vars->insert(var);
}
}
void ThreadedSSAGraphExecutor::PrepareOpDeps() {
op_deps_.reset(new OpDependentData());
std::unordered_map<OpHandleBase *, size_t> &pending_ops =
op_deps_->pending_ops_;
std::unordered_set<VarHandleBase *> &pending_vars = op_deps_->pending_vars_;
std::unordered_set<OpHandleBase *> &ready_ops = op_deps_->ready_ops_;
std::unordered_set<VarHandleBase *> ready_vars;
// Transform SSAGraph to pending_ops & pending_vars
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
InsertPendingVar(&pending_vars, &ready_vars, version_pair);
}
}
}
for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
InsertPendingVar(&pending_vars, &ready_vars, var);
}
for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
if (op->Inputs().empty()) { // Special case, Op has no input.
ready_ops.insert(op);
} else {
InsertPendingOp(&pending_ops, op);
}
} }
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->PendingOps()) {
auto &deps = pending_ops[op];
--deps;
if (deps == 0) {
ready_ops.insert(op);
}
}
}
}
void ThreadedSSAGraphExecutor::CopyOpDeps() {
op_deps_futures_ = prepare_pool_.enqueue([&] {
auto *op_deps = new OpDependentData();
op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(),
op_deps_->pending_ops_.end());
op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(),
op_deps_->pending_vars_.end());
op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(),
op_deps_->ready_ops_.end());
return std::unique_ptr<OpDependentData>(op_deps);
});
} }
void ThreadedSSAGraphExecutor::RunOp( void ThreadedSSAGraphExecutor::RunOp(
...@@ -216,7 +259,6 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -216,7 +259,6 @@ void ThreadedSSAGraphExecutor::RunOp(
op->Run(strategy_.use_cuda_); op->Run(strategy_.use_cuda_);
} }
VLOG(10) << op << " " << op->Name() << " Done "; VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs()); ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << " Signal posted"; VLOG(10) << op << " " << op->Name() << " Signal posted";
} catch (...) { } catch (...) {
......
...@@ -15,18 +15,20 @@ ...@@ -15,18 +15,20 @@
#pragma once #pragma once
#include <deque> #include <deque>
#include <functional>
#include <list> #include <list>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <functional>
#include "ThreadPool.h" // ThreadPool in thrird party #include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -36,6 +38,12 @@ class Scope; ...@@ -36,6 +38,12 @@ class Scope;
namespace details { namespace details {
struct OpDependentData {
std::unordered_map<OpHandleBase *, size_t> pending_ops_;
std::unordered_set<VarHandleBase *> pending_vars_;
std::unordered_set<OpHandleBase *> ready_ops_;
};
class ThreadedSSAGraphExecutor : public SSAGraphExecutor { class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
public: public:
ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
...@@ -57,29 +65,35 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -57,29 +65,35 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
private: private:
ir::Graph *graph_; ir::Graph *graph_;
std::unique_ptr<::ThreadPool> pool_; std::unique_ptr<::ThreadPool> pool_;
::ThreadPool prepare_pool_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_; platform::DeviceContextPool fetch_ctxs_;
ExceptionHolder exception_holder_; ExceptionHolder exception_holder_;
std::atomic<int> running_ops_;
void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops, void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
OpHandleBase *op_instance) const; OpHandleBase *op_instance) const;
void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars, void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
BlockingQueue<VarHandleBase *> *ready_vars, std::unordered_set<VarHandleBase *> *ready_vars,
VarHandleBase *var) const; VarHandleBase *var) const;
void InsertFetchOps(const std::vector<std::string> &fetch_tensors, void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
std::vector<FetchOpHandle *> *fetch_ops, std::vector<FetchOpHandle *> *fetch_ops,
std::unordered_set<VarHandleBase *> *fetch_dependencies, std::unordered_set<VarHandleBase *> *fetch_dependencies,
std::unordered_set<OpHandleBase *> *ready_ops,
std::unordered_map<OpHandleBase *, size_t> *pending_ops, std::unordered_map<OpHandleBase *, size_t> *pending_ops,
std::unordered_set<VarHandleBase *> *pending_vars, std::unordered_set<VarHandleBase *> *pending_vars,
BlockingQueue<VarHandleBase *> *ready_vars,
FeedFetchList *fetch_data); FeedFetchList *fetch_data);
void PrepareOpDeps();
void CopyOpDeps();
private: private:
std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
ExecutionStrategy strategy_; ExecutionStrategy strategy_;
std::unique_ptr<OpDependentData> op_deps_;
// use std::list because clear(), push_back, and for_each are O(1) // use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_; std::list<std::future<void>> run_op_futures_;
}; };
......
...@@ -43,6 +43,7 @@ struct VarHandleBase { ...@@ -43,6 +43,7 @@ struct VarHandleBase {
virtual ~VarHandleBase(); virtual ~VarHandleBase();
virtual std::string DebugString() const = 0; virtual std::string DebugString() const = 0;
virtual const std::string& Name() const = 0;
void AddInput(OpHandleBase* in, ir::Node* node) { void AddInput(OpHandleBase* in, ir::Node* node) {
node_->inputs.clear(); node_->inputs.clear();
...@@ -95,8 +96,6 @@ struct VarHandleBase { ...@@ -95,8 +96,6 @@ struct VarHandleBase {
// //
// NOTE: runtime variables have place. // NOTE: runtime variables have place.
struct VarHandle : public VarHandleBase { struct VarHandle : public VarHandleBase {
explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
virtual ~VarHandle(); virtual ~VarHandle();
std::string DebugString() const override; std::string DebugString() const override;
...@@ -109,6 +108,20 @@ struct VarHandle : public VarHandleBase { ...@@ -109,6 +108,20 @@ struct VarHandle : public VarHandleBase {
name_(std::move(name)), name_(std::move(name)),
place_(std::move(place)) {} place_(std::move(place)) {}
#ifdef PADDLE_WITH_CUDA
bool HasEvent() { return has_event_; }
const cudaEvent_t& GetEvent() {
PADDLE_ENFORCE(HasEvent(), "The event is not set.");
return event_;
}
void SetGenerateEvent(const cudaEvent_t& event) {
has_event_ = true;
event_ = event;
}
#endif
// version field currently is not used, however, just store the version to // version field currently is not used, however, just store the version to
// debug easily. // debug easily.
private: private:
...@@ -116,6 +129,11 @@ struct VarHandle : public VarHandleBase { ...@@ -116,6 +129,11 @@ struct VarHandle : public VarHandleBase {
size_t scope_idx_; size_t scope_idx_;
std::string name_; std::string name_;
platform::Place place_; platform::Place place_;
#ifdef PADDLE_WITH_CUDA
// The var is generated only after this event has been triggered.
cudaEvent_t event_;
bool has_event_{false};
#endif
public: public:
bool IsTheSameVar(const VarHandle& o) const { bool IsTheSameVar(const VarHandle& o) const {
...@@ -125,6 +143,7 @@ struct VarHandle : public VarHandleBase { ...@@ -125,6 +143,7 @@ struct VarHandle : public VarHandleBase {
size_t version() const { return version_; } size_t version() const { return version_; }
size_t scope_idx() const { return scope_idx_; } size_t scope_idx() const { return scope_idx_; }
const std::string& Name() const override { return name_; }
const std::string& name() const { return name_; } const std::string& name() const { return name_; }
const platform::Place& place() const { return place_; } const platform::Place& place() const { return place_; }
}; };
...@@ -136,6 +155,10 @@ struct DummyVarHandle : public VarHandleBase { ...@@ -136,6 +155,10 @@ struct DummyVarHandle : public VarHandleBase {
virtual ~DummyVarHandle(); virtual ~DummyVarHandle();
std::string DebugString() const override; std::string DebugString() const override;
public:
const std::string& Name() const override { return name_; }
std::string name_{"DummyVar"};
}; };
} // namespace details } // namespace details
......
...@@ -224,8 +224,8 @@ std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl( ...@@ -224,8 +224,8 @@ std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
PADDLE_ENFORCE(param_scope()); PADDLE_ENFORCE(param_scope());
QuantizeConv(graph.get(), false /* with_residual_data */);
QuantizeConv(graph.get(), true /* with_residual_data */); QuantizeConv(graph.get(), true /* with_residual_data */);
QuantizeConv(graph.get());
QuantizePool(graph.get()); QuantizePool(graph.get());
return graph; return graph;
......
...@@ -599,10 +599,19 @@ bool VarLinksToOp(Node *node, const std::string &op_type) { ...@@ -599,10 +599,19 @@ bool VarLinksToOp(Node *node, const std::string &op_type) {
bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) { bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) {
PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp()); PADDLE_ENFORCE(op->IsOp());
if (op->Op()->Input(argument).size() <= nth) return false; if (!HasInput(op, argument) || op->Op()->Input(argument).size() <= nth)
return false;
return var->Name() == op->Op()->Input(argument)[nth]; return var->Name() == op->Op()->Input(argument)[nth];
} }
bool HasInput(Node *op, const std::string &argument) {
PADDLE_ENFORCE(op->IsOp());
auto const &names = op->Op()->InputNames();
return std::find(names.begin(), names.end(), argument) != names.end();
}
bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) { bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) {
PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp()); PADDLE_ENFORCE(op->IsOp());
...@@ -1082,8 +1091,15 @@ PDNode *patterns::Conv::operator()() { ...@@ -1082,8 +1091,15 @@ PDNode *patterns::Conv::operator()() {
PDNode *patterns::ConvResidual::operator()(bool with_residual_data) { PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
if (!with_residual_data) if (!with_residual_data) {
conv_op->assert_op_attr("fuse_residual_connection", false); conv_op->assert_more([&](Node *x) {
return !HasInput(x, "ResidualData") ||
       x->Op()->Input("ResidualData").size() == 0;
});
}
auto input_var = pattern->NewNode(conv_input_repr()) auto input_var = pattern->NewNode(conv_input_repr())
->AsInput() ->AsInput()
......
...@@ -305,6 +305,9 @@ bool VarLinksFromOp(Node* node, const std::string& op_type); ...@@ -305,6 +305,9 @@ bool VarLinksFromOp(Node* node, const std::string& op_type);
// Check whether a var node is a op node's nth input. // Check whether a var node is a op node's nth input.
bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth); bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
// Check whether the op node has input of given name.
bool HasInput(Node* op, const std::string& argument);
// Tell whether a var node is a op node's nth output. // Tell whether a var node is a op node's nth output.
bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth); bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
......
...@@ -14,12 +14,16 @@ limitations under the License. */ ...@@ -14,12 +14,16 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
/*
* Specifies which operators should use MKLDNN.
*/
class MKLDNNPlacementPass : public Pass { class MKLDNNPlacementPass : public Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
......
...@@ -874,23 +874,24 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig( ...@@ -874,23 +874,24 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
return kernel_configs; return kernel_configs;
} }
RuntimeContext* OperatorWithKernel::GetRuntimeContext( void OperatorWithKernel::RunImpl(const Scope& scope,
const Scope& scope) const { const platform::Place& place) const {
if (!HasAttr(kEnableCacheRuntimeContext)) { if (!HasAttr(kEnableCacheRuntimeContext)) {
return new RuntimeContext(Inputs(), Outputs(), scope); RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx);
} else { } else {
const Scope* cur_scope = &scope; const Scope* cur_scope = &scope;
if (!runtime_ctx_ || pre_scope_ != cur_scope) { if (!runtime_ctx_ || pre_scope_ != cur_scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = cur_scope; pre_scope_ = cur_scope;
} }
return runtime_ctx_.get(); RunImpl(scope, place, runtime_ctx_.get());
} }
} }
void OperatorWithKernel::RunImpl(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const { const platform::Place& place,
auto runtime_ctx = GetRuntimeContext(scope); RuntimeContext* runtime_ctx) const {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place); auto* dev_ctx = pool.Get(place);
......
...@@ -461,7 +461,8 @@ class OperatorWithKernel : public OperatorBase { ...@@ -461,7 +461,8 @@ class OperatorWithKernel : public OperatorBase {
// same. // same.
proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place) const final;
RuntimeContext* GetRuntimeContext(const Scope& scope) const; void RunImpl(const Scope& scope, const platform::Place& place,
RuntimeContext* runtime_ctx) const;
/** /**
* Transfer data from scope to a transfered scope. If there is no data need to * Transfer data from scope to a transfered scope. If there is no data need to
......
...@@ -254,18 +254,29 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -254,18 +254,29 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
member_->places_, nccl_id, build_strategy.num_trainers_, member_->places_, nccl_id, build_strategy.num_trainers_,
build_strategy.trainer_id_)); build_strategy.trainer_id_));
std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs; // Initialize device context's nccl comm, which will be used by normal
dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_)); // operators like sync_batch_norm and by collective ops.
// Initialize device context's nccl comm // NOTE: when more than one ParallelExecutor runs on the same places, the nccl comm will
// Note, more than one ParallelExecutor with same place, the nccl comm will
// be rewritten and problems will arise. // be rewritten and problems will arise.
// NOTE: NCCL group-calls and non-group-calls cannot use the same
// NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use the
// same communicators.
std::unique_ptr<platform::NCCLContextMap> dev_nccl_ctxs;
if (nccl_id == nullptr) {
dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_));
}
for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
auto &nccl_ctx = dev_nccl_ctxs->at(dev_id);
platform::DeviceContextPool &pool = platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
pool.Get(member_->places_[dev_id])); pool.Get(member_->places_[dev_id]));
dev_ctx->set_nccl_comm(nccl_ctx.comm()); if (nccl_id != nullptr) {
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[dev_id]);
dev_ctx->set_nccl_comm(nccl_ctx.comm());
} else {
auto &nccl_ctx = dev_nccl_ctxs->at(member_->places_[dev_id]);
dev_ctx->set_nccl_comm(nccl_ctx.comm());
}
} }
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
......
...@@ -34,7 +34,7 @@ DEFINE_double( ...@@ -34,7 +34,7 @@ DEFINE_double(
"Memory size threshold (GB) when the garbage collector clear tensors." "Memory size threshold (GB) when the garbage collector clear tensors."
"Disabled when this value is less than 0"); "Disabled when this value is less than 0");
DEFINE_bool(fast_eager_deletion_mode, false, DEFINE_bool(fast_eager_deletion_mode, true,
"Fast eager deletion mode. If enabled, memory would release " "Fast eager deletion mode. If enabled, memory would release "
"immediately without waiting GPU kernel ends."); "immediately without waiting GPU kernel ends.");
......
...@@ -131,6 +131,15 @@ struct Argument { ...@@ -131,6 +131,15 @@ struct Argument {
// Pass a set of op types to enable its mkldnn kernel // Pass a set of op types to enable its mkldnn kernel
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
std::unordered_set<std::string>); std::unordered_set<std::string>);
// A set of op types to enable their quantized kernels
DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
std::unordered_set<std::string>);
// A set of op IDs to exclude from enabling their quantized kernels
DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds,
std::unordered_set<int>);
// Scales for variables to be quantized // Scales for variables to be quantized
DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
...@@ -60,6 +61,13 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -60,6 +61,13 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("mkldnn_enabled_op_types", pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>( new std::unordered_set<std::string>(
argument->mkldnn_enabled_op_types())); argument->mkldnn_enabled_op_types()));
} else if (pass_name == "cpu_quantize_placement_pass") {
pass->Set("quantize_enabled_op_types",
new std::unordered_set<std::string>(
argument->quantize_enabled_op_types()));
pass->Set(
"quantize_excluded_op_ids",
new std::unordered_set<int>(argument->quantize_excluded_op_ids()));
} else if (pass_name == "cpu_quantize_pass") { } else if (pass_name == "cpu_quantize_pass") {
pass->Set("quant_var_scales", pass->Set("quant_var_scales",
new VarQuantScale(argument->quant_var_scales())); new VarQuantScale(argument->quant_var_scales()));
......
...@@ -118,9 +118,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -118,9 +118,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(serialized_info_cache_); CP_MEMBER(serialized_info_cache_);
// framework related.
CP_MEMBER(enable_runtime_context_cache_);
if (use_gpu_) { if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy( pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(other.pass_builder()))); *static_cast<GpuPassStrategy *>(other.pass_builder())));
...@@ -205,6 +202,7 @@ void AnalysisConfig::Update() { ...@@ -205,6 +202,7 @@ void AnalysisConfig::Update() {
// Append after the Affine_channel_conv_fuse pass. // Append after the Affine_channel_conv_fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
} }
pass_builder()->DeletePass("runtime_context_cache_pass");
} }
if (use_mkldnn_) { if (use_mkldnn_) {
...@@ -235,10 +233,6 @@ void AnalysisConfig::Update() { ...@@ -235,10 +233,6 @@ void AnalysisConfig::Update() {
if (ir_debug_) { if (ir_debug_) {
pass_builder()->TurnOnDebug(); pass_builder()->TurnOnDebug();
} }
if (enable_runtime_context_cache_) {
pass_builder()->AppendPass("runtime_context_cache_pass");
}
} }
std::string AnalysisConfig::SerializeInfoCache() { std::string AnalysisConfig::SerializeInfoCache() {
...@@ -272,7 +266,6 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -272,7 +266,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << specify_input_name_; ss << specify_input_name_;
ss << cpu_math_library_num_threads_; ss << cpu_math_library_num_threads_;
ss << enable_runtime_context_cache_;
return ss.str(); return ss.str();
} }
......
...@@ -194,23 +194,6 @@ struct AnalysisConfig { ...@@ -194,23 +194,6 @@ struct AnalysisConfig {
/** Tell whether the memory optimization is activated. */ /** Tell whether the memory optimization is activated. */
bool enable_memory_optim() const; bool enable_memory_optim() const;
// framework related
/** \brief Control whether to perform runtime context cache optimization.
*
* If turned off, in Op's every execution, RuntimeContext would be called to
* relate input/output names of this Op with the corresponding variables in
* Scope.
*/
void SwitchRuntimeContextCache(int x = true) {
enable_runtime_context_cache_ = x;
}
/** A boolean state tell whether the runtime context cache optimization is
* actived.
*/
bool runtime_context_cache_enabled() const {
return enable_runtime_context_cache_;
}
friend class ::paddle::AnalysisPredictor; friend class ::paddle::AnalysisPredictor;
/** NOTE just for developer, not an official API, easily to be broken. /** NOTE just for developer, not an official API, easily to be broken.
...@@ -271,15 +254,6 @@ struct AnalysisConfig { ...@@ -271,15 +254,6 @@ struct AnalysisConfig {
int cpu_math_library_num_threads_{1}; int cpu_math_library_num_threads_{1};
// framework related
// RuntimeContext is used to relate input/output names of Operator with
// the corresponding variables in Scope.
// If enable_runtime_context_cache_ is true, it means that in a same Scope,
// since the input/output names of this Op do not change in the execution,
// RuntimeContext could be created only at the first iteration of this Op's
// execution to save the elapsed time.
bool enable_runtime_context_cache_{false};
// A runtime cache, shouldn't be transferred to others. // A runtime cache, shouldn't be transferred to others.
std::string serialized_info_cache_; std::string serialized_info_cache_;
......
...@@ -80,6 +80,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { ...@@ -80,6 +80,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", //
#endif #endif
}); });
...@@ -90,6 +91,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { ...@@ -90,6 +91,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
use_gpu_ = true; use_gpu_ = true;
} }
void GpuPassStrategy::EnableQuantizer() {
LOG(ERROR) << "GPU not support quantization yet";
}
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
analysis_passes_.push_back(pass); analysis_passes_.push_back(pass);
} }
...@@ -115,6 +120,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { ...@@ -115,6 +120,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", // "is_test_pass", //
"identity_scale_op_clean_pass", // "identity_scale_op_clean_pass", //
"runtime_context_cache_pass", //
}); });
use_gpu_ = false; use_gpu_ = false;
} }
......
...@@ -84,6 +84,10 @@ class PassStrategy : public PaddlePassBuilder { ...@@ -84,6 +84,10 @@ class PassStrategy : public PaddlePassBuilder {
*/ */
virtual void EnableMKLDNN() {} virtual void EnableMKLDNN() {}
/** Enable quantize optimization
*/
virtual void EnableQuantizer() {}
bool use_gpu() const { return use_gpu_; } bool use_gpu() const { return use_gpu_; }
virtual ~PassStrategy() = default; virtual ~PassStrategy() = default;
...@@ -124,6 +128,16 @@ class CpuPassStrategy : public PassStrategy { ...@@ -124,6 +128,16 @@ class CpuPassStrategy : public PassStrategy {
use_mkldnn_ = false; use_mkldnn_ = false;
#endif #endif
} }
void EnableQuantizer() override {
if (!use_quantizer_) {
passes_.push_back("cpu_quantize_placement_pass");
}
use_quantizer_ = true;
}
protected:
bool use_quantizer_{false};
}; };
/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
...@@ -138,6 +152,7 @@ class GpuPassStrategy : public PassStrategy { ...@@ -138,6 +152,7 @@ class GpuPassStrategy : public PassStrategy {
} }
void EnableMKLDNN() override; void EnableMKLDNN() override;
void EnableQuantizer() override;
virtual ~GpuPassStrategy() = default; virtual ~GpuPassStrategy() = default;
}; };
......
...@@ -107,7 +107,6 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -107,7 +107,6 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->DisableGpu(); cfg->DisableGpu();
cfg->SwitchSpecifyInputNames(); cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim(); cfg->SwitchIrOptim();
cfg->SwitchRuntimeContextCache();
if (FLAGS_zero_copy) { if (FLAGS_zero_copy) {
cfg->SwitchUseFeedFetchOps(false); cfg->SwitchUseFeedFetchOps(false);
} }
......
...@@ -72,8 +72,7 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { ...@@ -72,8 +72,7 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) {
} }
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "\n"; << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "use_runtime_context_cache: " << config.runtime_context_cache_enabled()
<< "\n"; << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
......
cc_library(benchmark SRCS benchmark.cc DEPS enforce) cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_binary(visualizer SRCS visualizer.cc DEPS analysis
paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/visualizer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <fstream>
#include <memory>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/platform/init.h"
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(model_program_path, "", "model program path");
DEFINE_string(model_params_path, "", "model params path");
using paddle::inference::analysis::Argument;
namespace paddle {
namespace inference {
namespace utils {
void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
bool Visualizer::Run() {
paddle::framework::InitDevices(false);
paddle::inference::analysis::Analyzer().Run(argument_);
return true;
}
} // namespace utils
} // namespace inference
} // namespace paddle
// Generate a dot file describing the structure of graph.
// To use this tool, run command: ./visualizer [options...]
// Options:
// --model_dir: the directory of model
// --model_program_path: the path of program
// --model_params_path: the path of params
int main(int argc, char *argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
paddle::inference::analysis::Argument argument;
argument.SetUseGPU(false);
argument.SetUseTensorRT(false);
if (FLAGS_model_dir.empty()) {
if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
LOG(ERROR) << "Please set model_dir"
" or model_program_path and model_params_path";
return -1;
} else {
argument.SetModelProgramPath(FLAGS_model_program_path);
argument.SetModelParamsPath(FLAGS_model_params_path);
}
} else {
argument.SetModelDir(FLAGS_model_dir);
}
// Only 1 pass, default filename is 0_ir_origin.dot
// For more details, see paddle::inference::analysis::IRPassManager
argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"});
std::unique_ptr<paddle::framework::Scope> scope{
new paddle::framework::Scope()};
argument.SetScopeNotOwned(
const_cast<paddle::framework::Scope *>(scope.get()));
paddle::inference::utils::Visualizer visualizer;
visualizer.SetArgument(&argument);
visualizer.Run();
return 0;
}
USE_PASS(infer_clean_graph_pass);
USE_PASS(graph_viz_pass);
USE_PASS(graph_to_program_pass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/argument.h"
namespace paddle {
namespace inference {
namespace utils {
using paddle::inference::analysis::Argument;
class Visualizer final {
public:
Visualizer() = default;
~Visualizer() = default;
Visualizer(const Visualizer &) = delete;
Visualizer &operator=(const Visualizer &) = delete;
void SetArgument(Argument *);
bool Run();
private:
Argument *argument_;
};
} // namespace utils
} // namespace inference
} // namespace paddle
...@@ -128,7 +128,7 @@ class ChunkedAllocator : public Allocator { ...@@ -128,7 +128,7 @@ class ChunkedAllocator : public Allocator {
allocator = WrapRetryAllocator(allocator, retry_time_); allocator = WrapRetryAllocator(allocator, retry_time_);
return std::make_shared<AlignedAllocator<4096>>(std::move(allocator)); return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
} }
bool IsAllocThreadSafe() const override { return true; } bool IsAllocThreadSafe() const override { return true; }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_int64(gpu_allocator_retry_time);
#endif
namespace paddle {
namespace memory {
namespace allocation {
//! Run allocate test cases for different places
void AllocateTestCases() {
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
TEST(Allocator, SpecifyGpuMemory) {
#ifdef PADDLE_WITH_CUDA
// Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and
// FLAGS_reallocate_gpu_memory_in_mb
FLAGS_fraction_of_gpu_memory_to_use = 0.0;
// 512 MB
FLAGS_initial_gpu_memory_in_mb = 512;
// 4 MB
FLAGS_reallocate_gpu_memory_in_mb = 4;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
AllocateTestCases();
}
} // namespace allocation
} // namespace memory
} // namespace paddle
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_int64(gpu_allocator_retry_time); DECLARE_int64(gpu_allocator_retry_time);
#endif #endif
...@@ -26,13 +28,8 @@ namespace paddle { ...@@ -26,13 +28,8 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
TEST(allocator, allocator) { //! Run allocate test cases for different places
#ifdef PADDLE_WITH_CUDA void AllocateTestCases() {
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
auto &instance = AllocatorFacade::Instance(); auto &instance = AllocatorFacade::Instance();
platform::Place place; platform::Place place;
size_t size = 1024; size_t size = 1024;
...@@ -82,6 +79,16 @@ TEST(allocator, allocator) { ...@@ -82,6 +79,16 @@ TEST(allocator, allocator) {
#endif #endif
} }
TEST(Allocator, Allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
AllocateTestCases();
}
} // namespace allocation } // namespace allocation
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -37,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false, ...@@ -37,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value " "that initializing the allocated memory with a small value "
"during unit testing."); "during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
namespace paddle { namespace paddle {
...@@ -148,6 +150,7 @@ class GPUBuddyAllocatorList { ...@@ -148,6 +150,7 @@ class GPUBuddyAllocatorList {
std::unique_ptr<detail::SystemAllocator>( std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)), new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE:\n" VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable " << "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' " << "'FLAGS_fraction_of_gpu_memory_to_use' "
......
...@@ -9,3 +9,5 @@ endif(${WITH_GPU})
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
+ cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator)
...@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h"
#include <algorithm>
#include <utility>
#include "glog/logging.h" #include "glog/logging.h"
DEFINE_bool(free_idle_memory, false, DEFINE_bool(free_idle_memory, false,
...@@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() { ...@@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() {
"have actually been freed"; "have actually been freed";
while (!pool_.empty()) { while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin())); auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; VLOG(10) << "Free from block (" << block << ", " << block->size(cache_)
<< ")";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block); cache_.invalidate(block);
pool_.erase(pool_.begin()); pool_.erase(pool_.begin());
} }
...@@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { ...@@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// refill the pool if failure // refill the pool if failure
if (it == pool_.end()) { if (it == pool_.end()) {
it = RefillPool(); it = RefillPool(size);
// if still failure, fail fatally // if still failure, fail fatally
if (it == pool_.end()) { if (it == pool_.end()) {
return nullptr; return nullptr;
...@@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) { ...@@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
return static_cast<MemoryBlock*>(p)->data(); return static_cast<MemoryBlock*>(p)->data();
} }
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t request_bytes) {
size_t allocate_bytes = max_chunk_size_;
size_t index = 0;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (system_allocator_->UseGpu()) { if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) { if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation. // Compute the allocation size for gpu for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize(); allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
} else {
// Reallocation size
if (realloc_size_ == 0) {
realloc_size_ = platform::GpuReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
} }
} }
#endif #endif
// Allocate a new maximum sized block // Allocate a new block
size_t index = 0; void* p = system_allocator_->Alloc(&index, allocate_bytes);
void* p = system_allocator_->Alloc(&index, max_chunk_size_);
if (p == nullptr) return pool_.end(); if (p == nullptr) return pool_.end();
...@@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { ...@@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
<< " from system allocator"; << " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr); allocate_bytes, nullptr, nullptr);
// gpu fallback allocation // gpu fallback allocation
if (system_allocator_->UseGpu() && if (system_allocator_->UseGpu() &&
...@@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { ...@@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
fallback_alloc_count_++; fallback_alloc_count_++;
} }
total_free_ += max_chunk_size_; total_free_ += allocate_bytes;
// dump the block into pool // dump the block into pool
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
} }
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
...@@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { ...@@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
VLOG(10) << "Return block " << block << " to fallback allocator."; VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block); cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_; total_free_ -= block->size(cache_);
fallback_alloc_count_--; fallback_alloc_count_--;
// If no fall allocation exists, return directly // If no fall allocation exists, return directly
...@@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() { ...@@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
VLOG(10) << "Return block " << block << " to base allocator."; VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block); cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_; total_free_ -= block->size(cache_);
if (!shall_free_alloc()) return; if (!shall_free_alloc()) return;
} }
......
...@@ -60,7 +60,7 @@ class BuddyAllocator { ...@@ -60,7 +60,7 @@ class BuddyAllocator {
void* SystemAlloc(size_t size); void* SystemAlloc(size_t size);
/*! \brief If existing chunks are not suitable, refill pool */ /*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool(); PoolSet::iterator RefillPool(size_t request_bytes);
/** /**
* \brief Find the suitable chunk from existing pool and split * \brief Find the suitable chunk from existing pool and split
...@@ -89,6 +89,8 @@ class BuddyAllocator { ...@@ -89,6 +89,8 @@ class BuddyAllocator {
size_t min_chunk_size_; // the minimum size of each chunk size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk size_t max_chunk_size_; // the maximum size of each chunk
size_t realloc_size_ = 0; // the size of re-allocated chunk
private: private:
/** /**
* \brief A list of free allocation * \brief A list of free allocation
......
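To make the new growth policy concrete, here is a small, self-contained sketch (not the actual BuddyAllocator code; the constants stand in for platform::GpuInitAllocSize() and platform::GpuReallocSize(), and the numbers are illustrative) of how RefillPool now decides how many bytes to request from the system allocator: the first refill asks for at least the configured initial size, later refills ask for at least the configured reallocation size, and a single oversized request always wins.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Hypothetical stand-ins for platform::GpuInitAllocSize() / GpuReallocSize().
constexpr size_t kInitAllocBytes = 100UL << 20;  // e.g. FLAGS_initial_gpu_memory_in_mb = 100
constexpr size_t kReallocBytes = 50UL << 20;     // e.g. FLAGS_reallocate_gpu_memory_in_mb = 50

// Mirrors the sizing decision in BuddyAllocator::RefillPool(request_bytes).
size_t ChunkBytesToAllocate(size_t total_used_plus_free, size_t request_bytes) {
  if (total_used_plus_free == 0) {
    // First refill: at least the initial allocation size.
    return std::max(kInitAllocBytes, request_bytes);
  }
  // Later refills: at least the reallocation size.
  return std::max(kReallocBytes, request_bytes);
}

int main() {
  printf("%zu MB\n", ChunkBytesToAllocate(0, 10UL << 20) >> 20);           // 100 MB (first refill)
  printf("%zu MB\n", ChunkBytesToAllocate(100UL << 20, 7UL << 20) >> 20);  // 50 MB (regular refill)
  printf("%zu MB\n", ChunkBytesToAllocate(100UL << 20, 2UL << 30) >> 20);  // 2048 MB (oversized request)
  return 0;
}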
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include <memory>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
namespace paddle {
namespace memory {
namespace detail {
constexpr static int test_gpu_id = 0;
void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
bool freed = false;
size_t used_bytes = allocator->Used();
if (size_bytes > 0) {
void* p = allocator->Alloc(size_bytes);
EXPECT_NE(p, nullptr);
#ifdef PADDLE_WITH_CUDA
if (size_bytes < platform::GpuMaxChunkSize()) {
#else
if (size_bytes < platform::CpuMaxChunkSize()) {
#endif
// Not allocated from the SystemAllocator
EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
} else {
// Allocations that fall back to the SystemAllocator are not counted in Used()
EXPECT_EQ(allocator->Used(), used_bytes);
}
int* intp = static_cast<int*>(p);
std::shared_ptr<int> ptr(intp, [&](void* p) {
allocator->Free(intp);
freed = true;
});
} else {
freed = true;
}
EXPECT_EQ(used_bytes, allocator->Used());
EXPECT_TRUE(freed);
}
#ifdef PADDLE_WITH_CUDA
TEST(BuddyAllocator, GpuFraction) {
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
}
TEST(BuddyAllocator, InitRealloc) {
FLAGS_initial_gpu_memory_in_mb = 100;
FLAGS_reallocate_gpu_memory_in_mb = 50;
EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
// Less than both the initial size and the reallocate size
TestBuddyAllocator(&buddy_allocator, 10 << 20);
// Between the initial size and the reallocate size, does not exceed the pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
// Less than the reallocate size, exceeds the pool
TestBuddyAllocator(&buddy_allocator, 40 << 20);
// Greater than the reallocate size, exceeds the pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
// Greater than both the initial size and the reallocate size
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
}
TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
FLAGS_initial_gpu_memory_in_mb = 5;
FLAGS_reallocate_gpu_memory_in_mb = 10;
EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
// Less than both the initial size and the reallocate size
TestBuddyAllocator(&buddy_allocator, 1 << 20);
// Between the initial size and the reallocate size, does not exceed the pool
TestBuddyAllocator(&buddy_allocator, 3 << 20);
// Less than the initial size, exceeds the pool
TestBuddyAllocator(&buddy_allocator, 3 << 20);
// Less than the reallocate size, does not exceed the pool (pool is now 15 MB,
// 7 MB used)
TestBuddyAllocator(&buddy_allocator, 7 << 20);
// Less than the reallocate size, exceeds the pool
TestBuddyAllocator(&buddy_allocator, 8 << 20);
// Greater than both the initial size and the reallocate size
TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
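A guess at the relationship the two EXPECT_EQ checks on GpuMaxChunkSize above rely on (the helper below is hypothetical; the real logic lives in platform::GpuMaxChunkSize): when the MB-based flags are in effect, the maximum chunk size appears to be the larger of the initial allocation and the reallocation size.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Hypothetical mirror of the expected GpuMaxChunkSize() behaviour under the
// FLAGS_initial_gpu_memory_in_mb / FLAGS_reallocate_gpu_memory_in_mb flags.
uint64_t ExpectedMaxChunkBytes(uint64_t initial_mb, uint64_t realloc_mb) {
  return std::max(initial_mb, realloc_mb) << 20;
}

int main() {
  assert(ExpectedMaxChunkBytes(100, 50) == (100ULL << 20));  // matches InitRealloc
  assert(ExpectedMaxChunkBytes(5, 10) == (10ULL << 20));     // matches ReallocSizeGreaterThanInit
  return 0;
}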
...@@ -32,6 +32,9 @@ limitations under the License. */ ...@@ -32,6 +32,9 @@ limitations under the License. */
DECLARE_bool(use_pinned_memory); DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
...@@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
gpu_alloc_size_ += size; gpu_alloc_size_ += size;
return p; return p;
} else { } else {
LOG(WARNING) LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
<< "Cannot malloc " << size / 1024.0 / 1024.0 << " MB GPU memory. Please shrink "
<< " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " "FLAGS_fraction_of_gpu_memory_to_use or "
"environment variable to a lower value. Current value is " "FLAGS_initial_gpu_memory_in_mb or "
<< FLAGS_fraction_of_gpu_memory_to_use; "FLAGS_reallocate_gpu_memory_in_mb"
"environment variable to a lower value. "
<< "Current FLAGS_fraction_of_gpu_memory_to_use value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current FLAGS_initial_gpu_memory_in_mb value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
<< FLAGS_reallocate_gpu_memory_in_mb;
return nullptr; return nullptr;
} }
} }
......
...@@ -76,12 +76,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, ...@@ -76,12 +76,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
const std::string& name) { const std::string& name) {
framework::LibraryType library{framework::LibraryType::kPlain}; framework::LibraryType library{framework::LibraryType::kPlain};
framework::DataLayout layout = framework::DataLayout::kAnyLayout; framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_CUDA // FIXME(liuwei1031) temporarily disable the code to unblock users
auto it1 = oper.Attrs().find("use_cudnn"); // TODO(liuwei1031) figure out the reason behind
if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { // https://github.com/PaddlePaddle/Paddle/issues/16096
library = framework::LibraryType::kCUDNN; // and re-enable this in the future
} // #ifdef PADDLE_WITH_CUDA
#endif // auto it1 = oper.Attrs().find("use_cudnn");
// if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) {
// library = framework::LibraryType::kCUDNN;
// }
// #endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
auto it = oper.Attrs().find("use_mkldnn"); auto it = oper.Attrs().find("use_mkldnn");
if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
...@@ -188,6 +192,9 @@ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ ...@@ -188,6 +192,9 @@ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
UNUSED constexpr char SqrtDoc[] = R"DOC( UNUSED constexpr char SqrtDoc[] = R"DOC(
Sqrt Activation Operator. Sqrt Activation Operator.
Please make sure the input is valid: if the input may contain negative values close to zero,
add a small epsilon (1e-12) to it to avoid negative inputs caused by numerical errors.
$out = \sqrt{x}$ $out = \sqrt{x}$
)DOC"; )DOC";
......
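The epsilon advice in the SqrtDoc string above can be read as the following guard (a minimal sketch; the 1e-12 constant comes from the doc string, everything else is illustrative):

#include <cmath>
#include <cstdio>

// Guard a sqrt against inputs that are slightly negative due to rounding,
// as suggested in SqrtDoc: add a small epsilon before taking the root.
double SafeSqrt(double x) {
  const double eps = 1e-12;
  return std::sqrt(x + eps);
}

int main() {
  // -1e-15 is "a negative value close to zero"; plain sqrt would return NaN.
  printf("%g\n", SafeSqrt(-1e-15));  // ~1e-06 instead of NaN
  return 0;
}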
...@@ -67,6 +67,22 @@ class AffineChannelOp : public framework::OperatorWithKernel { ...@@ -67,6 +67,22 @@ class AffineChannelOp : public framework::OperatorWithKernel {
"Input(Bias) of AffineChannelOp should not be null."); "Input(Bias) of AffineChannelOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of AffineChannelOp should not be null."); "Output(Out) of AffineChannelOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto scale_dims = ctx->GetInputDim("Scale");
auto b_dims = ctx->GetInputDim("Bias");
const framework::DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
const int64_t C = (data_layout == framework::DataLayout::kNCHW
? x_dims[1]
: x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL);
PADDLE_ENFORCE_EQ(scale_dims[0], C);
PADDLE_ENFORCE_EQ(b_dims.size(), 1UL);
PADDLE_ENFORCE_EQ(b_dims[0], C);
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
} }
...@@ -97,6 +113,27 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel { ...@@ -97,6 +113,27 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel {
} }
}; };
class AffineChannelGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("affine_channel_grad");
op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetInput("Scale", Input("Scale"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
template <typename T> template <typename T>
using EigenArrayMap = using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>; Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
...@@ -244,8 +281,7 @@ namespace ops = paddle::operators; ...@@ -244,8 +281,7 @@ namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext; using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp, REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
ops::AffineChannelOpMaker, ops::AffineChannelOpMaker, ops::AffineChannelGradMaker);
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad); REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);
REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>, REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,
......
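The new InferShape checks above pin down the expected shapes. A hedged illustration (the shapes are made up; real code uses framework::DDim and DataLayout rather than std::vector): for an NCHW input X of shape [8, 3, 32, 32], Scale and Bias must both be 1-D tensors of length 3; for NHWC the channel is read from the last dimension instead.

#include <cassert>
#include <string>
#include <vector>

// Sketch of the channel-dimension rule used by the new AffineChannelOp checks.
int64_t ChannelDim(const std::vector<int64_t>& x_dims, const std::string& layout) {
  return layout == "NCHW" ? x_dims[1] : x_dims.back();
}

int main() {
  std::vector<int64_t> x_nchw{8, 3, 32, 32};
  std::vector<int64_t> x_nhwc{8, 32, 32, 3};
  // Scale and Bias must be 1-D tensors of length C.
  assert(ChannelDim(x_nchw, "NCHW") == 3);
  assert(ChannelDim(x_nhwc, "NHWC") == 3);
  return 0;
}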
...@@ -50,9 +50,19 @@ class ConcatOp : public framework::OperatorWithKernel { ...@@ -50,9 +50,19 @@ class ConcatOp : public framework::OperatorWithKernel {
if (j == axis) { if (j == axis) {
out_dims[axis] += ins[i][j]; out_dims[axis] += ins[i][j];
} else { } else {
PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], if (ctx->IsRuntime()) {
"Input tensors should have the same " // check all shape in run time
"elements except the specify axis."); PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
"Input tensors should have the same "
"elements except the specify axis.");
} else {
// not check -1 with other in compile time
if (out_dims[j] > 0 && ins[i][j] > 0) {
PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
"Input tensors should have the same "
"elements except the specify axis.");
}
}
} }
} }
} }
......
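The intent of the concat change above, restated as a standalone check (a sketch with assumed shapes; -1 marks a dimension that is unknown at compile time): at compile time two dimensions are compared only when both are known, while at runtime every non-axis dimension must match exactly.

#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of the concat shape check: dims other than `axis` must match,
// but at compile time a -1 (unknown) dimension is not compared.
bool DimsCompatible(const std::vector<int64_t>& a, const std::vector<int64_t>& b,
                    int axis, bool is_runtime) {
  for (size_t j = 0; j < a.size(); ++j) {
    if (static_cast<int>(j) == axis) continue;
    if (is_runtime) {
      if (a[j] != b[j]) return false;
    } else if (a[j] > 0 && b[j] > 0 && a[j] != b[j]) {
      return false;
    }
  }
  return true;
}

int main() {
  // Unknown batch dimension is accepted at compile time, checked later.
  assert(DimsCompatible({-1, 16}, {32, 8}, /*axis=*/1, /*is_runtime=*/false));
  assert(!DimsCompatible({8, 16}, {32, 8}, /*axis=*/1, /*is_runtime=*/true));
  return 0;
}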
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -344,6 +345,28 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( ...@@ -344,6 +345,28 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
ctx.GetPlace(), layout_, library_); ctx.GetPlace(), layout_, library_);
} }
class ConvTransposeGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType(ForwardOp().Type() + "_grad");
op->SetInput("Input", Input("Input"));
op->SetInput("Filter", Input("Filter"));
op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
if (ForwardOp().Inputs().count("Bias") > 0) {
op->SetInput("Bias", Input("Bias"));
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
}
op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -352,7 +375,7 @@ namespace ops = paddle::operators; ...@@ -352,7 +375,7 @@ namespace ops = paddle::operators;
// conv2d_transpose // conv2d_transpose
REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker, ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::ConvTransposeGradOpDescMaker);
REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
...@@ -368,7 +391,7 @@ REGISTER_OP_CPU_KERNEL( ...@@ -368,7 +391,7 @@ REGISTER_OP_CPU_KERNEL(
// conv3d_transpose // conv3d_transpose
REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
ops::Conv3DTransposeOpMaker, ops::Conv3DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::ConvTransposeGradOpDescMaker);
REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
...@@ -384,7 +407,7 @@ REGISTER_OP_CPU_KERNEL( ...@@ -384,7 +407,7 @@ REGISTER_OP_CPU_KERNEL(
// depthwise conv2d_transpose // depthwise conv2d_transpose
REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker, ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::ConvTransposeGradOpDescMaker);
REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
......
...@@ -74,6 +74,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -74,6 +74,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
"Norm of the second input, reduced along the 1st " "Norm of the second input, reduced along the 1st "
"dimension.") "dimension.")
.AsIntermediate(); .AsIntermediate();
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
"Skip calling InferShape() function in the runtime.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
**Cosine Similarity Operator** **Cosine Similarity Operator**
......
...@@ -28,17 +28,21 @@ class CosSimKernel : public framework::OpKernel<T> { ...@@ -28,17 +28,21 @@ class CosSimKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
// get Tensor // get Tensor
auto* in_x = context.Input<Tensor>("X"); auto* in_x = context.Input<framework::LoDTensor>("X");
auto* in_y = context.Input<Tensor>("Y"); auto* in_y = context.Input<Tensor>("Y");
auto* out_z = context.Output<Tensor>("Out"); auto* out_z = context.Output<framework::LoDTensor>("Out");
auto* out_x_norm = context.Output<Tensor>("XNorm"); auto* out_x_norm = context.Output<Tensor>("XNorm");
auto* out_y_norm = context.Output<Tensor>("YNorm"); auto* out_y_norm = context.Output<Tensor>("YNorm");
out_z->mutable_data<T>(context.GetPlace());
out_x_norm->mutable_data<T>(context.GetPlace());
out_y_norm->mutable_data<T>(context.GetPlace());
int rows_x = in_x->dims()[0]; int rows_x = in_x->dims()[0];
int rows_y = in_y->dims()[0]; int rows_y = in_y->dims()[0];
out_z->Resize({rows_x, 1});
out_x_norm->Resize({rows_x, 1});
out_y_norm->Resize({rows_y, 1});
out_z->mutable_data<T>(context.GetPlace());
out_x_norm->mutable_data<T>(context.GetPlace());
out_y_norm->mutable_data<T>(context.GetPlace());
out_z->set_lod(in_x->lod());
int cols = framework::product(in_x->dims()) / rows_x; int cols = framework::product(in_x->dims()) / rows_x;
...@@ -81,6 +85,7 @@ class CosSimGradKernel : public framework::OpKernel<T> { ...@@ -81,6 +85,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
if (rows_x == rows_y) { if (rows_x == rows_y) {
if (out_grad_x) { if (out_grad_x) {
out_grad_x->Resize(in_x->dims());
math::CosSimGradFunctor<T> functor( math::CosSimGradFunctor<T> functor(
in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(), in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(), in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
...@@ -91,6 +96,7 @@ class CosSimGradKernel : public framework::OpKernel<T> { ...@@ -91,6 +96,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
for_range(functor); for_range(functor);
} }
if (out_grad_y) { if (out_grad_y) {
out_grad_y->Resize(in_y->dims());
math::CosSimGradFunctor<T> functor( math::CosSimGradFunctor<T> functor(
in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(), in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(),
in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(), in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
...@@ -102,6 +108,7 @@ class CosSimGradKernel : public framework::OpKernel<T> { ...@@ -102,6 +108,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
} }
} else { } else {
if (out_grad_x) { if (out_grad_x) {
out_grad_x->Resize(in_x->dims());
math::CosSimDxFunctor<T> functor( math::CosSimDxFunctor<T> functor(
in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(), in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(), in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
...@@ -112,6 +119,7 @@ class CosSimGradKernel : public framework::OpKernel<T> { ...@@ -112,6 +119,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
for_range(functor); for_range(functor);
} }
if (out_grad_y) { if (out_grad_y) {
out_grad_y->Resize(in_y->dims());
out_grad_y->mutable_data<T>(context.GetPlace()); out_grad_y->mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> set_zero; math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
......
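Because the op now skips InferShape at runtime (the kAllKernelsMustComputeRuntimeShape attribute added above), the kernel itself must size its outputs before asking for memory. The ordering, reduced to its essentials with a toy tensor type (a sketch, not the actual LoDTensor API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Toy stand-in for a LoDTensor, showing the ordering the CosSim kernels now
// follow: Resize first, then mutable_data, so the allocation matches the
// shape decided inside the kernel.
struct ToyTensor {
  std::vector<int64_t> dims;
  std::vector<float> data;
  void Resize(std::vector<int64_t> d) { dims = std::move(d); }
  float* mutable_data() {
    size_t n = 1;
    for (auto d : dims) n *= static_cast<size_t>(d);
    data.resize(n);
    return data.data();
  }
};

int main() {
  int rows_x = 4;
  ToyTensor out_z;
  out_z.Resize({rows_x, 1});        // shape decided in the kernel
  float* p = out_z.mutable_data();  // allocation only after Resize
  (void)p;
  return 0;
}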
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
struct MutableDataFunctor {
MutableDataFunctor(void** data, framework::LoDTensor* tensor,
const platform::Place& place)
: data_(data), tensor_(tensor), place_(place) {}
template <typename T>
void apply() {
*data_ = tensor_->mutable_data<T>(place_);
}
void** data_;
framework::LoDTensor* tensor_;
platform::Place place_;
};
class AllReduceOp : public framework::OperatorBase {
using OperatorBase::OperatorBase;
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
PADDLE_ENFORCE(is_gpu_place(place),
"AllReduce op can run on gpu place only for now.");
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* ctx = pool.Get(place);
auto in_names = Inputs("X");
auto out_names = Outputs("Out");
PADDLE_ENFORCE_EQ(in_names.size(), 1, "Only support one input");
PADDLE_ENFORCE_EQ(out_names.size(), 1, "Only support one output");
auto* in = scope.FindVar(in_names[0]);
auto* out = scope.FindVar(out_names[0]);
PADDLE_ENFORCE(in->IsType<framework::LoDTensor>() ||
out->IsType<framework::LoDTensor>(),
"Only support allreduce LoDTensors");
int dtype = -1;
auto in_tensor = in->Get<framework::LoDTensor>();
dtype = platform::ToNCCLDataType(in_tensor.type());
int64_t numel = in_tensor.numel();
auto* sendbuff = in_tensor.data<void>();
auto* out_tensor = out->GetMutable<framework::LoDTensor>();
out_tensor->Resize(in_tensor.dims());
void* recvbuff = nullptr;
framework::VisitDataType(in_tensor.type(),
MutableDataFunctor(&recvbuff, out_tensor, place));
auto cuda_ctx = static_cast<platform::CUDADeviceContext*>(ctx);
auto* comm = cuda_ctx->nccl_comm();
// FIXME(typhoonzero): should use nccl stream here.
auto stream = cuda_ctx->stream();
int reduce_type = Attr<int>("reduce_type");
ncclRedOp_t red_type = ncclSum;
switch (reduce_type) {
case 0:
red_type = ncclSum;
break;
case 1:
red_type = ncclProd;
break;
case 2:
red_type = ncclMax;
break;
case 3:
red_type = ncclMin;
break;
}
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
comm, stream));
#endif
}
};
class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor), tensor to be allreduced.");
AddOutput("Out", "(Tensor) the result of allreduced.");
AddAttr<int>("reduce_type", "(int) determin the reduce type.")
.SetDefault(0);
AddComment(R"DOC(
***AllReduce Operator***
Call NCCL AllReduce internally. Note that this op must be used when one
thread is managing one GPU device.
For speed reasons, reduce_type should be an integer:
0: sum
1: prod
2: max
3: min
If input and output are the same variable, in-place allreduce will be used.
)DOC");
}
};
class AllReduceOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(allreduce, ops::AllReduceOp,
paddle::framework::EmptyGradOpMaker, ops::AllReduceOpMaker,
ops::AllReduceOpShapeInference);
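A hedged sketch of building this op programmatically; the OpDesc calls mirror the grad-maker code elsewhere in this change, and the variable name is made up. reduce_type follows the documented mapping 0:sum, 1:prod, 2:max, 3:min.

#include "paddle/fluid/framework/op_desc.h"

// Hypothetical construction of an allreduce op desc.
void BuildAllReduce(paddle::framework::OpDesc* op) {
  op->SetType("allreduce");
  op->SetInput("X", {"grad_var"});
  op->SetOutput("Out", {"grad_var"});  // same variable, so in-place allreduce
  op->SetAttr("reduce_type", 0);       // 0 == ncclSum
}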
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/dropout_op.h"
#include <memory>
#include <string> #include <string>
namespace paddle { namespace paddle {
...@@ -70,7 +71,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -70,7 +71,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
"1. downgrade_in_infer(default), downgrade the outcome at inference " "1. downgrade_in_infer(default), downgrade the outcome at inference "
"time" "time"
" train: out = input * mask" " train: out = input * mask"
" inference: out = input * dropout_prob" " inference: out = input * (1.0 - dropout_prob)"
"2. upscale_in_train, upscale the outcome at training time, do nothing " "2. upscale_in_train, upscale the outcome at training time, do nothing "
"in inference" "in inference"
" train: out = input * mask / ( 1.0 - dropout_prob )" " train: out = input * mask / ( 1.0 - dropout_prob )"
...@@ -106,21 +107,31 @@ class DropoutOpGrad : public framework::OperatorWithKernel { ...@@ -106,21 +107,31 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false, PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
"GradOp is only callable when is_test is false"); "GradOp is only callable when is_test is false");
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null."); PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) must not be null."); "Input(Out@GRAD) must not be null.");
auto x_dims = ctx->GetInputDim("X");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(x_dims, out_dims,
"Dimensions of Input(X) and Out@Grad must be the same."); ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
auto mask_dims = ctx->GetInputDim("Mask"); ctx->ShareLoD(framework::GradVarName("Out"),
PADDLE_ENFORCE_EQ(x_dims, mask_dims, /*->*/ framework::GradVarName("X"));
"Dimensions of Input(X) and Mask must be the same."); }
};
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("dropout_grad");
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetInput("Mask", Output("Mask"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
} }
}; };
...@@ -129,7 +140,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel { ...@@ -129,7 +140,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::DropoutGradOpDescMaker);
REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>, dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
......
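A worked example of the two dropout_implementation modes documented above (a sketch; the input, probability, and mask value are made up): with dropout_prob = 0.25, downgrade_in_infer multiplies by the mask during training and by (1 - 0.25) at inference, while upscale_in_train divides by (1 - 0.25) during training and leaves inference untouched.

#include <cstdio>

// Worked example of the dropout_implementation modes from DropoutOpMaker.
int main() {
  const float x = 2.0f;
  const float dropout_prob = 0.25f;
  const float mask = 1.0f;  // this element happens to be kept

  // downgrade_in_infer (default)
  float train_downgrade = x * mask;                   // 2.0
  float infer_downgrade = x * (1.0f - dropout_prob);  // 1.5

  // upscale_in_train
  float train_upscale = x * mask / (1.0f - dropout_prob);  // ~2.67
  float infer_upscale = x;                                 // 2.0

  printf("%f %f %f %f\n", train_downgrade, infer_downgrade, train_upscale,
         infer_upscale);
  return 0;
}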
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker {
protected:
std::string GetName() const override { return "FloorDiv"; }
std::string GetEquation() const override { return "Out = X // Y"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
ops::ElementwiseFloorDivOpMaker);
REGISTER_OP_CPU_KERNEL(
elementwise_floordiv,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_floordiv,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
template <typename T>
struct FloorDivFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
};
template <typename DeviceContext, typename T>
void elementwise_floor_div(const framework::ExecutionContext &ctx,
const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) {
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
}
template <typename DeviceContext, typename T>
class ElementwiseFloorDivKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<framework::LoDTensor>("X");
auto *y = ctx.Input<framework::LoDTensor>("Y");
auto *z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
// dtype of x and y is int64 or int32
elementwise_floor_div<DeviceContext, T>(ctx, x, y, z);
}
};
} // namespace operators
} // namespace paddle
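FloorDivFunctor above is plain integer division, and the op is registered only for int and int64_t. A tiny check of what that means element-wise (a sketch using a copy of the functor, not the Paddle kernel):

#include <cassert>

// FloorDivFunctor applies integer division element-wise (Out = X // Y).
template <typename T>
struct FloorDivFunctorSketch {
  T operator()(T a, T b) const { return a / b; }
};

int main() {
  FloorDivFunctorSketch<int> fdiv;
  assert(fdiv(7, 2) == 3);
  assert(fdiv(9, 3) == 3);
  return 0;
}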
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
class ElementwiseModOpMaker : public ElementwiseOpMaker {
protected:
std::string GetName() const override { return "Mod"; }
std::string GetEquation() const override { return "Out = X % Y"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp,
ops::ElementwiseModOpMaker);
REGISTER_OP_CPU_KERNEL(
elementwise_mod,
ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_mod, ops::ElementwiseModKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseModKernel<plat::CUDADeviceContext, int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
template <typename T>
struct ModFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a % b; }
};
template <typename DeviceContext, typename T>
void elementwise_mod(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
ModFunctor<T>(), z);
}
template <typename DeviceContext, typename T>
class ElementwiseModKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<framework::LoDTensor>("X");
auto *y = ctx.Input<framework::LoDTensor>("Y");
auto *z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
// dtype of x and y is int64 or int32
elementwise_mod<DeviceContext, T>(ctx, x, y, z);
}
};
} // namespace operators
} // namespace paddle
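For symmetry with the floordiv sketch, the same kind of check for ModFunctor (again a copy of the functor, not the Paddle kernel; the op is registered only for int and int64_t):

#include <cassert>

// ModFunctor applies the C++ % operator element-wise (Out = X % Y).
template <typename T>
struct ModFunctorSketch {
  T operator()(T a, T b) const { return a % b; }
};

int main() {
  ModFunctorSketch<int> mod;
  assert(mod(7, 3) == 1);
  assert(mod(9, 3) == 0);
  return 0;
}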
...@@ -33,8 +33,51 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> { ...@@ -33,8 +33,51 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
} }
}; };
template <typename T>
struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor** scales,
const int scale_num, T max_range, framework::Tensor* out) {
if (scale_num == 1) {
const int channel = in->dims()[0];
const T* scale_factor = scales[0]->data<T>();
for (int i = 0; i < channel; i++) {
T s = scale_factor[i];
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
auto& dev = *dev_ctx.eigen_device();
out_e.device(dev) = (s / max_range) * in_e;
}
} else if (scale_num == 2) {
int batch_size = in->dims()[0];
int channel = in->dims()[1];
const T* scale_one = scales[0]->data<T>();
const T* scale_two = scales[1]->data<T>();
for (int i = 0; i < batch_size; i++) {
framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
framework::slice_ddim(in->dims(), 1, in->dims().size()));
framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
framework::slice_ddim(out->dims(), 1, out->dims().size()));
for (int j = 0; j < channel; j++) {
T s = scale_one[j];
framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
auto& dev = *dev_ctx.eigen_device();
out_e.device(dev) = (s * scale_two[0] / max_range) * in_e;
}
}
}
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>; template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>; template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, float>;
template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, double>;
class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
public: public:
......
...@@ -44,8 +44,66 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> { ...@@ -44,8 +44,66 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> {
} }
}; };
template <typename T>
__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
int num, int channel, T* out) {
int tid = threadIdx.x;
int channel_size = num / channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
}
}
template <typename T>
__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
const T* scale_two, T max_range, int num,
int batch_size, int channel, T* out) {
int tid = threadIdx.x;
int channel_size = num / (batch_size * channel);
int scale_index = blockIdx.x % channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
}
}
template <typename T>
struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor** scales,
const int scale_num, T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (scale_num == 1) {
int num = in->numel();
int channel = in->dims()[0];
const T* scale_factor = scales[0]->data<T>();
int block = 1024;
int grid = channel;
DequantizeOneScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, channel, out_data);
} else if (scale_num == 2) {
int num = in->numel();
int batch_size = in->dims()[0];
int channel = in->dims()[1];
const T* scale_one = scales[0]->data<T>();
const T* scale_two = scales[1]->data<T>();
int block = 1024;
int grid = batch_size * channel;
DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_one, scale_two, max_range, num, batch_size, channel,
out_data);
}
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>; template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>; template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -28,6 +29,13 @@ struct DequantizeFunctor { ...@@ -28,6 +29,13 @@ struct DequantizeFunctor {
framework::Tensor* out); framework::Tensor* out);
}; };
template <typename DeviceContext, typename T>
struct ChannelDequantizeFunctor {
void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
const framework::Tensor** scales, const int scale_num,
T max_range, framework::Tensor* out);
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> { class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
public: public:
...@@ -54,32 +62,33 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> { ...@@ -54,32 +62,33 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
auto scales = ctx.MultiInput<framework::Tensor>("Scales"); auto scales = ctx.MultiInput<framework::Tensor>("Scales");
auto* out = ctx.Output<framework::Tensor>("Out"); auto* out = ctx.Output<framework::Tensor>("Out");
PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
"The number of first scale values must be the same with "
"first dimension value of Input(X).");
auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits"); auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
int max_range = std::pow(2, quant_bits[0] - 1) - 1; int max_range = 1;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace()); out->mutable_data<T>(dev_ctx.GetPlace());
int scale_num = scales.size();
auto dequant = DequantizeFunctor<DeviceContext, T>(); if (scale_num == 1) {
for (int64_t i = 0; i < in->dims()[0]; i++) { PADDLE_ENFORCE_EQ(
framework::Tensor one_channel_in = in->Slice(i, i + 1); scales[0]->numel(), in->dims()[0],
framework::Tensor one_channel_out = out->Slice(i, i + 1); "The number of first scale values must be the same with "
framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); "first dimension value of Input(X) when the `Scales` has only one "
dequant(dev_ctx, &one_channel_in, &one_channel_scale, "element.");
static_cast<T>(max_range), &one_channel_out); max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
} } else if (scale_num == 2) {
PADDLE_ENFORCE_EQ(
if (scales.size() == 2) { scales[0]->numel(), in->dims()[1],
"The number of first scale values must be the same with "
"second dimension value of Input(X) when the `Scales` has two "
"elements.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
scales[1]->numel(), 1, scales[1]->numel(), 1,
"The second scale tensor should only have one value at now."); "The second scale tensor should only have one value at now.");
max_range = std::pow(2, quant_bits[1] - 1) - 1; max_range *= (std::pow(2, quant_bits[0] - 1) - 1) *
dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out); (std::pow(2, quant_bits[1] - 1) - 1);
} }
ChannelDequantizeFunctor<DeviceContext, T>()(
dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
} }
}; };
......
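To make the new max_range computation concrete (a worked sketch with assumed quant_bits): with one scale tensor and quant_bits = {8}, max_range = 2^7 - 1 = 127; with two scale tensors and quant_bits = {8, 8}, max_range = 127 * 127 = 16129, and each element is then dequantized as in * scale / max_range (one scale) or in * scale_one[c] * scale_two[0] / max_range (two scales).

#include <cassert>
#include <cmath>
#include <vector>

// Sketch of the max_range logic in FakeChannelWiseDequantizeMaxAbsKernel.
int MaxRange(const std::vector<int>& quant_bits, int scale_num) {
  int max_range = 1;
  if (scale_num == 1) {
    max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
  } else if (scale_num == 2) {
    max_range *= (std::pow(2, quant_bits[0] - 1) - 1) *
                 (std::pow(2, quant_bits[1] - 1) - 1);
  }
  return max_range;
}

int main() {
  assert(MaxRange({8}, 1) == 127);
  assert(MaxRange({8, 8}, 2) == 127 * 127);
  return 0;
}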
...@@ -37,6 +37,21 @@ struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> { ...@@ -37,6 +37,21 @@ struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>; template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, const T* in,
const int num, const int channel, T* out) {
const int channel_size = num / channel;
for (int i = 0; i < channel; i++) {
auto* start = in + i * channel_size;
auto* end = in + (i + 1) * channel_size;
out[i] = std::abs(*(std::max_element(start, end, Compare<T>())));
}
}
};
template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx,
...@@ -53,6 +68,36 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int channel,
framework::Tensor* out) {
auto* scale_data = scale.data<T>();
auto* in_data = in.data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
const int channel_size = in.numel() / channel;
platform::Transform<platform::CPUDeviceContext> trans;
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
auto* start = in_data + i * channel_size;
auto* end = in_data + (i + 1) * channel_size;
trans(ctx, start, end, out_data + i * channel_size,
ClipFunctor<T>(-s, s));
}
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
}
}
};
template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
float>;
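The two functors above implement the forward channel-wise path: find each channel's absolute maximum, then clip to [-s, s] and map onto bin_cnt integer levels. A minimal host-side sketch of that step, under the assumption of a flat [channel, width] layout (names are illustrative):

// Minimal host-side sketch of channel-wise fake quantization: clip each value
// to [-s, s] with the channel's abs-max s, rescale by bin_cnt / s, then round.
// The flat [channel, width] layout and names are illustrative assumptions.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void FakeQuantPerChannel(const std::vector<float>& in,
                         const std::vector<float>& scales, int64_t channels,
                         int64_t width, int bin_cnt, std::vector<float>* out) {
  out->resize(in.size());
  for (int64_t c = 0; c < channels; ++c) {
    const float s = scales[c];  // per-channel absolute maximum found beforehand
    for (int64_t j = 0; j < width; ++j) {
      float v = std::min(std::max(in[c * width + j], -s), s);
      (*out)[c * width + j] = std::round(bin_cnt / s * v);
    }
  }
}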
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx,
...@@ -169,10 +214,10 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
        ctx->HasOutput("Out"),
        "Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("OutScales"),
        "Output(Scales) of FakeChannelWiseQuantizeOp should not be null.");
        ctx->HasOutput("OutScale"),
        "Output(Scale) of FakeChannelWiseQuantizeOp should not be null.");
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
    ctx->ShareLoD("X", /*->*/ "Out");
  }
...@@ -192,7 +237,7 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
    AddOutput("Out",
              "(Tensor) Output of quantized low level tensor, "
              "but also saved as float data type.");
    AddOutput("OutScales", "(Tensor) Current channel wise scale");
    AddOutput("OutScale", "(Tensor) Current channel wise scale");
    AddAttr<int>("bit_length", "(int, default 8)")
        .SetDefault(8)
        .AddCustomChecker([](const int& bit_length) {
......
...@@ -74,6 +74,45 @@ struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
extern __shared__ T shared_max_data[];
shared_max_data[tid] = T(0);
for (int i = tid; i < channel_size; i += blockDim.x) {
T tmp = fabs(in_c[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx, const T* in,
const int num, const int channel, T* out) {
int block = 1024;
int grid = channel;
FindChannelAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
in, num, channel, out);
}
};
template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
__global__ void ClipAndQuantKernel(const T* in, const T* scale,
                                   const int bin_cnt, const int n, T* out) {
...@@ -82,14 +121,76 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale,
  T s = scale[0];
  for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
    T x = in[bid];
    T x = in[i];
    T v = x > s ? s : x;
    v = v < -s ? -s : v;
    v = bin_cnt / s * v;
    out[bid] = round(v);
    out[i] = round(v);
  }
}
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
template <typename T>
__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
const int bin_cnt, const int n,
const int c, T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt / s * v;
out_c[i] = round(v);
}
}
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int channel,
framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = channel;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ChannelClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, channel, out_data);
}
};
template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
float>;
template <typename T>
__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
                                            const T* last_scale,
...@@ -182,26 +283,6 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
                                               float>;
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
}  // namespace operators
}  // namespace paddle
......
...@@ -42,6 +42,19 @@ struct FindRangeAbsMaxFunctor {
                  framework::Tensor* scales_arr, framework::Tensor* out_scale);
};
template <typename DeviceContext, typename T>
struct FindChannelAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const T* in, const int num,
const int channel, T* out);
};
template <typename DeviceContext, typename T>
struct ChannelClipAndFakeQuantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
const int channel, framework::Tensor* out);
};
template <typename DeviceContext, typename T>
struct FindMovingAverageAbsMaxFunctor {
  void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
...@@ -78,29 +91,18 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
    auto* in = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
    auto* out_scales = context.Output<framework::Tensor>("OutScales");
    T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
    auto* out_scale = context.Output<framework::Tensor>("OutScale");
    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
    out->mutable_data<T>(context.GetPlace());
    int bit_length = context.Attr<int>("bit_length");
    int bin_cnt = std::pow(2, bit_length - 1) - 1;
    auto& dev_ctx = context.template device_context<DeviceContext>();
    auto find_abs_max = FindAbsMaxFunctor<DeviceContext, T>();
    for (int64_t i = 0; i < in->dims()[0]; i++) {
      framework::Tensor one_channel = in->Slice(i, i + 1);
      const T* one_channel_data = one_channel.data<T>();
      find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
                   &out_scales_data[i]);
    }
    auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
    for (int64_t i = 0; i < in->dims()[0]; i++) {
      framework::Tensor one_channel_in = in->Slice(i, i + 1);
      framework::Tensor one_channel_out = out->Slice(i, i + 1);
      framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
      clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
                 &one_channel_out);
    }
    FindChannelAbsMaxFunctor<DeviceContext, T>()(
        dev_ctx, in->data<T>(), in->numel(), in->dims()[0], out_scale_data);
    ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
        dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out);
  }
};
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/layer_norm_op.h"
#include <memory>
namespace paddle {
namespace operators {
...@@ -133,7 +134,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
    }
    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
      ctx->SetOutputDim(framework::GradVarName("Bias"),
                        ctx->GetInputDim("Bias"));
                        ctx->GetInputDim("Scale"));
    }
  }
...@@ -157,12 +158,39 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
  }
};
class LayerNormGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("layer_norm_grad");
op->SetInput("X", Input("X"));
op->SetInput("Mean", Output("Mean"));
op->SetInput("Variance", Output("Variance"));
if (ForwardOp().Inputs().count("Scale") > 0) {
op->SetInput("Scale", Input("Scale"));
op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
}
if (ForwardOp().Inputs().count("Bias") > 0) {
op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
}
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
                  ops::LayerNormGradOpDescMaker);
REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp);
REGISTER_OP_CPU_KERNEL(
    layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -245,11 +245,9 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    auto x = *ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* mean = ctx.Input<Tensor>("Mean");
    auto* var = ctx.Input<Tensor>("Variance");
    auto* scale = ctx.Input<Tensor>("Scale");
    auto* bias = ctx.Input<Tensor>("Bias");
    auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
...@@ -275,18 +273,13 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
      x.Resize(matrix_shape);
      temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
      if (!(bias && scale)) {
        temp_norm.ShareDataWith(*y);
        temp_norm.Resize(matrix_shape);
      } else {
        temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
        // get x_norm
        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
            ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
        ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
            ctx, &temp_norm, var, /*axis*/ 0,
            DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
      }
      temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
      // get x_norm
      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
          ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
          ctx, &temp_norm, var, /*axis*/ 0,
          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
    }
    if (d_bias) {
......
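The backward change above drops the dependency on Y and Bias and always recomputes x_norm from X, Mean and Variance. A minimal sketch of that recomputation, x_norm = (x - mean) / sqrt(variance + epsilon) for each normalized row; buffer names and the flat [rows, cols] layout are illustrative assumptions.

// Minimal sketch of the x_norm recomputation the backward kernel now performs.
#include <cmath>
#include <vector>

void RecomputeXNorm(const std::vector<float>& x, const std::vector<float>& mean,
                    const std::vector<float>& variance, int rows, int cols,
                    float epsilon, std::vector<float>* x_norm) {
  x_norm->resize(x.size());
  for (int r = 0; r < rows; ++r) {
    const float inv_std = 1.0f / std::sqrt(variance[r] + epsilon);
    for (int c = 0; c < cols; ++c) {
      // Normalize every element of the row with that row's statistics.
      (*x_norm)[r * cols + c] = (x[r * cols + c] - mean[r]) * inv_std;
    }
  }
}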
...@@ -11,89 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/load_combine_op.h"

namespace paddle {
namespace operators {

class LoadCombineOp : public framework::OperatorBase {
 public:
  LoadCombineOp(const std::string &type,
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");
auto load_as_fp16 = Attr<bool>("load_as_fp16");
auto model_from_memory = Attr<bool>("model_from_memory");
auto out_var_names = Outputs("Out");
PADDLE_ENFORCE_GT(
static_cast<int>(out_var_names.size()), 0,
"The number of output variables should be greater than 0.");
if (!model_from_memory) {
std::ifstream fin(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin),
"Cannot open file %s for load_combine op", filename);
LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
} else {
PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
std::stringstream fin(filename, std::ios::in | std::ios::binary);
LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
}
}
void LoadParamsFromBuffer(
const framework::Scope &scope, const platform::Place &place,
std::istream *buffer, bool load_as_fp16,
const std::vector<std::string> &out_var_names) const {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
for (size_t i = 0; i < out_var_names.size(); i++) {
auto *out_var = scope.FindVar(out_var_names[i]);
PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
out_var_names[i]);
auto *tensor = out_var->GetMutable<framework::LoDTensor>();
// Error checking
PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
// Get data from fin to tensor
DeserializeFromStream(*buffer, tensor, dev_ctx);
auto in_dtype = tensor->type();
auto out_dtype =
load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
// convert to float16 tensor
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor fp16_tensor;
// copy LoD info to the new tensor
fp16_tensor.set_lod(tensor->lod());
framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
&fp16_tensor);
// reset output tensor
out_var->Clear();
tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
}
}
buffer->peek();
PADDLE_ENFORCE(buffer->eof(),
"You are not allowed to load partial data via "
"load_combine_op, use load_op instead.");
  }
};

class LoadCombineOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {}

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    framework::OpKernelType kt = framework::OpKernelType(
        framework::proto::VarType::FP32, ctx.GetPlace());
    return kt;
  }
};
...@@ -124,21 +62,30 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
LoadCombine Operator.

LoadCombine operator loads LoDTensor variables from a file, which could be
loaded in memory already. The file should contain one or more LoDTensors
serialized using the SaveCombine operator. The
LoadCombine operator applies a deserialization strategy to appropriately load
the LodTensors, and this strategy complements the serialization strategy used
in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
with the SaveCombine operator, and can only deserialize one or more LoDTensors
that were saved using the SaveCombine operator.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
                  ops::LoadCombineOpProtoMaker);
REGISTER_OP_CPU_KERNEL(
load_combine,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/load_combine_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
load_combine,
ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class LoadCombineOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
auto filename = ctx.Attr<std::string>("file_path");
auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
auto model_from_memory = ctx.Attr<bool>("model_from_memory");
auto &out_var_names = ctx.Outputs("Out");
PADDLE_ENFORCE_GT(
static_cast<int>(out_var_names.size()), 0,
"The number of output variables should be greater than 0.");
if (!model_from_memory) {
std::ifstream fin(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin),
"Cannot open file %s for load_combine op", filename);
LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
} else {
PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
std::stringstream fin(filename, std::ios::in | std::ios::binary);
LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
}
}
void LoadParamsFromBuffer(
const framework::ExecutionContext &context, const platform::Place &place,
std::istream *buffer, bool load_as_fp16,
const std::vector<std::string> &out_var_names) const {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
auto out_vars = context.MultiOutputVar("Out");
for (size_t i = 0; i < out_var_names.size(); i++) {
PADDLE_ENFORCE(out_vars[i] != nullptr,
"Output variable %s cannot be found", out_var_names[i]);
auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
// Error checking
PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
// Get data from fin to tensor
DeserializeFromStream(*buffer, tensor, dev_ctx);
auto in_dtype = tensor->type();
auto out_dtype =
load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
// convert to float16 tensor
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor fp16_tensor;
// copy LoD info to the new tensor
fp16_tensor.set_lod(tensor->lod());
framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
&fp16_tensor);
// reset output tensor
out_vars[i]->Clear();
tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
}
}
buffer->peek();
PADDLE_ENFORCE(buffer->eof(),
"You are not allowed to load partial data via "
"load_combine_op, use load_op instead.");
}
};
} // namespace operators
} // namespace paddle
...@@ -11,89 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
#include <string>
#include "paddle/fluid/operators/load_op.h"

namespace paddle {
namespace operators {

class LoadOp : public framework::OperatorBase {
 public:
  LoadOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
auto filename = Attr<std::string>("file_path");
std::ifstream fin(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
filename);
    auto out_var_name = Output("Out");
auto *out_var = scope.FindVar(out_var_name);
PADDLE_ENFORCE(out_var != nullptr,
"Output variable %s cannot be found in scope %p",
out_var_name, &scope);
    if (out_var->IsType<framework::LoDTensor>()) {
      LoadLodTensor(fin, place, out_var);
    } else if (out_var->IsType<framework::SelectedRows>()) {
      LoadSelectedRows(fin, place, out_var);
    } else {
      PADDLE_ENFORCE(
false,
"Load only support LoDTensor and SelectedRows, %s has wrong type",
out_var_name);
}
}
void LoadLodTensor(std::istream &fin, const platform::Place &place,
framework::Variable *var) const {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
auto *tensor = var->GetMutable<framework::LoDTensor>();
DeserializeFromStream(fin, tensor, dev_ctx);
auto load_as_fp16 = Attr<bool>("load_as_fp16");
auto in_dtype = tensor->type();
auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
// convert to float16 tensor
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor fp16_tensor;
// copy LoD info to the new tensor
fp16_tensor.set_lod(tensor->lod());
framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
&fp16_tensor);
// reset output tensor
var->Clear();
tensor = var->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
}
}
void LoadSelectedRows(std::istream &fin, const platform::Place &place,
framework::Variable *var) const {
auto *selectedRows = var->GetMutable<framework::SelectedRows>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
selectedRows->SyncIndex();
  }
};

class LoadOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {}

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    framework::OpKernelType kt = framework::OpKernelType(
        framework::proto::VarType::FP32, platform::CPUPlace());
    return kt;
  }
};
...@@ -116,8 +53,15 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
             "file.");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
REGISTER_OP_CPU_KERNEL(
load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/load_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
load, ops::LoadOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::LoadOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <string>
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class LoadOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
auto filename = ctx.Attr<std::string>("file_path");
std::ifstream fin(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
filename);
auto out_var_name = ctx.Outputs("Out").data();
auto *out_var = ctx.OutputVar("Out");
PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found ",
out_var_name);
PADDLE_ENFORCE(out_var != nullptr, "Output variable cannot be found ");
if (out_var->IsType<framework::LoDTensor>()) {
LoadLodTensor(fin, place, out_var, ctx);
} else if (out_var->IsType<framework::SelectedRows>()) {
LoadSelectedRows(fin, place, out_var);
} else {
PADDLE_ENFORCE(
false,
"Load only support LoDTensor and SelectedRows, %s has wrong type",
out_var_name);
}
}
void LoadLodTensor(std::istream &fin, const platform::Place &place,
framework::Variable *var,
const framework::ExecutionContext &ctx) const {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
auto *tensor = var->GetMutable<framework::LoDTensor>();
DeserializeFromStream(fin, tensor, dev_ctx);
auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
auto in_dtype = tensor->type();
auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
// convert to float16 tensor
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor fp16_tensor;
// copy LoD info to the new tensor
fp16_tensor.set_lod(tensor->lod());
framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
&fp16_tensor);
// reset output tensor
var->Clear();
tensor = var->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor);
}
}
void LoadSelectedRows(std::istream &fin, const platform::Place &place,
framework::Variable *var) const {
auto *selectedRows = var->GetMutable<framework::SelectedRows>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
selectedRows->SyncIndex();
}
};
} // namespace operators
} // namespace paddle
...@@ -32,7 +32,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
      PADDLE_ENFORCE_GT(level0.size(), 1,
                        "If Input(Y) not provided, the target lod should be "
                        "specified by attribute `target_lod`.");
    } else {
      ctx->ShareLoD("Y", "Out");
    }
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  }
......
...@@ -78,12 +78,6 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
        "The numel of 'pad_value' can only be 1 or be equal to the "
        "'step_width'.");
if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor);
pad_tensor->Resize(pad_tensor_dims);
return;
}
    const int kBlockSize = 512;

    /* At least use 32 threads to copy sequence_width elements,
...@@ -129,12 +123,13 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
    CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
              step_width, layout);
    /*
    if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) {
      TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor);
      seq_tensor->Resize(seq_tensor_dims);
      return;
    }
    */
    const int kBlockSize = 512;
......
...@@ -290,8 +290,10 @@ class MatMulOp : public framework::OperatorWithKernel {
                        context->Attrs().Get<bool>("transpose_Y"));
    PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
    PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
                   mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
    if (context->IsRuntime()) {
      PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
                     mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
    }
    std::vector<int64_t> dim_out;
    if (mat_dim_x.batch_size_ != 0) {
      dim_out = framework::vectorize(dim_x);
......
...@@ -15,6 +15,7 @@ limitations under the License. */
#include <memory>
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
...@@ -38,15 +39,20 @@ static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
}

static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
                                                const mkldnn::engine& engine) {
  constexpr auto data_type = mkldnn::memory::f32;
                                                const mkldnn::engine& engine,
                                                const memory::data_type& dt) {
  const auto dims = paddle::framework::vectorize2int(input.dims());
  const auto format = input.format();
  auto description = memory::desc(dims, data_type, format);
  auto description = memory::desc(dims, dt, format);
  auto mem_prim_desc = memory::primitive_desc(description, engine);
  return mem_prim_desc;
}
static mkldnn::memory::format GetDstMemFormat(
const concat::primitive_desc& concat_pd) {
return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
}
static platform::CPUPlace GetCpuPlace(
    const paddle::framework::ExecutionContext& ctx) {
  auto place = ctx.GetPlace();
...@@ -61,14 +67,30 @@ static const mkldnn::engine& GetMKLDNNEngine(
  return dev_ctx.GetEngine();
}
std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
const std::vector<const Tensor*> multi_input,
const int64_t& concat_axis, const memory::data_type& dt) {
std::string key;
key.reserve(platform::MKLDNNHandler::MaxKeyLength);
for (size_t i = 0; i < multi_input.size(); i++) {
platform::MKLDNNHandler::AppendKeyDims(
&key, paddle::framework::vectorize2int(multi_input[i]->dims()));
}
platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis));
platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out"));
platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
return key;
}
template <typename T>
class ConcatPrimitiveFactory {
 public:
  concat::primitive_desc CreateConcatPrimDescriptor(
      const std::vector<const Tensor*> multi_input, Tensor* output,
      int concat_axis, const mkldnn::engine& mkldnn_engine) {
    CreateSourcesDescriptors(multi_input, mkldnn_engine);
    auto dst_desc = CreateDstMemDescriptor(output);
      int concat_axis, const mkldnn::engine& mkldnn_engine,
      const memory::data_type& dt = memory::data_type::f32) {
    CreateSourcesDescriptors(multi_input, mkldnn_engine, dt);
    auto dst_desc = CreateDstMemDescriptor(output, dt);
    return concat::primitive_desc(dst_desc, concat_axis, srcs_pd);
  }
...@@ -79,23 +101,39 @@ class ConcatPrimitiveFactory {
    return concat(concat_pd, inputs, dst_mem.get());
  }
void SetSrcDataHandleByIndex(const std::vector<memory>& srcs, const size_t& i,
void* handler) {
srcs[i].set_data_handle(handler);
}
void SetDstDataHandle(const memory& dst_mem, void* handler) {
dst_mem.set_data_handle(handler);
}
std::vector<memory> GetSrcs() { return srcs; }
memory GetDst() { return dst_mem.get(); }
 private:
  memory::desc CreateDstMemDescriptor(Tensor* output) {
    auto dst_dims = paddle::framework::vectorize2int(output->dims());
    return memory::desc(dst_dims, platform::MKLDNNGetDataType<T>(),
                        memory::format::any);
  }
  memory::desc CreateDstMemDescriptor(Tensor* output,
                                      const memory::data_type& dt) {
    auto dst_dims = paddle::framework::vectorize2int(output->dims());
    return memory::desc(dst_dims, dt, memory::format::any);
  }

  mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
                                 Tensor* output, platform::CPUPlace place) {
                                 Tensor* output,
                                 const platform::CPUPlace& place) {
    return memory(concat_pd.dst_primitive_desc(),
                  output->mutable_data<T>(place));
  }

  void CreateSourcesDescriptors(const std::vector<const Tensor*> multi_input,
                                const mkldnn::engine& mkldnn_engine) {
                                const mkldnn::engine& mkldnn_engine,
                                const memory::data_type& dt) {
    for (size_t i = 0; i < multi_input.size(); i++) {
      auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine);
      auto mem_prim_desc =
          CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt);
      srcs_pd.push_back(mem_prim_desc);
      srcs.push_back(
          memory(mem_prim_desc, to_void_cast(multi_input[i]->data<T>())));
...@@ -120,21 +158,59 @@ template <typename T>
class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    auto place = GetCpuPlace(ctx);
    const auto& mkldnn_engine = GetMKLDNNEngine(ctx);
    auto multi_input = ctx.MultiInput<Tensor>("X");
    EnforceLayouts(multi_input);
    Tensor* output = ctx.Output<Tensor>("Out");
    int64_t concat_axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
auto place = GetCpuPlace(ctx);
memory::data_type dt =
paddle::framework::ToMKLDNNDataType(multi_input[0]->type());
    ConcatPrimitiveFactory<T> prim_creator;
    auto concat_pd = prim_creator.CreateConcatPrimDescriptor(
        multi_input, output, static_cast<int>(concat_axis), mkldnn_engine);
    auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
    stream(stream::kind::eager).submit({concat}).wait();
    std::string key = CreateKey(ctx, multi_input, concat_axis, dt);
    const std::string key_prim = key + "@concat_p";
    const std::string key_concat_pd = key + "@concat_pd";
    const std::string key_srcs = key + "@concat_srcs";
const std::string key_dst = key + "@concat_dst";
std::shared_ptr<concat::primitive_desc> concat_pd;
std::shared_ptr<std::vector<memory>> srcs;
std::shared_ptr<memory> dst_mem;
auto concat_p = std::static_pointer_cast<concat>(dev_ctx.GetBlob(key_prim));
if (concat_p == nullptr) {
const auto& mkldnn_engine = dev_ctx.GetEngine();
concat_pd = std::make_shared<concat::primitive_desc>(
prim_creator.CreateConcatPrimDescriptor(multi_input, output,
static_cast<int>(concat_axis),
mkldnn_engine, dt));
concat_p = std::make_shared<concat>(
prim_creator.CreateConcatPrimitive(*concat_pd, output, place));
srcs = std::make_shared<std::vector<memory>>(prim_creator.GetSrcs());
dst_mem = std::make_shared<memory>(prim_creator.GetDst());
dev_ctx.SetBlob(key_prim, concat_p);
dev_ctx.SetBlob(key_concat_pd, concat_pd);
dev_ctx.SetBlob(key_srcs, srcs);
dev_ctx.SetBlob(key_dst, dst_mem);
} else {
srcs = std::static_pointer_cast<std::vector<memory>>(
dev_ctx.GetBlob(key_srcs));
dst_mem = std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_dst));
concat_pd = std::static_pointer_cast<concat::primitive_desc>(
dev_ctx.GetBlob(key_concat_pd));
for (size_t i = 0; i < multi_input.size(); i++) {
prim_creator.SetSrcDataHandleByIndex(
*srcs, i, to_void_cast<T>(multi_input[i]->data<T>()));
}
prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data<T>(place));
}
stream(stream::kind::eager).submit({*concat_p}).wait();
    output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc());
    output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc());
  }
};

}  // namespace operators
...@@ -143,4 +219,6 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ConcatMKLDNNOpKernel<float>)
                   ops::ConcatMKLDNNOpKernel<float>,
ops::ConcatMKLDNNOpKernel<int8_t>,
ops::ConcatMKLDNNOpKernel<uint8_t>);
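The kernel above avoids rebuilding the MKL-DNN concat primitive on every call by stashing the primitive, its descriptor, and the source/destination memories in the device context under a string key derived from the input shapes, axis, output name, and data type. A minimal sketch of that cache-by-key pattern; the map and helper here are illustrative stand-ins for the dev_ctx.GetBlob()/SetBlob() calls used in the kernel, not Paddle's actual API.

// Minimal sketch of the cache-by-key reuse pattern, assuming a process-wide
// blob map; GetOrCreate is an illustrative helper, not part of the framework.
#include <memory>
#include <string>
#include <unordered_map>

static std::unordered_map<std::string, std::shared_ptr<void>> blob_cache;

template <typename T, typename Factory>
std::shared_ptr<T> GetOrCreate(const std::string& key, Factory make) {
  auto it = blob_cache.find(key);
  if (it != blob_cache.end()) {
    // Cache hit: reuse the object built on a previous invocation.
    return std::static_pointer_cast<T>(it->second);
  }
  std::shared_ptr<T> obj = make();  // cache miss: construct once and store
  blob_cache[key] = obj;
  return obj;
}

Keying on shapes, axis, and data type ensures a cached primitive is only reused when it would otherwise be reconstructed identically; only the data handles need to be refreshed on a hit.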
...@@ -92,12 +92,10 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
  int size = ops->size();
  int left = 0;
  while (left < size && ops->at(left)->Type() != framework::kFeedOpType) {
  while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
         ops->at(left)->Type() != framework::kFetchOpType) {
    ++left;
  }
if (left == size) {
return intervals;
}
  while (left < size && ops->at(left)->Type() == framework::kFeedOpType) {
    for (auto& var_name_item : ops->at(left)->Outputs()) {
...@@ -112,10 +110,6 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
  while (right < size && ops->at(right)->Type() != framework::kFetchOpType) {
    ++right;
  }
if (right == size) {
return intervals;
}
if (left >= right) return intervals;
  int index = right;
  while (index < size && ops->at(index)->Type() == framework::kFetchOpType) {
...@@ -127,6 +121,10 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
    ++index;
  }
if (left == size || ops->at(left)->Type() == framework::kFetchOpType) {
left = 0;
}
  // (left, right - 1) represents indices between feed and fetch
  int pivot = left;
  while (pivot < right) {
...@@ -234,6 +232,7 @@ NgraphEngine::NgraphEngine(const framework::Scope& scope,
}

void NgraphEngine::Prepare(const std::vector<int>& interval) {
bool has_fetch = false, is_full = false;
  for (auto& var : p_bdesc->AllVars()) {
    if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
          var->GetType() == framework::proto::VarType::LOD_TENSOR ||
...@@ -264,6 +263,9 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
  std::vector<paddle::framework::OpDesc*> ops_desc;
  for (auto op_desc : p_bdesc->AllOps()) {
    ops_desc.emplace_back(op_desc);
if (op_desc->Type() == framework::kFetchOpType) {
has_fetch = true;
}
  }

  for (auto op_desc : ops_desc) {
...@@ -276,11 +278,11 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
  if (interval[0] > 0 &&
      ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType &&
      interval[1] < static_cast<int>(ops_desc.size()) &&
      ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
    this->op_state_ = OpState::FULL;
      ops_desc.at(interval[1])->Type() == framework::kFetchOpType) {
    is_full = true;
  }

  if (this->op_state_ == OpState::FULL) {
  if (is_full) {
    this->op_state_ = this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN;
  } else {
    this->op_state_ =
...@@ -293,7 +295,8 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
        framework::OpRegistry::CreateOp(*(ops_desc[idx])));
    ++idx;
  }

  while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
  while (idx < static_cast<int>(ops_desc.size()) &&
ops_desc.at(idx)->Type() != framework::kFetchOpType) {
    auto op_desc = ops_desc.at(idx);
    for (auto& var_name_item : op_desc->Inputs()) {
      for (auto& var_name : var_name_item.second) {
...@@ -303,6 +306,10 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
    ++idx;
  }
if (!has_fetch) {
op_state_ = OpState::UNKNOWN;
}
  BuildNgIO(ops_desc, interval);
}
...@@ -318,7 +325,8 @@ void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
      const bool is_output = outputs.find(var_name) != outputs.end();
      if (!is_output &&
          std::find(var_in_.begin(), var_in_.end(), var_name) ==
              var_in_.end()) {
              var_in_.end() &&
scope_.FindVar(var_name)) {
        // fill var_in here to keep lhs and rhs order
        this->var_in_.emplace_back(var_name);
      }
...@@ -378,6 +386,7 @@ void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
      }
    }
  }

  for (size_t i = 0; i < var_in_.size(); ++i) {
    auto var_name = var_in_[i];
    if (persistables_.find(var_name) == persistables_.end()) {
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#pragma once

#include <memory>
#include <set>
#include <string>
...@@ -35,7 +34,6 @@ enum class OpState { /* nGraph support state on ops */
  PARTIAL_TRAIN, /* Support partial ops for train */
  FULL_TEST,     /* Support full list of ops for test */
  PARTIAL_TEST,  /* Support partial list of ops for test */
FULL, /* All ops supported from feed to fetch */
  UNKNOWN        /* Output all for debug purpose */
};
...@@ -119,4 +118,3 @@ class NgraphEngine {
}  // namespace operators
}  // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
...@@ -27,13 +27,9 @@ namespace paddle {
namespace operators {
namespace ngraphs {

void BuildCrossEntropyNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
std::shared_ptr<ngraph::Node> GetCrossEntropy(
    std::shared_ptr<ngraph::Node> x, std::shared_ptr<ngraph::Node> label,
    const bool is_soft_label, int ignore_index) {
  auto label_shape = label->get_shape();
  auto x_shape = x->get_shape();
  auto label_rank = label_shape.size();
...@@ -46,18 +42,16 @@ void BuildCrossEntropyNode( ...@@ -46,18 +42,16 @@ void BuildCrossEntropyNode(
label_2d = paddle::platform::NgReshaper(label, label_2d_shape); label_2d = paddle::platform::NgReshaper(label, label_2d_shape);
} }
if (x_rank > 2) { if (x_rank > 2) {
x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1); x_2d_shape = platform::FlattenTo2d(x_shape, x_rank - 1);
x_2d = paddle::platform::NgReshaper(x, x_2d_shape); x_2d = platform::NgReshaper(x, x_2d_shape);
} }
auto batch_size = x_2d_shape.at(0); auto batch_size = x_2d_shape.at(0);
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
const bool is_soft_label = op_attrs.Get<bool>("soft_label");
std::shared_ptr<ngraph::Node> node_1_hot = label_2d; std::shared_ptr<ngraph::Node> node_1_hot = label_2d;
if (!is_soft_label) { if (!is_soft_label) {
auto label_1d = paddle::platform::NgReshaper( auto label_1d =
label_2d, ngraph::Shape{label_2d_shape.at(0)}); platform::NgReshaper(label_2d, ngraph::Shape{label_2d_shape.at(0)});
node_1_hot = std::make_shared<ngraph::op::OneHot>(label_1d, x_2d_shape, 1); node_1_hot = std::make_shared<ngraph::op::OneHot>(label_1d, x_2d_shape, 1);
} }
if (x->get_element_type() != node_1_hot->get_element_type()) { if (x->get_element_type() != node_1_hot->get_element_type()) {
...@@ -76,11 +70,9 @@ void BuildCrossEntropyNode( ...@@ -76,11 +70,9 @@ void BuildCrossEntropyNode(
auto node_sum = auto node_sum =
std::make_shared<ngraph::op::Sum>(node_mul, ngraph::AxisSet{1}); std::make_shared<ngraph::op::Sum>(node_mul, ngraph::AxisSet{1});
auto node_neg = std::make_shared<ngraph::op::Negative>(node_sum); auto node_neg = std::make_shared<ngraph::op::Negative>(node_sum);
auto xe = auto xe = platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});
paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1});
if (!is_soft_label) { if (!is_soft_label) {
auto ignore_index = op_attrs.Get<int>("ignore_index");
auto ignore_node = ngraph::op::Constant::create( auto ignore_node = ngraph::op::Constant::create(
label->get_element_type(), label_2d_shape, {ignore_index}); label->get_element_type(), label_2d_shape, {ignore_index});
auto not_equal_node = auto not_equal_node =
...@@ -89,21 +81,13 @@ void BuildCrossEntropyNode( ...@@ -89,21 +81,13 @@ void BuildCrossEntropyNode(
xe->get_element_type()); xe->get_element_type());
xe = xe * mask; xe = xe * mask;
} }
return xe;
paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
} }
void BuildCrossEntropyGradNode( std::shared_ptr<ngraph::Node> GetCrossEntropyGrad(
const std::shared_ptr<paddle::framework::OperatorBase>& op, std::shared_ptr<ngraph::Node> x, std::shared_ptr<ngraph::Node> label,
std::shared_ptr< std::shared_ptr<ngraph::Node> dy, const bool is_soft_label,
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> int ignore_index) {
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
const bool is_soft_label = op_attrs.Get<bool>("soft_label");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
auto x_shape = x->get_shape(); auto x_shape = x->get_shape();
auto rank = x_shape.size(); auto rank = x_shape.size();
...@@ -111,9 +95,8 @@ void BuildCrossEntropyGradNode( ...@@ -111,9 +95,8 @@ void BuildCrossEntropyGradNode(
if (!is_soft_label) { if (!is_soft_label) {
auto label_shape = label->get_shape(); auto label_shape = label->get_shape();
label_shape.pop_back(); label_shape.pop_back();
label = paddle::platform::NgReshaper(label, label_shape); label = platform::NgReshaper(label, label_shape);
auto ignore_index = op_attrs.Get<int>("ignore_index");
auto ignore_node = ngraph::op::Constant::create( auto ignore_node = ngraph::op::Constant::create(
label->get_element_type(), label_shape, {ignore_index}); label->get_element_type(), label_shape, {ignore_index});
auto not_equal_node = auto not_equal_node =
...@@ -128,7 +111,7 @@ void BuildCrossEntropyGradNode( ...@@ -128,7 +111,7 @@ void BuildCrossEntropyGradNode(
auto dy_shape = dy->get_shape(); auto dy_shape = dy->get_shape();
dy_shape.pop_back(); dy_shape.pop_back();
auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape); auto dy_reshape = platform::NgReshaper(dy, dy_shape);
auto dy_bcast = std::make_shared<ngraph::op::Broadcast>( auto dy_bcast = std::make_shared<ngraph::op::Broadcast>(
dy_reshape, x_shape, ngraph::AxisSet{rank - 1}); dy_reshape, x_shape, ngraph::AxisSet{rank - 1});
if (x->get_element_type() != label->get_element_type()) { if (x->get_element_type() != label->get_element_type()) {
...@@ -140,7 +123,35 @@ void BuildCrossEntropyGradNode( ...@@ -140,7 +123,35 @@ void BuildCrossEntropyGradNode(
if (!is_soft_label) { if (!is_soft_label) {
xe_grad = xe_grad * mask; xe_grad = xe_grad * mask;
} }
return xe_grad;
}
void BuildCrossEntropyNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
const bool is_soft_label = op_attrs.Get<bool>("soft_label");
int ignore_index = op_attrs.Get<int>("ignore_index");
auto xe = GetCrossEntropy(x, label, is_soft_label, ignore_index);
paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map);
}
void BuildCrossEntropyGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
const bool is_soft_label = op_attrs.Get<bool>("soft_label");
int ignore_index = op_attrs.Get<int>("ignore_index");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map);
auto xe_grad = GetCrossEntropyGrad(x, label, dy, is_soft_label, ignore_index);
paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map); paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map);
} }
} // namespace ngraphs } // namespace ngraphs
......
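Note: the refactor above turns the node builder into a reusable GetCrossEntropy helper. As a rough illustration of the arithmetic that graph encodes for hard labels (one-hot selection, -log of the ground-truth probability, and an ignore_index mask), the following is a minimal plain-C++ sketch on flat arrays; it assumes a [batch, classes] layout and is not part of the patch or of nGraph.

```cpp
// Minimal plain-C++ sketch of the hard-label cross entropy the nGraph graph
// above builds: xe[i] = -log(x[i][label[i]]), masked to 0 when label[i] equals
// ignore_index. Shapes are assumed to be [batch, classes]; no nGraph is used.
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> CrossEntropyHardLabel(const std::vector<float>& x,
                                         const std::vector<int>& label,
                                         std::size_t batch, std::size_t classes,
                                         int ignore_index) {
  std::vector<float> xe(batch, 0.f);
  for (std::size_t i = 0; i < batch; ++i) {
    if (label[i] == ignore_index) continue;  // corresponds to the mask node
    // the one-hot node selects the probability of the ground-truth class
    xe[i] = -std::log(x[i * classes + static_cast<std::size_t>(label[i])]);
  }
  return xe;
}
```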
...@@ -27,12 +27,7 @@ namespace paddle { ...@@ -27,12 +27,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace ngraphs { namespace ngraphs {
void BuildSoftmaxNode( std::shared_ptr<ngraph::Node> GetSoftmax(std::shared_ptr<ngraph::Node> x) {
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto x_shape = x->get_shape(); auto x_shape = x->get_shape();
int rank = x_shape.size(); int rank = x_shape.size();
auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, rank - 1); auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, rank - 1);
...@@ -47,16 +42,11 @@ void BuildSoftmaxNode( ...@@ -47,16 +42,11 @@ void BuildSoftmaxNode(
-64., x_shifted); -64., x_shifted);
auto softmax = auto softmax =
std::make_shared<ngraph::op::Softmax>(x_clipped, ngraph::AxisSet{1}); std::make_shared<ngraph::op::Softmax>(x_clipped, ngraph::AxisSet{1});
paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map); return softmax;
} }
void BuildSoftmaxGradNode( std::shared_ptr<ngraph::Node> GetSoftmaxGrad(
const std::shared_ptr<paddle::framework::OperatorBase>& op, std::shared_ptr<ngraph::Node> out, std::shared_ptr<ngraph::Node> dout) {
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
auto out_shape = out->get_shape(); auto out_shape = out->get_shape();
int rank = out_shape.size(); int rank = out_shape.size();
auto out_2d_shape = paddle::platform::FlattenTo2d(out_shape, rank - 1); auto out_2d_shape = paddle::platform::FlattenTo2d(out_shape, rank - 1);
...@@ -70,6 +60,27 @@ void BuildSoftmaxGradNode( ...@@ -70,6 +60,27 @@ void BuildSoftmaxGradNode(
auto node_bcast = std::make_shared<ngraph::op::Broadcast>( auto node_bcast = std::make_shared<ngraph::op::Broadcast>(
node_sum, out_2d_shape, ngraph::AxisSet{1}); node_sum, out_2d_shape, ngraph::AxisSet{1});
auto dx = (dout - node_bcast) * out; auto dx = (dout - node_bcast) * out;
return dx;
}
void BuildSoftmaxNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto softmax = GetSoftmax(x);
paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map);
}
void BuildSoftmaxGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map);
auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
auto dx = GetSoftmaxGrad(out, dout);
paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map); paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
} }
} // namespace ngraphs } // namespace ngraphs
......
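Note: GetSoftmax above flattens the input to 2-D, subtracts the row maximum, and clips the shifted values at -64 before the Softmax node, which keeps exp() numerically safe. A minimal plain-C++ sketch of that stabilization (assuming a [batch, classes] layout; the -64 threshold mirrors the constant in the graph, and this is illustrative only):

```cpp
// Sketch of the max-shift + clip stabilization used before the Softmax node.
// Shifted values below -64 are clamped, so std::exp never overflows and a row
// cannot underflow to all zeros.
#include <algorithm>
#include <cmath>
#include <cstddef>

void StableSoftmaxRow(float* row, std::size_t classes) {
  float max_v = *std::max_element(row, row + classes);
  float sum = 0.f;
  for (std::size_t j = 0; j < classes; ++j) {
    float shifted = std::max(row[j] - max_v, -64.f);  // shift and clip
    row[j] = std::exp(shifted);
    sum += row[j];
  }
  for (std::size_t j = 0; j < classes; ++j) row[j] /= sum;  // normalize
}
```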
/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/cross_entropy_op.h"
#include "paddle/fluid/operators/ngraph/ops/softmax_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildSoftmaxWithCrossEntropyNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto logits = paddle::platform::GetInputNode(op, "Logits", ngb_node_map);
auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
auto softmax = paddle::operators::ngraphs::GetSoftmax(logits);
auto op_attrs = framework::AttrReader(op->Attrs());
const bool is_soft_label = op_attrs.Get<bool>("soft_label");
int ignore_index = op_attrs.Get<int>("ignore_index");
auto xe = paddle::operators::ngraphs::GetCrossEntropy(
softmax, label, is_soft_label, ignore_index);
paddle::platform::SetOutputNode(op, "Softmax", softmax, ngb_node_map);
paddle::platform::SetOutputNode(op, "Loss", xe, ngb_node_map);
}
void BuildSoftmaxWithCrossEntropyGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = framework::AttrReader(op->Attrs());
const bool is_soft_label = op_attrs.Get<bool>("soft_label");
auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map);
auto softmax = paddle::platform::GetInputNode(op, "Softmax", ngb_node_map);
auto loss_grad =
paddle::platform::GetInputNode(op, "Loss@GRAD", ngb_node_map);
auto softmax_shape = softmax->get_shape();
auto rank = softmax_shape.size();
if (!is_soft_label) {
auto label_shape = label->get_shape();
label_shape.pop_back();
label = platform::NgReshaper(label, label_shape);
label =
std::make_shared<ngraph::op::OneHot>(label, softmax_shape, rank - 1);
}
auto loss_grad_shape = loss_grad->get_shape();
loss_grad_shape.pop_back();
auto loss_grad_reshape = platform::NgReshaper(loss_grad, loss_grad_shape);
auto loss_grad_bcast = std::make_shared<ngraph::op::Broadcast>(
loss_grad_reshape, softmax_shape, ngraph::AxisSet{rank - 1});
if (softmax->get_element_type() != label->get_element_type()) {
label = std::make_shared<ngraph::op::Convert>(label,
softmax->get_element_type());
}
auto logits_grad = loss_grad_bcast * (softmax - label);
paddle::platform::SetOutputNode(op, "Logits@GRAD", logits_grad, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(softmax_with_cross_entropy, BuildSoftmaxWithCrossEntropyNode);
REGISTER_NG_OP(softmax_with_cross_entropy_grad,
BuildSoftmaxWithCrossEntropyGradNode);
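Note: for hard labels, the fused backward above computes Logits@GRAD as the broadcast loss gradient times (softmax - one_hot(label)). A standalone sketch of that arithmetic on flat arrays (assuming [batch, classes] shapes, no soft labels, no ignore_index) for reference:

```cpp
// Plain-C++ sketch of the gradient the graph above builds for hard labels:
// dLogits = dLoss * (softmax - one_hot(label)), with dLoss broadcast along the
// class axis exactly as the Broadcast node does.
#include <cstddef>
#include <vector>

std::vector<float> SoftmaxXeGrad(const std::vector<float>& softmax,
                                 const std::vector<int>& label,
                                 const std::vector<float>& loss_grad,
                                 std::size_t batch, std::size_t classes) {
  std::vector<float> dlogits(batch * classes);
  for (std::size_t i = 0; i < batch; ++i) {
    for (std::size_t j = 0; j < classes; ++j) {
      float one_hot = (static_cast<std::size_t>(label[i]) == j) ? 1.f : 0.f;
      dlogits[i * classes + j] =
          loss_grad[i] * (softmax[i * classes + j] - one_hot);
    }
  }
  return dlogits;
}
```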
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/range_op.h"
namespace paddle {
namespace operators {
class RangeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->HasInput("Start")) {
auto s_dims = ctx->GetInputDim("Start");
PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
"The shape of Input(Start) should be [1].");
}
if (ctx->HasInput("End")) {
auto e_dims = ctx->GetInputDim("End");
PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
"The shape of Input(End) should be [1].");
}
if (ctx->HasInput("Step")) {
auto step_dims = ctx->GetInputDim("Step");
PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
"The shape of Input(Step) should be [1].");
}
ctx->SetOutputDim("Out", {-1});
}
};
class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Start",
"Start of interval. The interval includes this value. It is a "
"tensor with shape=[1].");
AddInput("End",
"End of interval. The interval does not include this value, "
"except in some cases where step is not an integer and floating "
"point round-off affects the length of out. It is a tensor with "
"shape=[1].");
AddInput("Step", "Spacing between values. It is a tensor with shape=[1].");
AddOutput("Out", "A sequence of numbers.");
AddComment(R"DOC(
    Return evenly spaced values within a given interval. Values are generated within the half-open interval [start, stop) (in other words, the interval includes start but excludes stop), similar to numpy's arange function.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);
REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel<int>,
ops::CPURangeKernel<float>, ops::CPURangeKernel<double>,
ops::CPURangeKernel<int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/range_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>
class CUDARangeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* start_t = context.Input<framework::Tensor>("Start");
auto* end_t = context.Input<framework::Tensor>("End");
auto* step_t = context.Input<framework::Tensor>("Step");
auto* out = context.Output<framework::Tensor>("Out");
framework::Tensor n;
framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
T start = n.data<T>()[0];
framework::TensorCopy(*end_t, platform::CPUPlace(), &n);
T end = n.data<T>()[0];
framework::TensorCopy(*step_t, platform::CPUPlace(), &n);
T step = n.data<T>()[0];
int64_t size = 0;
GetSize(start, end, step, &size);
out->Resize(framework::make_ddim({size}));
T* out_data = out->mutable_data<T>(context.GetPlace());
auto stream = context.cuda_device_context().stream();
int block = 512;
int grid = (size + block - 1) / block;
RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel<int>,
ops::CUDARangeKernel<int64_t>,
ops::CUDARangeKernel<float>,
ops::CUDARangeKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
void GetSize(T start, T end, T step, int64_t* size) {
PADDLE_ENFORCE(!std::equal_to<T>()(step, 0),
"The step of range op should not be 0.");
PADDLE_ENFORCE(((start < end) && (step > 0)) || ((start > end) && (step < 0)),
"The step should be greater than 0 while start < end. And the "
"step should be less than 0 while start > end.");
*size = std::is_integral<T>::value
? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step))
: std::ceil(std::abs((end - start) / step));
}
template <typename T>
class CPURangeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
T end = context.Input<framework::Tensor>("End")->data<T>()[0];
T step = context.Input<framework::Tensor>("Step")->data<T>()[0];
auto* out = context.Output<framework::Tensor>("Out");
int64_t size = 0;
GetSize(start, end, step, &size);
out->Resize(framework::make_ddim({size}));
T* out_data = out->mutable_data<T>(context.GetPlace());
T value = start;
for (int64_t i = 0; i < size; ++i) {
out_data[i] = value;
value += step;
}
}
};
} // namespace operators
} // namespace paddle
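Note: GetSize above counts the elements of the half-open interval [start, end): an integer ceiling for integral types, std::ceil for floating types. A small worked example of that arithmetic (hypothetical values, not part of the operator's tests):

```cpp
// Worked example of the GetSize arithmetic used by the range kernels above.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>

int main() {
  // integral path: (|end - start| + |step| - 1) / |step|
  // start=1, end=10, step=3 -> 3 elements: 1, 4, 7.
  int64_t int_size = (std::abs(10 - 1) + std::abs(3) - 1) / std::abs(3);
  assert(int_size == 3);
  // floating path: ceil(|(end - start) / step|)
  // start=0.0, end=1.0, step=0.3 -> 4 elements: 0.0, 0.3, 0.6, 0.9
  // (floating point round-off may affect the length, as the op doc notes).
  int64_t float_size =
      static_cast<int64_t>(std::ceil(std::abs((1.0 - 0.0) / 0.3)));
  assert(float_size == 4);
  return 0;
}
```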
...@@ -12,87 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,87 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdint.h> #include <string>
#include <fstream>
#include <numeric> #include "paddle/fluid/operators/save_combine_op.h"
#include <sstream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class SaveCombineOp : public framework::OperatorBase { class SaveCombineOp : public framework::OperatorWithKernel {
public: public:
SaveCombineOp(const std::string &type, using framework::OperatorWithKernel::OperatorWithKernel;
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path");
auto overwrite = Attr<bool>("overwrite");
auto save_as_fp16 = Attr<bool>("save_as_fp16");
bool is_present = FileExists(filename);
if (is_present && !overwrite) {
PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
filename, overwrite);
}
MkDirRecursively(DirName(filename).c_str());
std::ofstream fout(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
auto inp_var_names = Inputs("X");
PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
"The number of input variables should be greater than 0");
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
for (size_t i = 0; i < inp_var_names.size(); i++) { void InferShape(framework::InferShapeContext *ctx) const override {}
auto *var = scope.FindVar(inp_var_names[i]);
PADDLE_ENFORCE(var != nullptr,
"Cannot find variable %s for save_combine_op",
inp_var_names[i]);
PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
"SaveCombineOp only supports LoDTensor, %s has wrong type",
inp_var_names[i]);
auto &tensor = var->Get<framework::LoDTensor>();
// Serialize tensors one by one
// Check types to see if a fp16 transformation is required
auto in_dtype = tensor.type();
auto out_dtype =
save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
framework::SerializeToStream(fout, out, dev_ctx);
} else {
framework::SerializeToStream(fout, tensor, dev_ctx);
}
}
fout.close();
}
}; };
class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
...@@ -105,7 +36,7 @@ class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -105,7 +36,7 @@ class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
SaveCombine operator SaveCombine operator
This operator will serialize and write a list of input LoDTensor variables This operator will serialize and write a list of input LoDTensor variables
to a file on disk. to a file on disk.
)DOC"); )DOC");
AddAttr<bool>("overwrite", AddAttr<bool>("overwrite",
...@@ -134,3 +65,10 @@ namespace ops = paddle::operators; ...@@ -134,3 +65,10 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(save_combine, ops::SaveCombineOp, REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
ops::SaveCombineOpProtoMaker); ops::SaveCombineOpProtoMaker);
REGISTER_OP_CPU_KERNEL(
save_combine,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/save_combine_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
save_combine,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <fstream>
#include <numeric>
#include <sstream>
#include <string>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SaveCombineOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
auto filename = ctx.Attr<std::string>("file_path");
auto overwrite = ctx.Attr<bool>("overwrite");
auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
bool is_present = FileExists(filename);
if (is_present && !overwrite) {
PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
filename, overwrite);
}
MkDirRecursively(DirName(filename).c_str());
std::ofstream fout(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
auto &inp_var_names = ctx.Inputs("X");
auto &inp_vars = ctx.MultiInputVar("X");
PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
"The number of input variables should be greater than 0");
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
for (size_t i = 0; i < inp_var_names.size(); i++) {
PADDLE_ENFORCE(inp_vars[i] != nullptr,
"Cannot find variable %s for save_combine_op",
inp_var_names[i]);
PADDLE_ENFORCE(inp_vars[i]->IsType<framework::LoDTensor>(),
"SaveCombineOp only supports LoDTensor, %s has wrong type",
inp_var_names[i]);
auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
// Serialize tensors one by one
// Check types to see if a fp16 transformation is required
auto in_dtype = tensor.type();
auto out_dtype =
save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
framework::SerializeToStream(fout, out, dev_ctx);
} else {
framework::SerializeToStream(fout, tensor, dev_ctx);
}
}
fout.close();
}
};
} // namespace operators
} // namespace paddle
...@@ -19,8 +19,8 @@ limitations under the License. */ ...@@ -19,8 +19,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
USE_NO_KERNEL_OP(save_combine); USE_CPU_ONLY_OP(save_combine);
USE_NO_KERNEL_OP(load_combine); USE_CPU_ONLY_OP(load_combine);
template <typename T, typename U> template <typename T, typename U>
T* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info, T* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
USE_NO_KERNEL_OP(save); USE_CPU_ONLY_OP(save);
USE_NO_KERNEL_OP(load); USE_CPU_ONLY_OP(load);
TEST(SaveLoadOp, CPU) { TEST(SaveLoadOp, CPU) {
paddle::framework::Scope scope; paddle::framework::Scope scope;
......
...@@ -15,118 +15,24 @@ limitations under the License. */ ...@@ -15,118 +15,24 @@ limitations under the License. */
#include <stdint.h> #include <stdint.h>
#include <fstream> #include <fstream>
#include <numeric> #include <numeric>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/save_op.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class SaveOp : public framework::OperatorWithKernel {
// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
// to directory specified.
constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
class SaveOp : public framework::OperatorBase {
public: public:
SaveOp(const std::string &type, const framework::VariableNameMap &inputs, using framework::OperatorWithKernel::OperatorWithKernel;
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto iname = Input("X");
auto *var = scope.FindVar(iname);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
iname);
if (var->IsType<framework::LoDTensor>()) {
SaveLodTensor(place, var);
} else if (var->IsType<framework::SelectedRows>()) {
SaveSelectedRows(scope, place, var);
} else {
PADDLE_ENFORCE(
false,
"SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
iname);
}
}
void SaveLodTensor(const platform::Place &place, void InferShape(framework::InferShapeContext *ctx) const override {}
framework::Variable *var) const {
auto filename = Attr<std::string>("file_path");
auto overwrite = Attr<bool>("overwrite");
if (FileExists(filename) && !overwrite) {
PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
filename, overwrite);
}
MkDirRecursively(DirName(filename).c_str());
auto &tensor = var->Get<framework::LoDTensor>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
std::ofstream fout(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
auto save_as_fp16 = Attr<bool>("save_as_fp16");
auto in_dtype = tensor.type();
auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::SerializeToStream(fout, out, dev_ctx);
} else {
framework::SerializeToStream(fout, tensor, dev_ctx);
}
fout.close();
}
void SaveSelectedRows(const framework::Scope &scope, protected:
const platform::Place &place, framework::OpKernelType GetExpectedKernelType(
framework::Variable *var) const { const framework::ExecutionContext &ctx) const override {
auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>(); return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
PADDLE_ENFORCE( ctx.GetPlace());
lt_var != nullptr,
"Can not find variable kLookupTablePath for SaveSelectedRows");
std::string filename = lt_var->data();
VLOG(4) << "SaveSelectedRows get File name: " << filename;
MkDirRecursively(DirName(filename).c_str());
auto &selectedRows = var->Get<framework::SelectedRows>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
std::ofstream fout(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
framework::SerializeToStream(fout, selectedRows, dev_ctx);
fout.close();
} }
}; };
...@@ -154,14 +60,20 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file ...@@ -154,14 +60,20 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file
"The \"file_path\" where the variable will be saved.") "The \"file_path\" where the variable will be saved.")
.AddCustomChecker( .AddCustomChecker(
[](const std::string &path) { return !path.empty(); }); [](const std::string &path) { return !path.empty(); });
AddOutput(LOOKUP_TABLE_PATH,
"(string)"
"for pserver: The \"kLookupTablePath\" where checkpoint notify "
"to save lookup table variables"
" to directory specified.")
.AsDispensable();
} }
}; };
class SaveOpVarTypeInference : public framework::VarTypeInference { class SaveOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(framework::InferVarTypeContext *ctx) const override { void operator()(framework::InferVarTypeContext *ctx) const override {
auto out_var_name = ctx->Output(LOOKUP_TABLE_PATH).front(); auto var_type = framework::proto::VarType::RAW;
ctx->SetType(out_var_name, framework::proto::VarType::RAW); ctx->SetType(LOOKUP_TABLE_PATH, var_type);
} }
}; };
...@@ -169,11 +81,18 @@ class SaveOpShapeInference : public framework::InferShapeBase { ...@@ -169,11 +81,18 @@ class SaveOpShapeInference : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext *ctx) const override {} void operator()(framework::InferShapeContext *ctx) const override {}
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker, REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker,
ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference, ops::SaveOpVarTypeInference, ops::SaveOpShapeInference);
ops::SaveOpShapeInference);
REGISTER_OP_CPU_KERNEL(
save, ops::SaveOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SaveOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/save_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
save, ops::SaveOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SaveOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SaveOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <fstream>
#include <numeric>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace operators {
// define LOOKUP_TABLE_PATH, used by checkpoint notify to save lookup table
// variables to the specified directory.
constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
template <typename DeviceContext, typename T>
class SaveOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
auto *input_var = ctx.InputVar("X");
auto iname = ctx.Inputs("X").data();
PADDLE_ENFORCE(input_var != nullptr, "Cannot find variable %s for save_op",
iname);
if (input_var->IsType<framework::LoDTensor>()) {
SaveLodTensor(ctx, place, input_var);
} else if (input_var->IsType<framework::SelectedRows>()) {
SaveSelectedRows(ctx, place, input_var);
} else {
PADDLE_ENFORCE(
false,
"SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
iname);
}
}
void SaveLodTensor(const framework::ExecutionContext &ctx,
const platform::Place &place,
const framework::Variable *var) const {
auto filename = ctx.Attr<std::string>("file_path");
auto overwrite = ctx.Attr<bool>("overwrite");
if (FileExists(filename) && !overwrite) {
PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
filename, overwrite);
}
MkDirRecursively(DirName(filename).c_str());
auto &tensor = var->Get<framework::LoDTensor>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
std::ofstream fout(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
auto in_dtype = tensor.type();
auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
if (in_dtype != out_dtype) {
auto in_kernel_type = framework::OpKernelType(in_dtype, place);
auto out_kernel_type = framework::OpKernelType(out_dtype, place);
framework::LoDTensor out;
framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
// copy LoD info to the new tensor
out.set_lod(tensor.lod());
framework::SerializeToStream(fout, out, dev_ctx);
} else {
framework::SerializeToStream(fout, tensor, dev_ctx);
}
fout.close();
}
void SaveSelectedRows(const framework::ExecutionContext &ctx,
const platform::Place &place,
const framework::Variable *var) const {
framework::Variable *out_put_var = ctx.OutputVar(LOOKUP_TABLE_PATH);
PADDLE_ENFORCE(
out_put_var != nullptr,
"Can not find variable kLookupTablePath for SaveSelectedRows");
auto *lt_var = out_put_var->GetMutable<std::string>();
std::string filename = lt_var->data();
VLOG(4) << "SaveSelectedRows get File name: " << filename;
MkDirRecursively(DirName(filename).c_str());
auto &selectedRows = var->Get<framework::SelectedRows>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
std::ofstream fout(filename, std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
framework::SerializeToStream(fout, selectedRows, dev_ctx);
fout.close();
}
};
} // namespace operators
} // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
#include <memory>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -187,7 +188,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { ...@@ -187,7 +188,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetType("softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", Input("Label")); grad_op->SetInput("Label", Input("Label"));
grad_op->SetInput("Softmax", Output("Softmax")); grad_op->SetInput("Softmax", Output("Softmax"));
grad_op->SetInput("Loss", Output("Loss"));
grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax")); grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax"));
grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
......
...@@ -40,7 +40,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase { ...@@ -40,7 +40,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
"tensor's rank."); "tensor's rank.");
} }
auto out_dims = GetOutputShape(axes, x_dims); auto out_dims = GetOutputShape(axes, x_dims, false);
ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Out", out_dims);
if (x_dims[0] == out_dims[0]) { if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X) // Only pass LoD when the first dimension of output and Input(X)
...@@ -50,7 +50,8 @@ class SqueezeOpInferShape : public framework::InferShapeBase { ...@@ -50,7 +50,8 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
} }
static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims, static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
const framework::DDim &in_dims) { const framework::DDim &in_dims,
bool is_runtime) {
size_t num_squeeze_dims = squeeze_dims.size(); size_t num_squeeze_dims = squeeze_dims.size();
int cnt_squeezed_dims = 0; int cnt_squeezed_dims = 0;
bool should_squeeze[9] = {false}; bool should_squeeze[9] = {false};
...@@ -71,9 +72,12 @@ class SqueezeOpInferShape : public framework::InferShapeBase { ...@@ -71,9 +72,12 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
      // Check current index, the upper limit has been checked in line 36. // Check current index, the upper limit has been checked in line 36.
PADDLE_ENFORCE(current >= 0, PADDLE_ENFORCE(current >= 0,
"Invalid axis, the negative axis is out of range."); "Invalid axis, the negative axis is out of range.");
PADDLE_ENFORCE(in_dims[current] == 1,
"Invalid axis index, the axis that will be squeezed " if (is_runtime) {
"should be equal to 1."); PADDLE_ENFORCE(in_dims[current] == 1,
"Invalid axis index, the axis that will be squeezed "
"should be equal to 1.");
}
if (!(should_squeeze[current])) { if (!(should_squeeze[current])) {
++cnt_squeezed_dims; ++cnt_squeezed_dims;
...@@ -104,7 +108,7 @@ class SqueezeOp : public framework::OperatorBase { ...@@ -104,7 +108,7 @@ class SqueezeOp : public framework::OperatorBase {
const platform::Place &place) const override { const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes"); auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims(); auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims); auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims, true);
framework::AttributeMap attrs; framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims); attrs["shape"] = framework::vectorize2int(out_dims);
...@@ -224,7 +228,7 @@ class Squeeze2Op : public framework::OperatorBase { ...@@ -224,7 +228,7 @@ class Squeeze2Op : public framework::OperatorBase {
const platform::Place &place) const override { const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes"); auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims(); auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims); auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims, true);
framework::AttributeMap attrs; framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims); attrs["shape"] = framework::vectorize2int(out_dims);
......
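Note: the change above defers the `dim == 1` check to runtime so that compile-time shape inference can pass through dimensions that are still unknown (e.g. -1). A rough standalone sketch of the squeeze shape rule under that flag, for the explicit-axes case only (simplified, not the operator's exact code path):

```cpp
// Simplified sketch of squeeze shape inference with an is_runtime flag: drop
// the listed axes, and only insist that a squeezed axis equals 1 at runtime,
// since at compile time the dimension may still be unknown (e.g. -1).
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> SqueezeShape(const std::vector<int64_t>& in_dims,
                                  const std::vector<int>& axes,
                                  bool is_runtime) {
  std::vector<bool> drop(in_dims.size(), false);
  for (int a : axes) {
    int axis = a < 0 ? a + static_cast<int>(in_dims.size()) : a;
    if (is_runtime && in_dims[axis] != 1)
      throw std::runtime_error("squeezed axis must be 1 at runtime");
    drop[axis] = true;
  }
  std::vector<int64_t> out;
  for (std::size_t i = 0; i < in_dims.size(); ++i)
    if (!drop[i]) out.push_back(in_dims[i]);
  return out;
}
```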
...@@ -34,8 +34,11 @@ class TopkOp : public framework::OperatorWithKernel { ...@@ -34,8 +34,11 @@ class TopkOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape"); PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape");
PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
"input must have >= k columns"); if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
"input must have >= k columns");
}
framework::DDim dims = input_dims; framework::DDim dims = input_dims;
dims[dims.size() - 1] = k; dims[dims.size() - 1] = k;
......
...@@ -23,6 +23,9 @@ limitations under the License. */ ...@@ -23,6 +23,9 @@ limitations under the License. */
#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/cuda_helper.h"
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#if !defined(__APPLE__) && !defined(_WIN32)
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <string> #include <string>
...@@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; ...@@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
constexpr static float fraction_of_gpu_memory_to_use = 0.5f; constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif #endif
constexpr static float fraction_reserve_gpu_memory = 0.05f;
DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"Allocate a trunk of gpu memory that is this fraction of the " "Allocate a trunk of gpu memory that is this fraction of the "
"total gpu memory size. Future memory usage will be allocated " "total gpu memory size. Future memory usage will be allocated "
...@@ -38,6 +39,24 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, ...@@ -38,6 +39,24 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"additional trunks of the same size will be requested from gpu " "additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."); "until the gpu has no memory left for another trunk.");
DEFINE_uint64(
initial_gpu_memory_in_mb, 0ul,
"Allocate a trunk of gpu memory whose byte size is specified by "
"the flag. Future memory usage will be allocated from the "
"truck. If the trunk doesn't have enough gpu memory, additional "
"trunks of the gpu memory will be requested from gpu with size "
"specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
"no memory left for the additional trunk. Note: if you set this "
"flag, the memory size set by "
"FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
"flag. If you don't set this flag, PaddlePaddle will use "
"FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
"If this flag is set, Paddle will reallocate the gpu memory with "
"size specified by this flag. Else Paddle will reallocate by "
"FLAGS_fraction_of_gpu_memory_to_use");
DEFINE_bool( DEFINE_bool(
enable_cublas_tensor_op_math, false, enable_cublas_tensor_op_math, false,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
...@@ -180,13 +199,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) { ...@@ -180,13 +199,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
} }
size_t GpuMaxAllocSize() { size_t GpuMaxAllocSize() {
return std::max(GpuInitAllocSize(), GpuReallocSize());
}
size_t GpuInitAllocSize() {
if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
// Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
}
// FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
// fraction
size_t total = 0; size_t total = 0;
size_t available = 0; size_t available = 0;
GpuMemoryUsage(&available, &total); GpuMemoryUsage(&available, &total);
size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
// Reserve the rest for page tables, etc. return static_cast<size_t>((total - reserving) *
return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use); FLAGS_fraction_of_gpu_memory_to_use);
}
size_t GpuReallocSize() {
if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
// Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb
return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
}
// FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be allocated
// by fraction
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
return static_cast<size_t>((total - reserving) *
FLAGS_fraction_of_gpu_memory_to_use);
} }
size_t GpuMinChunkSize() { size_t GpuMinChunkSize() {
...@@ -201,16 +250,13 @@ size_t GpuMaxChunkSize() { ...@@ -201,16 +250,13 @@ size_t GpuMaxChunkSize() {
GpuMemoryUsage(&available, &total); GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M"; << total / 1024 / 1024 << "M";
size_t reserving = static_cast<size_t>(0.05 * total); size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
// If available less than minimum chunk size, no usable memory exists. // If available less than minimum chunk size, no usable memory exists.
available = available =
std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
total - reserving); total - reserving);
// Reserving the rest memory for page tables, etc. size_t allocating = GpuMaxAllocSize();
size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
(total - reserving));
PADDLE_ENFORCE_LE(allocating, available, PADDLE_ENFORCE_LE(allocating, available,
"Insufficient GPU memory to allocation."); "Insufficient GPU memory to allocation.");
......
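Note: with the changes above, the allocator sizes its chunks from the new flags when they are non-zero (the MB value shifted by 20 into bytes), and otherwise falls back to (total - 5% reserve) * FLAGS_fraction_of_gpu_memory_to_use. A hedged arithmetic sketch of that selection, with illustrative values rather than a real device query:

```cpp
// Illustration of the chunk-size selection logic added above. All numbers are
// example values, not read from an actual GPU.
#include <cstdint>
#include <cstdio>

int main() {
  const double kReserveFraction = 0.05;  // mirrors fraction_reserve_gpu_memory
  uint64_t total_bytes = 16ull << 30;    // pretend a 16 GB device
  uint64_t flag_initial_mb = 0;          // FLAGS_initial_gpu_memory_in_mb
  double fraction = 0.92;                // FLAGS_fraction_of_gpu_memory_to_use

  uint64_t init_alloc;
  if (flag_initial_mb > 0) {
    init_alloc = flag_initial_mb << 20;  // explicit MB size wins over fraction
  } else {
    uint64_t reserving = static_cast<uint64_t>(kReserveFraction * total_bytes);
    init_alloc = static_cast<uint64_t>((total_bytes - reserving) * fraction);
  }
  std::printf("initial allocation: %llu bytes\n",
              static_cast<unsigned long long>(init_alloc));
  return 0;
}
```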
...@@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total); ...@@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total);
//! Get the maximum allocation size of current GPU device. //! Get the maximum allocation size of current GPU device.
size_t GpuMaxAllocSize(); size_t GpuMaxAllocSize();
//! Get the initial allocation size of current GPU device.
size_t GpuInitAllocSize();
//! Get the re-allocation size of current GPU device.
size_t GpuReallocSize();
//! Get the minimum chunk size for GPU buddy allocator. //! Get the minimum chunk size for GPU buddy allocator.
size_t GpuMinChunkSize(); size_t GpuMinChunkSize();
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <typeindex> #include <typeindex>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -79,7 +80,6 @@ struct NCCLContext { ...@@ -79,7 +80,6 @@ struct NCCLContext {
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
cudaStream_t stream() const { return ctx_->stream(); } cudaStream_t stream() const { return ctx_->stream(); }
ncclComm_t comm() const { return comm_; } ncclComm_t comm() const { return comm_; }
int device_id() const { int device_id() const {
...@@ -105,9 +105,6 @@ struct NCCLContextMap { ...@@ -105,9 +105,6 @@ struct NCCLContextMap {
order_.size(), contexts_.size(), order_.size(), contexts_.size(),
"NCCL Context Map does not support contain two or more same device"); "NCCL Context Map does not support contain two or more same device");
if (places.size() <= 1 && num_trainers == 1) {
return;
}
std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]); std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
// if num_trainers == 1, should create a new nccl id for local comms. // if num_trainers == 1, should create a new nccl id for local comms.
if (num_trainers == 1 && nccl_id == nullptr) { if (num_trainers == 1 && nccl_id == nullptr) {
...@@ -127,8 +124,8 @@ struct NCCLContextMap { ...@@ -127,8 +124,8 @@ struct NCCLContextMap {
} else { } else {
rank = trainer_id; rank = trainer_id;
} }
VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks
<< "gpu id: " << gpu_id; << " gpu id: " << gpu_id;
PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(cudaSetDevice(gpu_id));
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
comms.get() + i, nranks, *nccl_id, rank)); comms.get() + i, nranks, *nccl_id, rank));
......
...@@ -242,10 +242,6 @@ void BindAnalysisConfig(py::module *m) { ...@@ -242,10 +242,6 @@ void BindAnalysisConfig(py::module *m) {
.def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp)
.def("set_model_buffer", &AnalysisConfig::SetModelBuffer) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer)
.def("model_from_memory", &AnalysisConfig::model_from_memory) .def("model_from_memory", &AnalysisConfig::model_from_memory)
.def("runtime_context_cache_enabled",
&AnalysisConfig::runtime_context_cache_enabled)
.def("switch_runtime_context_cache",
&AnalysisConfig::SwitchRuntimeContextCache, py::arg("x") = true)
.def("pass_builder", &AnalysisConfig::pass_builder, .def("pass_builder", &AnalysisConfig::pass_builder,
py::return_value_policy::reference); py::return_value_policy::reference);
} }
......
...@@ -1265,6 +1265,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1265,6 +1265,10 @@ All parameter, weight, gradient are variables in Paddle.
"enable_inplace", "enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; }, [](const BuildStrategy &self) { return self.enable_inplace_; },
[](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
.def_property(
"fuse_all_reduce_ops",
[](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; },
[](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
.def("_finalize_strategy_and_create_passes", .def("_finalize_strategy_and_create_passes",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> { [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(true); return self.CreatePassesFromStrategy(true);
......
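The newly bound fuse_all_reduce_ops property is used like the other BuildStrategy switches; a hedged usage sketch, where train_program and avg_loss are assumed to come from the caller's model:

import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_reduce_ops = True   # property added in this hunk
build_strategy.enable_inplace = True        # existing property shown above

compiled = fluid.CompiledProgram(train_program).with_data_parallel(
    loss_name=avg_loss.name, build_strategy=build_strategy)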
...@@ -453,6 +453,7 @@ function assert_api_spec_approvals() { ...@@ -453,6 +453,7 @@ function assert_api_spec_approvals() {
echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
# approval_user_list: velconia 1979255,panyx0718 2887803,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308.
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695` python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
...@@ -462,14 +463,14 @@ function assert_api_spec_approvals() { ...@@ -462,14 +463,14 @@ function assert_api_spec_approvals() {
fi fi
else else
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
fi fi
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}" echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
else else
echo "You must have panyx0718 approval for the api change! ${API_FILE}" echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
fi fi
exit 1 exit 1
fi fi
...@@ -479,10 +480,10 @@ function assert_api_spec_approvals() { ...@@ -479,10 +480,10 @@ function assert_api_spec_approvals() {
HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have panyx0718 approval for the const_cast" echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
exit 1 exit 1
fi fi
fi fi
......
...@@ -41,6 +41,8 @@ int main(int argc, char** argv) { ...@@ -41,6 +41,8 @@ int main(int argc, char** argv) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
envs.push_back("fraction_of_gpu_memory_to_use"); envs.push_back("fraction_of_gpu_memory_to_use");
envs.push_back("initial_gpu_memory_in_mb");
envs.push_back("reallocate_gpu_memory_in_mb");
envs.push_back("allocator_strategy"); envs.push_back("allocator_strategy");
#elif __clang__ #elif __clang__
envs.push_back("use_mkldnn"); envs.push_back("use_mkldnn");
......
...@@ -132,7 +132,8 @@ def __bootstrap__(): ...@@ -132,7 +132,8 @@ def __bootstrap__():
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph', 'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize', 'fuse_parameter_groups_size', 'multiple_of_cupti_buffer_size',
'enable_subgraph_optimize', 'fuse_parameter_memory_size',
'tracer_profile_fname' 'tracer_profile_fname'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
...@@ -162,7 +163,8 @@ def __bootstrap__(): ...@@ -162,7 +163,8 @@ def __bootstrap__():
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'sync_nccl_allreduce', 'limit_of_tmp_allocation', 'sync_nccl_allreduce', 'limit_of_tmp_allocation',
......
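Because __bootstrap__ reads these switches from FLAGS_-prefixed environment variables, the new GPU sizing flags can be set before paddle.fluid is first imported; the values below are illustrative only:

import os

# Start with 500 MB of GPU memory and grow in 100 MB steps instead of
# pre-allocating a fixed fraction of the card.
os.environ['FLAGS_initial_gpu_memory_in_mb'] = '500'
os.environ['FLAGS_reallocate_gpu_memory_in_mb'] = '100'

import paddle.fluid as fluid  # flags are consumed during bootstrap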
...@@ -13,13 +13,4 @@ ...@@ -13,13 +13,4 @@
# limitations under the License. # limitations under the License.
from .core import * from .core import *
from .graph import * __all__ = ['Compressor', ]
from .prune import *
__all__ = [
'build_compressor',
'CompressPass',
'ImitationGraph',
'SensitivePruneStrategy',
'MagnitudePruner',
'RatioPruner',
]
...@@ -14,11 +14,9 @@ ...@@ -14,11 +14,9 @@
from . import config from . import config
from .config import * from .config import *
from . import compress_pass from . import compressor
from .compress_pass import * from .compressor import *
from . import strategy from . import strategy
from .strategy import * from .strategy import *
from . import pass_builder
from .pass_builder import *
__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__ __all__ = config.__all__ + compressor.__all__ + strategy.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ....core import CPUPlace
from ..graph import get_executor
__all__ = ['Context', 'CompressPass']
class Context(object):
"""
The context in the process of compression.
Args:
exe: The executor used to execute graph.
graph: The graph to be compressed.
scope: The scope used to execute graph.
program_exe: The program_exe is used to execute the program
created for modifying the variables in scope.
"""
def __init__(self, exe, graph, scope, program_exe=None):
# The total number of epochs to be trained.

self.epoch = 0
# Current epoch
self.epoch_id = 0
# Current batch
self.batch_id = 0
self.exe = exe
self.graph = graph
self.scope = scope
self.program_exe = program_exe
class CompressPass(object):
"""
The pass used to compress a model.
Args:
place: The device used in compression.
data_reader: The data_reader used to run graph.
data_feeder: The data_feeder used to run graph.
scope: The scope used to run graph.
metrics: The metrics for evaluating model.
epoch: The total number of epochs of training in compression.
program_exe: The program_exe is used to execute the program
created for modifying the variables in scope.
"""
def __init__(self,
place=None,
data_reader=None,
data_feeder=None,
scope=None,
metrics=None,
epoch=None,
program_exe=None):
self.strategies = []
self.place = CPUPlace() if place is None else place
self.data_reader = data_reader
self.data_feeder = data_feeder
self.scope = scope
self.metrics = metrics
self.epoch = epoch
self.program_exe = program_exe
def add_strategy(self, strategy):
"""
Add a strategy to current compress pass.
Args:
strategy: The strategy to be added into current compress pass.
"""
self.strategies.append(strategy)
self.epoch = max(strategy.end_epoch, self.epoch)
def apply(self, graph):
"""
Compress a model.
Args:
graph: The target graph to be compressed.
"""
self.executor = get_executor(graph, self.place)
context = Context(
self.executor, graph, self.scope, program_exe=self.program_exe)
for strategy in self.strategies:
strategy.on_compress_begin(context)
for epoch in range(self.epoch):
for strategy in self.strategies:
strategy.on_epoch_begin(context)
for data in self.data_reader():
for strategy in self.strategies:
strategy.on_batch_begin(context)
fetches = None
if self.metrics:
fetches = self.metrics.values()
feed = None
if self.data_feeder:
feed = self.data_feeder.feed(data)
results = self.executor.run(graph,
fetches=fetches,
scope=self.scope,
feed=feed)
if results:
print("results: {}".format(
zip(self.metrics.keys(), results)))
for strategy in self.strategies:
strategy.on_batch_end(context)
context.batch_id += 1
for strategy in self.strategies:
strategy.on_epoch_end(context)
context.epoch_id += 1
for strategy in self.strategies:
strategy.on_compress_end(context)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ....core import CPUPlace
from .... import compiler
from .... import io
from .... import profiler
from .... import scope_guard
from ....data_feeder import DataFeeder
from ..graph import *
from .config import ConfigFactory
import numpy as np
from collections import Iterable
import time
import os
import logging
import sys
import pickle
import functools
__all__ = ['Context', 'Compressor']
logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)
def cached_reader(reader, sampled_rate, cache_path, cached_id):
"""
Sample partial data from reader and cache them into local file system.
Args:
reader: Iterative data source.
sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
cache_path(str): The path to cache the sampled data.
cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
"""
np.random.seed(cached_id)
cache_path = os.path.join(cache_path, str(cached_id))
_logger.debug('read data from: {}'.format(cache_path))
def s_reader():
if os.path.isdir(cache_path):
for file_name in open(os.path.join(cache_path, "list")):
yield np.load(os.path.join(cache_path, file_name.strip()))
else:
os.makedirs(cache_path)
list_file = open(os.path.join(cache_path, "list"), 'w')
batch = 0
dtype = None
for data in reader():
if batch == 0 or (np.random.uniform() < sampled_rate):
np.save(
os.path.join(cache_path, 'batch' + str(batch)), data)
list_file.write('batch' + str(batch) + '.npy\n')
batch += 1
yield data
return s_reader
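# A hedged usage sketch of cached_reader: sample ~20% of an evaluation reader
# and cache it on first use. The mnist reader and cache directory below are
# illustrative, not part of this module.
def _cached_reader_example():
    import paddle
    eval_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
    sampled = cached_reader(
        eval_reader, sampled_rate=0.2, cache_path='./eval_cache', cached_id=0)
    for batch in sampled():
        pass  # feed the cached batch to an evaluation graph here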
class Context(object):
"""
The context in the process of compression.
"""
def __init__(self,
place,
scope,
train_graph=None,
train_reader=None,
eval_graph=None,
eval_reader=None,
teacher_graphs=None,
train_optimizer=None,
distiller_optimizer=None):
"""
Args:
place: The device place where the compression job runs.
scope: The scope used in compression job.
train_graph: The graph with loss as output node.
eval_graph: The graph used for evaluation.
eval_reader: The data reader used for evaluation.
teacher_graphs: The teacher graphs used in distillation strategies.
train_optimizer: The optimizer used to append backward ops and
optimization ops into train_graph.
distiller_optimizer: The optimizer used by distillation strategies.
"""
# The total number of epochs to be trained.
self.epoch = 0
# Current epoch
self.epoch_id = 0
# Current batch
self.batch_id = 0
self.k_v = {}
self.place = place
self.scope = scope
self.train_graph = train_graph
self.train_reader = train_reader
self.eval_graph = eval_graph
self.eval_reader = eval_reader
self.executor = None
self.teacher_graphs = teacher_graphs
self.train_optimizer = train_optimizer
self.distiller_optimizer = distiller_optimizer
self.optimize_graph = None
self.cache_path = './eval_cache'
self.eval_results = {}
def to_file(self, file_name):
"""
Save the context into file.
"""
data = {}
data['epoch_id'] = self.epoch_id
data['eval_results'] = self.eval_results
with open(file_name, 'wb') as context_file:
pickle.dump(data, context_file)
def from_file(self, file_name):
"""
Load the context from file.
"""
with open(file_name) as context_file:
if sys.version_info < (3, 0):
data = pickle.load(context_file)
else:
data = pickle.load(context_file, encoding='bytes')
self.epoch_id = data['epoch_id']
self.eval_results = data['eval_results']
def eval_converged(self, metric_name, delta=0.001):
"""
Check whether the training has been converged.
Args:
metric_name(str): The metric used to check convergence.
delta(float): 'abs(metric[k] - metric[k-1]) / metric[k-1] < delta'
means that the training has converged.
Returns:
bool: True means the training has been converged.
"""
# TODO(wanghaoshuang@baidu.com): enhance this method.
if (metric_name not in self.eval_results
) or len(self.eval_results[metric_name]) < 2:
return False
results = self.eval_results[metric_name][-2:]
_logger.info('Latest evaluations: {}'.format(results))
return abs(results[1] - results[0]) / results[0] < delta
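    # Worked example (illustrative numbers): if eval_results['acc'] ends in
    # [0.9120, 0.9125], the relative change is abs(0.9125 - 0.9120) / 0.9120
    # ~= 0.00055 < 0.001, so eval_converged('acc') returns True.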
def run_eval_graph(self, sampled_rate=None, cached_id=0):
"""
Evaluate the current mode in context.
Args:
sampled_rate(float): The sampled rate used to sample partial data
for evaluation. None means using all data in eval_reader. default: None.
cached_id(int): The id of dataset sampled. Evaluations with same
cached_id use the same sampled dataset. default: 0.
"""
_logger.info('Running evaluation')
assert self.eval_graph is not None
assert self.eval_reader is not None
eval_graph = self.eval_graph.clone(for_test=True)
executor = SlimGraphExecutor(self.place)
results = []
batch_id = 0
s_time = time.time()
reader = self.eval_reader
if sampled_rate:
reader = cached_reader(reader, sampled_rate, self.cache_path,
cached_id)
for data in reader():
result = executor.run(eval_graph, self.scope, data=data)
result = [np.mean(r) for r in result]
results.append(result)
if batch_id % 20 == 0:
_logger.info("batch-{}; {}={}".format(
batch_id, eval_graph.out_nodes.keys(), result))
batch_id += 1
result = np.mean(np.array(results), axis=0)
_logger.info("Final eval result: {}={}".format(
eval_graph.out_nodes.keys(), result))
if not isinstance(result, Iterable):
result = [result]
_logger.info('Finish evaluation')
return result, eval_graph.out_nodes.keys()
def put(self, key, value):
self.k_v[key] = value
def get(self, key):
return self.k_v.get(key)
class Compressor(object):
"""
The pass used to compress a model.
"""
def __init__(self,
place,
scope,
train_program,
train_reader=None,
train_feed_list=None,
train_fetch_list=None,
eval_program=None,
eval_reader=None,
eval_feed_list=None,
eval_fetch_list=None,
teacher_programs=[],
checkpoint_path='./checkpoints',
train_optimizer=None,
distiller_optimizer=None):
"""
Args:
place(fluid.Place): The device place where the compression job runs.
scope(fluid.core.Scope): The scope used to run graph.
train_program(Program): The main program to be compressed. It must have loss op.
train_reader: The data reader used for training.
train_feed_list(dict): A dict to indicate the input variable of the training program.
The key is user-defined and human-readable name.
The value is the name of Variable.
train_fetch_list(dict): A dict to indicate the output variable of the training program.
The key is user-defined and human-readable name.
The value is the name of Variable.
eval_program(Program): The program used for evaluation.
eval_reader: The data reader used for evaluation.
eval_feed_list(dict): A dict to indicate the input variable of the evaluation program.
The key is user-defined and human-readable name.
The value is the name of Variable.
eval_fetch_list(dict): A dict to indicate the output variable of the evaluation program.
The key is user-defined and human-readable name.
The value is the name of Variable.
teacher_programs: The teacher graphs used in distillation strategies.
train_optimizer: The optimizer used to append backward ops and
optimization ops into train_graph.
distiller_optimizer: The optimizer used by distillation strategies. In distillation strategy,
this optimizer is used to minimize the combined loss of student-net and
teacher-net while train_optimizer is used to minimize loss of
student-net in fine-tune stage.
"""
assert isinstance(
train_feed_list, list
), "train_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
assert isinstance(
eval_feed_list, list
), "eval_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
self.strategies = []
self.epoch = 0
self.place = CPUPlace() if place is None else place
self.scope = scope
self.train_graph = GraphWrapper(
train_program, in_nodes=train_feed_list, out_nodes=train_fetch_list)
self.eval_graph = GraphWrapper(
eval_program, in_nodes=eval_feed_list, out_nodes=eval_fetch_list)
self.train_reader = train_reader
self.eval_reader = eval_reader
self.teacher_graphs = []
for teacher in teacher_programs:
self.teacher_graphs.append(ImitationGraph(teacher, scope=scope))
self.checkpoint = None
self.checkpoint_path = checkpoint_path
self.eval_epoch = 1
self.train_optimizer = train_optimizer
self.distiller_optimizer = distiller_optimizer
self.init_model = None
def _add_strategy(self, strategy):
"""
Add a strategy to current compress pass.
Args:
strategy: The strategy to be added into current compress pass.
"""
self.strategies.append(strategy)
self.epoch = max(strategy.end_epoch, self.epoch)
def config(self, config_file):
"""
Configure the compress pass from file with yaml format.
Args:
config_file(str): The config file in local file system.
"""
factory = ConfigFactory(config_file)
self.epoch = factory.compressor['epoch']
for strategy in factory.compressor['strategies']:
self._add_strategy(strategy)
if 'checkpoint_path' in factory.compressor:
self.checkpoint_path = factory.compressor['checkpoint_path']
if 'init_model' in factory.compressor:
self.init_model = factory.compressor['init_model']
def _init_model(self, context):
"""
Load model that has been compressed.
"""
if self.init_model and os.path.exists(self.init_model):
exe = SlimGraphExecutor(context.place)
with scope_guard(context.scope):
context.train_graph.load_persistables(self.init_model, exe)
flops = context.eval_graph.flops()
conv_flops = context.eval_graph.flops(only_conv=True)
context.eval_graph.update_param_shape(context.scope)
context.eval_graph.update_groups_of_conv()
_logger.info("conv flops: -{}".format(1 - float(
context.eval_graph.flops(only_conv=True)) / conv_flops))
_logger.info("total flops: -{}".format(1 - float(
context.eval_graph.flops()) / flops))
context.train_graph.update_param_shape(context.scope)
context.train_graph.update_groups_of_conv()
context.train_graph.infer_shape()
_logger.info("Init model from: {}".format(self.init_model))
def _load_checkpoint(self, context):
"""
Load checkpoints from file.
"""
_logger.debug('_load_checkpoint')
strategies = self.strategies
if self.checkpoint_path:
if not os.path.exists(self.checkpoint_path):
_logger.warning("Checkpints path doesn't exist: [{}]".format(
self.checkpoint_path))
return context, strategies
checkpoints = [
dir for dir in os.listdir(self.checkpoint_path)
if os.path.isdir(os.path.join(self.checkpoint_path, dir))
]
_logger.debug('self.checkpoint_path: {}'.format(
self.checkpoint_path))
_logger.info('checkpoints: {}'.format(checkpoints))
if len(checkpoints) > 0:
latest = max([int(ck) for ck in checkpoints])
latest_ck_path = os.path.join(self.checkpoint_path, str(latest))
model_path = os.path.join(latest_ck_path, 'model')
context_path = os.path.join(latest_ck_path, 'context')
strategy_path = os.path.join(latest_ck_path, 'strategies')
if os.path.exists(context_path):
context.from_file(context_path)
context.epoch_id += 1
if os.path.exists(strategy_path):
with open(strategy_path, 'rb') as strategy_file:
if sys.version_info < (3, 0):
strategies = pickle.load(strategy_file)
else:
strategies = pickle.load(
strategy_file, encoding='bytes')
if os.path.exists(model_path):
exe = SlimGraphExecutor(context.place)
with scope_guard(context.scope):
context.optimize_graph.load_persistables(model_path,
exe)
context.optimize_graph.update_param_shape(context.scope)
context.optimize_graph.update_groups_of_conv()
context.eval_graph.update_param_shape(context.scope)
context.eval_graph.update_groups_of_conv()
_logger.info("Loaded params from: {}".format(model_path))
return context, strategies
def _save_checkpoint(self, context):
"""
Save checkpoints to file.
"""
if context.epoch_id % 1 == 0 and self.checkpoint_path:
checkpoint_path = os.path.join(self.checkpoint_path,
str(context.epoch_id))
model_path = os.path.join(checkpoint_path, 'model')
context_path = os.path.join(checkpoint_path, 'context')
strategy_path = os.path.join(checkpoint_path, 'strategies')
if not os.path.isdir(model_path):
os.makedirs(model_path)
exe = SlimGraphExecutor(context.place)
with scope_guard(context.scope):
context.optimize_graph.save_persistables(model_path, exe)
context.to_file(context_path)
with open(strategy_path, 'wb') as strategy_file:
pickle.dump(self.strategies, strategy_file)
_logger.info('Saved checkpoint to: {}'.format(checkpoint_path))
def _train_one_epoch(self, context):
"""
Train one epoch.
"""
executor = SlimGraphExecutor(self.place)
if context.optimize_graph.compiled_graph is None:
context.optimize_graph.compiled_graph = compiler.CompiledProgram(
context.optimize_graph.program).with_data_parallel(
loss_name=context.optimize_graph.out_nodes['loss'])
for data in context.train_reader():
for strategy in self.strategies:
strategy.on_batch_begin(context)
results = executor.run(context.optimize_graph,
context.scope,
data=data)
results = [float(np.mean(result)) for result in results]
if context.batch_id % 20 == 0:
_logger.info("epoch:{}; batch_id:{}; {} = {}".format(
context.epoch_id, context.batch_id,
context.optimize_graph.out_nodes.keys(
), [round(r, 3) for r in results]))
for strategy in self.strategies:
strategy.on_batch_end(context)
context.batch_id += 1
context.batch_id = 0
def _eval(self, context):
"""
Running evaluation.
"""
results, names = context.run_eval_graph()
for name, result in zip(names, results):
if name not in context.eval_results:
context.eval_results[name] = []
context.eval_results[name].append(result)
def run(self):
"""
Execute compression pass.
"""
context = Context(
place=self.place,
scope=self.scope,
train_graph=self.train_graph,
train_reader=self.train_reader,
eval_graph=self.eval_graph,
eval_reader=self.eval_reader,
teacher_graphs=self.teacher_graphs,
train_optimizer=self.train_optimizer,
distiller_optimizer=self.distiller_optimizer)
self.context = context
if self.teacher_graphs:
context.put('teachers', self.teacher_graphs)
self._init_model(context)
if not context.optimize_graph:
if context.train_optimizer:
context.train_optimizer._name = 'train_opt'
context.optimize_graph = context.train_graph.get_optimize_graph(
context.train_optimizer, context.place, context.scope)
else:
context.optimize_graph = context.train_graph
context, self.strategies = self._load_checkpoint(context)
for strategy in self.strategies:
strategy.on_compression_begin(context)
start = context.epoch_id
self._eval(context)
for epoch in range(start, self.epoch):
context.epoch_id = epoch
for strategy in self.strategies:
strategy.on_epoch_begin(context)
self._train_one_epoch(context)
for strategy in self.strategies:
strategy.on_epoch_end(context)
if self.eval_epoch and epoch % self.eval_epoch == 0:
self._eval(context)
self._save_checkpoint(context)
for strategy in self.strategies:
strategy.on_compression_end(context)
return context.eval_graph
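# A hedged end-to-end sketch of driving Compressor; the programs, variables,
# readers and 'compress.yaml' are assumptions supplied by the caller.
def _compressor_example(train_program, eval_program, image, label, loss, acc,
                        train_reader, eval_reader):
    import paddle.fluid as fluid
    place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() \
        else fluid.CPUPlace()
    com = Compressor(
        place,
        fluid.global_scope(),
        train_program,
        train_reader=train_reader,
        train_feed_list=[('image', image.name), ('label', label.name)],
        train_fetch_list=[('loss', loss.name)],
        eval_program=eval_program,
        eval_reader=eval_reader,
        eval_feed_list=[('image', image.name), ('label', label.name)],
        eval_fetch_list=[('acc', acc.name)],
        train_optimizer=fluid.optimizer.Momentum(0.01, 0.9))
    com.config('compress.yaml')  # epoch/strategies come from the yaml file
    return com.run()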
...@@ -17,7 +17,7 @@ import funcsigs ...@@ -17,7 +17,7 @@ import funcsigs
import yaml import yaml
from collections import OrderedDict from collections import OrderedDict
from ..prune import * from ..prune import *
from .compress_pass import * from ..quantization import *
from .strategy import * from .strategy import *
__all__ = ['ConfigFactory'] __all__ = ['ConfigFactory']
...@@ -29,15 +29,10 @@ class ConfigFactory(object): ...@@ -29,15 +29,10 @@ class ConfigFactory(object):
def __init__(self, config): def __init__(self, config):
"""Init a factory from configure file.""" """Init a factory from configure file."""
self.instances = {} self.instances = {}
self.compressor = {}
self.version = None self.version = None
self._parse_config(config) self._parse_config(config)
def get_compress_pass(self):
"""
Get compress pass from factory.
"""
return self.instance('compress_pass')
def instance(self, name): def instance(self, name):
""" """
Get instance from factory. Get instance from factory.
...@@ -59,8 +54,16 @@ class ConfigFactory(object): ...@@ -59,8 +54,16 @@ class ConfigFactory(object):
args = {} args = {}
for key in keys: for key in keys:
value = attrs[key] value = attrs[key]
if isinstance(value, str) and value.lower() == 'none':
value = None
if isinstance(value, str) and value in self.instances: if isinstance(value, str) and value in self.instances:
value = self.instances[value] value = self.instances[value]
if isinstance(value, list):
for i in range(len(value)):
if isinstance(value[i],
str) and value[i] in self.instances:
value[i] = self.instances[value[i]]
args[key] = value args[key] = value
self.instances[name] = class_(**args) self.instances[name] = class_(**args)
return self.instances.get(name) return self.instances.get(name)
...@@ -76,16 +79,23 @@ class ConfigFactory(object): ...@@ -76,16 +79,23 @@ class ConfigFactory(object):
assert self.version == int(key_values['version']) assert self.version == int(key_values['version'])
# parse pruners # parse pruners
if key == 'pruners' or key == 'strategies': if key == 'distillers' or key == 'pruners' or key == 'quantizers' or key == 'strategies':
instances = key_values[key] instances = key_values[key]
for name in instances: for name in instances:
self._new_instance(name, instances[name]) self._new_instance(name, instances[name])
if key == 'compress_pass': if key == 'compressor':
compress_pass = self._new_instance(key, key_values[key]) self.compressor['strategies'] = []
for name in key_values[key]['strategies']: self.compressor['epoch'] = key_values[key]['epoch']
strategy = self.instance(name) if 'init_model' in key_values[key]:
compress_pass.add_strategy(strategy) self.compressor['init_model'] = key_values[key][
'init_model']
self.compressor['checkpoint_path'] = key_values[key][
'checkpoint_path']
if 'strategies' in key_values[key]:
for name in key_values[key]['strategies']:
strategy = self.instance(name)
self.compressor['strategies'].append(strategy)
if key == 'include': if key == 'include':
for config_file in key_values[key]: for config_file in key_values[key]:
......
...@@ -20,7 +20,7 @@ class Strategy(object): ...@@ -20,7 +20,7 @@ class Strategy(object):
Base class for all strategies. Base class for all strategies.
""" """
def __init__(self, start_epoch=0, end_epoch=10): def __init__(self, start_epoch=0, end_epoch=0):
""" """
Args: Args:
start_epoch: The first epoch to apply the strategy. start_epoch: The first epoch to apply the strategy.
...@@ -29,7 +29,7 @@ class Strategy(object): ...@@ -29,7 +29,7 @@ class Strategy(object):
self.start_epoch = start_epoch self.start_epoch = start_epoch
self.end_epoch = end_epoch self.end_epoch = end_epoch
def on_compress_begin(self, context): def on_compression_begin(self, context):
pass pass
def on_epoch_begin(self, context): def on_epoch_begin(self, context):
...@@ -44,5 +44,5 @@ class Strategy(object): ...@@ -44,5 +44,5 @@ class Strategy(object):
def on_batch_end(self, context): def on_batch_end(self, context):
pass pass
def on_compress_end(self, context): def on_compression_end(self, context):
pass pass
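A concrete strategy only needs to override the callbacks it uses; a minimal hedged sketch (the logging behaviour is purely illustrative):

class LoggingStrategy(Strategy):
    """Print progress at epoch boundaries; purely illustrative."""

    def on_epoch_begin(self, context):
        if self.start_epoch <= context.epoch_id <= self.end_epoch:
            print('epoch {} begins'.format(context.epoch_id))

    def on_epoch_end(self, context):
        print('epoch {} finished'.format(context.epoch_id))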
version: 1.0
pruners:
pruner_1:
class: 'RatioPruner'
ratios:
'conv1_1.w': 0.3
'conv1_2.w': 0.4
'*': 0.9
group_dims:
'*': [1, 2, 3]
criterions:
'*': 'l1-norm'
strategies:
strategy_1:
class: 'SensitivePruneStrategy'
pruner: 'pruner_1'
start_epoch: 0
end_epoch: 10
delta_rate: 0.20
acc_loss_threshold: 0.2
sensitivities:
'conv1_1.w': 0.4
compress_pass:
class: 'CompressPass'
epoch: 100
strategies:
- strategy_1
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle
import os
import sys
from paddle.fluid.contrib.slim import CompressPass
from paddle.fluid.contrib.slim import build_compressor
from paddle.fluid.contrib.slim import ImitationGraph
class LinearModel(object):
def __init__(self):
pass
def train(self):
train_program = fluid.Program()
startup_program = fluid.Program()
startup_program.random_seed = 10
with fluid.program_guard(train_program, startup_program):
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=predict, label=y)
avg_cost = fluid.layers.mean(cost)
eval_program = train_program.clone()
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
eval_reader = paddle.batch(
paddle.dataset.uci_housing.test(), batch_size=1)
place = fluid.CPUPlace()
train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(startup_program)
train_metrics = {"loss": avg_cost.name}
eval_metrics = {"loss": avg_cost.name}
graph = ImitationGraph(train_program)
config = './config.yaml'
comp_pass = build_compressor(
place,
data_reader=train_reader,
data_feeder=train_feeder,
scope=fluid.global_scope(),
metrics=train_metrics,
epoch=1,
config=config)
comp_pass.apply(graph)
if __name__ == "__main__":
model = LinearModel()
model.train()
...@@ -14,10 +14,7 @@ ...@@ -14,10 +14,7 @@
from . import executor from . import executor
from .executor import * from .executor import *
from . import graph from . import graph_wrapper
from .graph import * from .graph_wrapper import *
from . import graph_pass
from .graph_pass import *
__all__ = executor.__all__ __all__ = executor.__all__
__all__ += graph.__all__ __all__ += graph_wrapper.__all__
__all__ += graph_pass.__all__
...@@ -12,51 +12,46 @@ ...@@ -12,51 +12,46 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import abc from ....compiler import CompiledProgram
from abc import abstractmethod from ....data_feeder import DataFeeder
from .... import executor from .... import executor
from .graph import IRGraph, ImitationGraph from .graph_wrapper import GraphWrapper
__all__ = ['get_executor'] __all__ = ['SlimGraphExecutor']
class GraphExecutor(object): class SlimGraphExecutor(object):
__metaclass__ = abc.ABCMeta """
Wrapper of executor used to run GraphWrapper.
"""
def __init__(self, place): def __init__(self, place):
self.place = place
@abstractmethod
def run(self, graph, feches=None, feed=None):
pass
class IRGraphExecutor(GraphExecutor):
def run(self, grah, fetches, feed=None):
pass
class ImitationGraphExecutor(GraphExecutor):
def __init__(self, place):
super(ImitationGraphExecutor, self).__init__(place)
self.exe = executor.Executor(place) self.exe = executor.Executor(place)
self.place = place
def run(self, graph, scope=None, fetches=None, feed=None): def run(self, graph, scope, data=None):
assert isinstance(graph, ImitationGraph) """
fetch_list = None Runing a graph with a batch of data.
if fetches: Args:
fetch_list = [ graph(GraphWrapper): The graph to be executed.
graph.program.global_block().var(name) for name in fetches scope(fluid.core.Scope): The scope to be used.
] data(list<tuple>): A batch of data. Each tuple in this list is a sample.
results = self.exe.run(graph.program, It will feed the items of tuple to the in_nodes of graph.
Returns:
results(list): A list of result with the same order indicated by graph.out_nodes.
"""
assert isinstance(graph, GraphWrapper)
if data is not None:
feeder = DataFeeder(
feed_list=graph.in_nodes.values(),
place=self.place,
program=graph.program)
feed = feeder.feed(data)
fetch_list = graph.out_nodes.values()
program = graph.compiled_graph if graph.compiled_graph else graph.program
results = self.exe.run(program,
scope=scope, scope=scope,
fetch_list=fetch_list, fetch_list=fetch_list,
feed=feed) feed=feed)
return results return results
def get_executor(graph, place):
if isinstance(graph, ImitationGraph):
return ImitationGraphExecutor(place)
if isinstance(graph, IRGraph):
return IRGraphExecutor(place)
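With the rewrite, callers construct the executor directly instead of going through get_executor; a hedged sketch, where graph, scope, place and batch are assumed to be provided by the caller:

exe = SlimGraphExecutor(place)
# graph is a GraphWrapper whose in_nodes match the items of each sample in
# batch; results come back in the order given by graph.out_nodes.
results = exe.run(graph, scope, data=batch)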
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from .... import io
from .... import compiler
from ....framework import Program
from ....framework import program_guard
from ....framework import Parameter
from ....framework import Variable
from ....executor import Executor
import copy
from collections import Iterable
from ....io import save_inference_model, load_inference_model, save_persistables
import numpy as np
import pickle
import os
__all__ = ['GraphWrapper', 'VarWrapper', 'OpWrapper']
OPTIMIZER_OPS = [
'momentum',
'lars_momentum',
'adagrad',
'adam',
'adamax',
'decayed_adagrad',
'adadelta',
'rmsprop',
]
class VarWrapper(object):
def __init__(self, var, graph):
assert isinstance(var, Variable)
assert isinstance(graph, GraphWrapper)
self._var = var
self._graph = graph
def __eq__(self, v):
"""
Overwrite this function for ...in... syntax in python.
"""
return self._var.name == v._var.name
def name(self):
"""
Get the name of the variable.
"""
return self._var.name
def shape(self):
"""
Get the shape of the variable.
"""
return self._var.shape
def set_shape(self, shape):
"""
Set the shape of the variable.
"""
self._var.desc.set_shape(shape)
def inputs(self):
"""
Get all the operators that use this variable as output.
Returns:
list<OpWrapper>: A list of operators.
"""
ops = []
for op in self._graph.ops():
if self in op.all_inputs():
ops.append(op)
return ops
def outputs(self):
"""
Get all the operators that use this variable as input.
Returns:
list<OpWrapper>: A list of operators.
"""
ops = []
for op in self._graph.ops():
if self in op.all_outputs():
ops.append(op)
return ops
class OpWrapper(object):
def __init__(self, op, graph):
assert isinstance(graph, GraphWrapper)
self._op = op
self._graph = graph
def __eq__(self, op):
"""
Overwrite this function for ...in... syntax in python.
"""
return self.idx() == op.idx()
def all_inputs(self):
"""
Get all the input variables of this operator.
"""
return [
self._graph.var(var_name) for var_name in self._op.input_arg_names
]
def all_outputs(self):
"""
Get all the output variables of this operator.
"""
return [
self._graph.var(var_name) for var_name in self._op.output_arg_names
]
def idx(self):
"""
Get the id of this operator.
"""
return self._op.idx
def type(self):
"""
Get the type of this operator.
"""
return self._op.type
def is_bwd_op(self):
"""
Whether this operator is backward op.
"""
return self.type().endswith('_grad')
def is_opt_op(self):
"""
Whether this operator is optimizer op.
"""
return self.type() in OPTIMIZER_OPS
def inputs(self, name):
"""
Get all the variables by the input name.
"""
return [self._graph.var(var_name) for var_name in self._op.input(name)]
def outputs(self, name):
"""
Get all the variables by the output name.
"""
return [self._graph.var(var_name) for var_name in self._op.output(name)]
def set_attr(self, key, value):
"""
Set the value of attribute by attribute's name.
Args:
key(str): the attribute name.
value(bool|int|str|float|list): the value of the attribute.
"""
self._op._set_attr(key, value)
def attr(self, name):
"""
Get the attribute by name.
Args:
name(str): the attribute name.
Returns:
bool|int|str|float|list: The attribute value. The return value
can be any valid attribute type.
"""
return self._op.attr(name)
class GraphWrapper(object):
"""
It is a wrapper of paddle.fluid.framework.IrGraph with some special functions
for paddle slim framework.
"""
def __init__(self, program=None, in_nodes=[], out_nodes=[]):
"""
Args:
program(framework.Program): The program to be wrapped.
in_nodes(dict): A dict to indicate the input nodes of the graph.
The key is user-defined and human-readable name.
The value is the name of Variable.
out_nodes(dict): A dict to indicate the output nodes of the graph.
The key is user-defined and human-readable name.
The value is the name of Variable.
"""
super(GraphWrapper, self).__init__()
self.program = Program() if program is None else program
self.compiled_graph = None
self.in_nodes = OrderedDict(in_nodes)
self.out_nodes = OrderedDict(out_nodes)
self._attrs = OrderedDict()
def all_parameters(self):
"""
Get all the parameters in this graph.
Returns:
list<VarWrapper>: A list of VarWrapper instances.
"""
params = []
for block in self.program.blocks:
for param in block.all_parameters():
params.append(VarWrapper(param, self))
return params
def is_parameter(self, var):
"""
Whether the given variable is parameter.
Args:
var(VarWrapper): The given variable.
"""
return isinstance(var._var, Parameter)
def is_persistable(self, var):
"""
Whether the given variable is persistable.
Args:
var(VarWrapper): The given variable.
"""
return var._var.persistable
def compile(self, for_parallel=True, for_test=False):
"""
Compile the program in this wrapper to framework.CompiledProgram for next running.
This function must be called if the program is modified.
Args:
for_parallel(bool): Whether the program to run in data parallel way. default: True.
for_test(bool): Whether the compiled program is used for test.
"""
target = self.program
if for_test:
loss = None
else:
loss = self.out_nodes['loss']
if for_parallel:
# disable memory optimize for stable training
build_strategy = compiler.BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
self.compiled_graph = compiler.CompiledProgram(
target).with_data_parallel(
loss_name=loss, build_strategy=build_strategy)
else:
self.compiled_graph = compiler.CompiledProgram(target)
def ops(self):
"""
Return all the operator nodes included in the graph as a list.
"""
ops = []
for block in self.program.blocks:
for op in block.ops:
ops.append(OpWrapper(op, self))
return ops
def vars(self):
"""
Get all the variables.
"""
return [VarWrapper(var, self) for var in self.program.list_vars()]
def var(self, name):
"""
Get the variable by variable name.
"""
return VarWrapper(self.program.global_block().var(name), self)
def clone(self, for_test=False):
"""
Clone a new graph from current graph.
Returns:
(GraphWrapper): The wrapper of a new graph.
"""
return GraphWrapper(
self.program.clone(for_test),
copy.deepcopy(self.in_nodes), copy.deepcopy(self.out_nodes))
def merge(self, graph):
"""
Merge a graph into current graph.
Args:
graph(GraphWrapper): The graph to be merged by current graph.
"""
for var in graph.program.list_vars():
self.program.global_block()._clone_variable(var)
# TODO: parameters should be cloned
for op in graph.ops():
op = op._op
inputs = {}
outputs = {}
attrs = {}
for input_name in op.input_names:
inputs[input_name] = [
self.var(in_var_name)
for in_var_name in op.input(input_name)
]
for output_name in op.output_names:
outputs[output_name] = [
self.var(out_var_name)
for out_var_name in op.output(output_name)
]
for attr_name in op.attr_names:
attrs[attr_name] = op.attr(attr_name)
self.program.global_block().append_op(
type=op.type, inputs=inputs, outputs=outputs, attrs=attrs)
def program(self):
"""
Get the program in current wrapper.
"""
return self.program
def pre_ops(self, op):
"""
Get all the previous operators of target operator.
Args:
op(OpWrapper): Target operator.
Returns:
list<OpWrapper>: A list of operators.
"""
ops = []
for p in self.ops():
for in_var in op.all_inputs():
if in_var in p.all_outputs():
ops.append(p)
return ops
def next_ops(self, op):
"""
Get all the next operators of target operator.
Args:
op(OpWrapper): Target operator.
Returns:
list<OpWrapper>: A list of operators.
"""
ops = []
for p in self.ops():
for out_var in op.all_outputs():
if out_var in p.all_inputs():
ops.append(p)
return ops
def get_param_by_op(self, op):
"""
Get the parameters used by target operator.
"""
assert isinstance(op, OpWrapper)
params = []
for var in op.all_inputs():
if isinstance(var._var, Parameter):
params.append(var)
assert len(params) > 0
return params
def numel_params(self):
"""
Get the number of elements in all parameters.
"""
ret = 0
for param in self.all_parameters():
ret += np.product(param.shape())
return ret
def get_optimize_graph(self, optimizer, place, scope, no_grad_var_names=[]):
"""
Get a new graph for training by appending some backward operators and optimization operators.
Args:
optimizer: The optimizer used to generate the training graph.
place: The place to run the graph.
scope: The scope used to run the graph. Some new variable will be added into this scope.
no_grad_var_names(list<str>): Names of variables that should be ignored while computing gradients. default: [].
Returns:
(GraphWrapper): The wrapper of new graph with backward ops and optimization ops.
"""
graph = self.clone()
startup_program = Program()
with program_guard(
main_program=graph.program, startup_program=startup_program):
target_name = None
if 'loss' in graph.out_nodes:
target_name = graph.out_nodes['loss']
elif 'cost' in graph.out_nodes:
target_name = graph.out_nodes['cost']
target = graph.var(target_name)._var
optimizer.minimize(target, no_grad_set=no_grad_var_names)
exe = Executor(place)
exe.run(program=startup_program, scope=scope)
return graph
def flops(self, only_conv=False):
"""
Get the flops of current graph.
Args:
only_conv: Only calculating the conv layers. default: False.
Returns:
int: The flops of current graph.
"""
flops = 0
for op in self.ops():
if op.type() in ['conv2d', 'depthwise_conv2d']:
filter_shape = op.inputs("Filter")[0].shape()
input_shape = op.inputs("Input")[0].shape()
output_shape = op.outputs("Output")[0].shape()
c_out, c_in, k_h, k_w = filter_shape
_, _, h_out, w_out = output_shape
groups = op.attr("groups")
kernel_ops = k_h * k_w * (c_in / groups)
if len(op.inputs("Bias")) > 0:
with_bias = 1
else:
with_bias = 0
flops += 2 * h_out * w_out * c_out * (kernel_ops + with_bias)
elif op.type() == 'pool2d' and not only_conv:
input_shape = op.inputs("X")[0].shape()
output_shape = op.outputs("Out")[0].shape()
_, c_out, h_out, w_out = output_shape
k_size = op.attr("ksize")
flops += h_out * w_out * c_out * (k_size[0]**2)
elif op.type() == 'mul' and not only_conv:
x_shape = list(op.inputs("X")[0].shape())
y_shape = op.inputs("Y")[0].shape()
if x_shape[0] == -1:
x_shape[0] = 1
flops += 2 * x_shape[0] * x_shape[1] * y_shape[1]
elif op.type() in ['relu', 'sigmoid', 'batch_norm'
] and not only_conv:
input_shape = list(op.inputs("X")[0].shape())
if input_shape[0] == -1:
input_shape[0] = 1
flops += np.product(input_shape)
return flops
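    # Worked example (illustrative shapes): a 3x3 conv with c_in=64, c_out=128,
    # groups=1, no bias and a 32x32 output map contributes
    #     2 * 32 * 32 * 128 * (3 * 3 * 64 + 0) = 150,994,944
    # FLOPs to the total returned above.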
def save_persistables(self, path, exe):
"""
Save all the persistable variables into file.
Args:
path(str): The path to save the persistables.
exe(framework.Executor): The executor used to save the persistables.
"""
io.save_persistables(exe.exe, path, main_program=self.program)
def load_persistables(self, path, exe):
"""
Load the persistable variables from file.
Args:
path(str): The path to load the persistables.
exe(framework.Executor): The executor used to load the persistables.
"""
def if_exist(var):
return os.path.exists(os.path.join(path, var.name))
io.load_vars(
exe.exe, path, main_program=self.program, predicate=if_exist)
def update_param_shape(self, scope):
"""
Update the shape of parameters in the graph according to tensors in scope.
It is used after loading pruned parameters from file.
"""
for param in self.all_parameters():
tensor_shape = np.array(scope.find_var(param.name()).get_tensor(
)).shape
param.set_shape(tensor_shape)
def infer_shape(self):
"""
Re-infer the shapes of all the variables in the graph.
It is used after loading pruned parameters from file.
"""
for op in self.ops():
if op.type() != 'conditional_block':
op._op.desc.infer_shape(op._op.block.desc)
def update_groups_of_conv(self):
for op in self.ops():
if op.type() == 'depthwise_conv2d':
op.set_attr('groups', op.inputs('Filter')[0].shape()[0])
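# A hedged construction sketch for GraphWrapper; train_program and the
# image/label/loss/acc variables are assumptions from the caller's network.
graph = GraphWrapper(
    train_program,
    in_nodes=[('image', image.name), ('label', label.name)],
    out_nodes=[('loss', loss.name), ('acc', acc.name)])
print('parameter elements: {}'.format(graph.numel_params()))
print('FLOPs: {}'.format(graph.flops()))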
...@@ -13,54 +13,919 @@ ...@@ -13,54 +13,919 @@
# limitations under the License. # limitations under the License.
from ..core.strategy import Strategy from ..core.strategy import Strategy
from ....framework import Program, program_guard from ..graph import VarWrapper, OpWrapper, GraphWrapper
from ....framework import Program, program_guard, Parameter
from .... import layers from .... import layers
import prettytable as pt
import numpy as np import numpy as np
from scipy.optimize import leastsq
import copy
import re
import os
import pickle
import logging
import sys
__all__ = ['SensitivePruneStrategy', 'PruneStrategy'] __all__ = ['SensitivePruneStrategy', 'UniformPruneStrategy']
logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)
class PruneStrategy(Strategy):
"""
The base class of all pruning strategies.
"""
class SensitivePruneStrategy(Strategy):
def __init__(self, def __init__(self,
pruner=None, pruner=None,
start_epoch=0, start_epoch=0,
end_epoch=10, end_epoch=0,
delta_rate=0.20, target_ratio=0.5,
acc_loss_threshold=0.2, metric_name=None,
sensitivities=None): pruned_params='conv.*_weights'):
super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch) """
Args:
pruner(slim.Pruner): The pruner used to prune the parameters.
start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
target_ratio(float): The flops ratio to be pruned from current model.
metric_name(str): The metric used to evaluate the model.
It should be one of keys in out_nodes of graph wrapper.
pruned_params(str): The pattern str to match the parameter names to be pruned.
"""
super(PruneStrategy, self).__init__(start_epoch, end_epoch)
self.pruner = pruner self.pruner = pruner
self.delta_rate = delta_rate self.target_ratio = target_ratio
self.acc_loss_threshold = acc_loss_threshold self.metric_name = metric_name
self.sensitivities = sensitivities self.pruned_params = pruned_params
self.pruned_list = []
self.backup = {}
self.param_shape_backup = {}
def _eval_graph(self, context, sampled_rate=None, cached_id=0):
"""
Evaluate the current mode in context.
Args:
context(slim.core.Context): The context storing all information used to evaluate the current model.
sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
"""
results, names = context.run_eval_graph(sampled_rate, cached_id)
metric = np.mean(results[list(names).index(self.metric_name)])
return metric
class PruneStrategy(Strategy): def _prune_filters_by_ratio(self,
scope,
params,
ratio,
place,
lazy=False,
only_graph=False):
"""
Pruning filters by given ratio.
Args:
scope(fluid.core.Scope): The scope used to pruning filters.
params(list<VarWrapper>): A list of filter parameters.
ratio(float): The ratio to be pruned.
place(fluid.Place): The device place of filter parameters.
lazy(bool): True means setting the pruned elements to zero.
False means cutting down the pruned elements.
only_graph(bool): True means only modifying the graph.
False means modifying graph and variables in scope.
"""
if params[0].name() in self.pruned_list[0]:
return
param_t = scope.find_var(params[0].name()).get_tensor()
pruned_idx = self.pruner.cal_pruned_idx(
params[0].name(), np.array(param_t), ratio, axis=0)
for param in params:
assert isinstance(param, VarWrapper)
param_t = scope.find_var(param.name()).get_tensor()
if lazy:
self.backup[param.name()] = copy.deepcopy(np.array(param_t))
pruned_param = self.pruner.prune_tensor(
np.array(param_t), pruned_idx, pruned_axis=0, lazy=lazy)
if not only_graph:
param_t.set(pruned_param, place)
ori_shape = param.shape()
if param.name() not in self.param_shape_backup:
self.param_shape_backup[param.name()] = copy.deepcopy(
param.shape())
new_shape = list(param.shape())
new_shape[0] = pruned_param.shape[0]
param.set_shape(new_shape)
_logger.debug(
'|----------------------------------------+----+------------------------------+------------------------------|'
)
_logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
str(param.name()), str(0), str(ori_shape), str(param.shape())))
self.pruned_list[0].append(param.name())
return pruned_idx
def _prune_parameter_by_idx(self,
scope,
params,
pruned_idx,
pruned_axis,
place,
lazy=False,
only_graph=False):
"""
Pruning parameters in given axis.
Args:
scope(fluid.core.Scope): The scope storing parameters to be pruned.
params(list<VarWrapper>): A list of parameters to be pruned.
pruned_idx(list): The index of elements to be pruned.
pruned_axis(int): The pruning axis.
place(fluid.Place): The device place of filter parameters.
lazy(bool): True means setting the pruned elements to zero.
False means cutting down the pruned elements.
only_graph(bool): True means only modifying the graph.
False means modifying graph and variables in scope.
"""
if params[0].name() in self.pruned_list[pruned_axis]:
return
for param in params:
assert isinstance(param, VarWrapper)
param_t = scope.find_var(param.name()).get_tensor()
if lazy:
self.backup[param.name()] = copy.deepcopy(np.array(param_t))
pruned_param = self.pruner.prune_tensor(
np.array(param_t), pruned_idx, pruned_axis, lazy=lazy)
if not only_graph:
param_t.set(pruned_param, place)
ori_shape = param.shape()
if param.name() not in self.param_shape_backup:
self.param_shape_backup[param.name()] = copy.deepcopy(
param.shape())
new_shape = list(param.shape())
new_shape[pruned_axis] = pruned_param.shape[pruned_axis]
param.set_shape(new_shape)
_logger.debug(
'|----------------------------------------+----+------------------------------+------------------------------|'
)
_logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
str(param.name()),
str(pruned_axis), str(ori_shape), str(param.shape())))
self.pruned_list[pruned_axis].append(param.name())
def _forward_search_related_op(self, graph, param):
"""
Forward search operators that will be affected by pruning of param.
Args:
graph(GraphWrapper): The graph to be searched.
param(VarWrapper): The current pruned parameter.
Returns:
list<OpWrapper>: A list of operators.
"""
assert isinstance(param, VarWrapper)
visited = {}
for op in graph.ops():
visited[op.idx()] = False
stack = []
for op in graph.ops():
if (not op.is_bwd_op()) and (param in op.all_inputs()):
stack.append(op)
visit_path = []
while len(stack) > 0:
top_op = stack[len(stack) - 1]
if visited[top_op.idx()] == False:
visit_path.append(top_op)
visited[top_op.idx()] = True
next_ops = None
if top_op.type() == "conv2d" and param not in top_op.all_inputs():
next_ops = None
elif top_op.type() == "mul":
next_ops = None
else:
next_ops = self._get_next_unvisited_op(graph, visited, top_op)
if next_ops == None:
stack.pop()
else:
stack += next_ops
return visit_path
def _get_next_unvisited_op(self, graph, visited, top_op):
"""
Get next unvisited adjacent operators of given operators.
Args:
graph(GraphWrapper): The graph used to search.
visited(list): The ids of operators that has been visited.
top_op: The given operator.
Returns:
list<OpWrapper>: A list of operators.
"""
assert isinstance(top_op, OpWrapper)
next_ops = []
for op in graph.next_ops(top_op):
if (visited[op.idx()] == False) and (not op.is_bwd_op()):
next_ops.append(op)
return next_ops if len(next_ops) > 0 else None
def _get_accumulator(self, graph, param):
"""
Get accumulators of given parameter. The accumulator was created by optimizer.
Args:
graph(GraphWrapper): The graph used to search.
param(VarWrapper): The given parameter.
Returns:
list<VarWrapper>: A list of accumulators which are variables.
"""
assert isinstance(param, VarWrapper)
params = []
for op in param.outputs():
if op.is_opt_op():
for out_var in op.all_outputs():
if graph.is_persistable(out_var) and out_var.name(
) != param.name():
params.append(out_var)
return params
def _forward_pruning_ralated_params(self,
graph,
scope,
param,
place,
ratio=None,
pruned_idxs=None,
lazy=False,
only_graph=False):
"""
Prune all the parameters affected by the pruning of the given parameter.
Args:
graph(GraphWrapper): The graph to be searched.
scope(fluid.core.Scope): The scope storing the parameters to be pruned.
param(VarWrapper): The given parameter.
place(fluid.Place): The device place of the filter parameters.
ratio(float): The target ratio to be pruned.
pruned_idxs(list): The indexes of the elements to be pruned.
lazy(bool): True means setting the pruned elements to zero.
False means cutting out the pruned elements.
only_graph(bool): True means only modifying the graph.
False means modifying the graph and the variables in the scope.
"""
assert isinstance(
graph,
GraphWrapper), "graph must be instance of slim.core.GraphWrapper"
assert isinstance(
param, VarWrapper), "param must be instance of slim.core.VarWrapper"
if param.name() in self.pruned_list[0]:
return
related_ops = self._forward_search_related_op(graph, param)
if ratio is None:
assert pruned_idxs is not None
self._prune_parameter_by_idx(
scope, [param] + self._get_accumulator(graph, param),
pruned_idxs,
pruned_axis=0,
place=place,
lazy=lazy,
only_graph=only_graph)
else:
pruned_idxs = self._prune_filters_by_ratio(
scope, [param] + self._get_accumulator(graph, param),
ratio,
place,
lazy=lazy,
only_graph=only_graph)
corrected_idxs = pruned_idxs[:]
for idx, op in enumerate(related_ops):
if op.type() == "conv2d" and (param not in op.all_inputs()):
for in_var in op.all_inputs():
if graph.is_parameter(in_var):
conv_param = in_var
self._prune_parameter_by_idx(
scope, [conv_param] + self._get_accumulator(
graph, conv_param),
corrected_idxs,
pruned_axis=1,
place=place,
lazy=lazy,
only_graph=only_graph)
if op.type() == "depthwise_conv2d":
for in_var in op.all_inputs():
if graph.is_parameter(in_var):
conv_param = in_var
self._prune_parameter_by_idx(
scope, [conv_param] + self._get_accumulator(
graph, conv_param),
corrected_idxs,
pruned_axis=0,
place=place,
lazy=lazy,
only_graph=only_graph)
elif op.type() == "elementwise_add":
# pruning bias
for in_var in op.all_inputs():
if graph.is_parameter(in_var):
bias_param = in_var
self._prune_parameter_by_idx(
scope, [bias_param] + self._get_accumulator(
graph, bias_param),
pruned_idxs,
pruned_axis=0,
place=place,
lazy=lazy,
only_graph=only_graph)
elif op.type() == "mul": # pruning fc layer
fc_input = None
fc_param = None
for in_var in op.all_inputs():
if graph.is_parameter(in_var):
fc_param = in_var
else:
fc_input = in_var
idx = []
feature_map_size = fc_input.shape()[2] * fc_input.shape()[3]
range_idx = np.array(range(feature_map_size))
for i in corrected_idxs:
idx += list(range_idx + i * feature_map_size)
corrected_idxs = idx
self._prune_parameter_by_idx(
scope, [fc_param] + self._get_accumulator(graph, fc_param),
corrected_idxs,
pruned_axis=0,
place=place,
lazy=lazy,
only_graph=only_graph)
elif op.type() == "concat":
concat_inputs = op.all_inputs()
last_op = related_ops[idx - 1]
for out_var in last_op.all_outputs():
if out_var in concat_inputs:
concat_idx = concat_inputs.index(out_var)
offset = 0
for ci in range(concat_idx):
offset += concat_inputs[ci].shape()[1]
corrected_idxs = [x + offset for x in pruned_idxs]
elif op.type() == "batch_norm":
bn_inputs = op.all_inputs()
mean = bn_inputs[2]
variance = bn_inputs[3]
alpha = bn_inputs[0]
beta = bn_inputs[1]
self._prune_parameter_by_idx(
scope, [mean] + self._get_accumulator(graph, mean),
corrected_idxs,
pruned_axis=0,
place=place,
lazy=lazy,
only_graph=only_graph)
self._prune_parameter_by_idx(
scope, [variance] + self._get_accumulator(graph, variance),
corrected_idxs,
pruned_axis=0,
place=place,
lazy=lazy,
only_graph=only_graph)
self._prune_parameter_by_idx(
scope, [alpha] + self._get_accumulator(graph, alpha),
corrected_idxs,
pruned_axis=0,
place=place,
lazy=lazy,
only_graph=only_graph)
self._prune_parameter_by_idx(
scope, [beta] + self._get_accumulator(graph, beta),
corrected_idxs,
pruned_axis=0,
place=place,
lazy=lazy,
only_graph=only_graph)
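# Illustrative numpy sketch (made-up shapes, not part of the module) of how the
# pruned filter indexes propagate through the branches above: the pruned conv
# loses output channels on axis 0, the following conv drops the same indexes on
# its input-channel axis 1, and batch_norm parameters drop them on axis 0.
import numpy as np
conv1_w = np.random.rand(8, 3, 3, 3)    # [out_channels, in_channels, k, k]
conv2_w = np.random.rand(16, 8, 3, 3)
bn_scale = np.random.rand(8)
pruned_idx = [1, 5, 7]                  # filters of conv1 selected for removal
keep = [i for i in range(8) if i not in pruned_idx]
conv1_w = conv1_w[keep, :, :, :]        # axis 0 of the pruned conv
conv2_w = conv2_w[:, keep, :, :]        # axis 1 of the following conv
bn_scale = bn_scale[keep]               # axis 0 of the batch_norm parameters
assert conv1_w.shape == (5, 3, 3, 3) and conv2_w.shape == (16, 5, 3, 3)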
def _prune_parameters(self,
graph,
scope,
params,
ratios,
place,
lazy=False,
only_graph=False):
"""
Prune the given parameters.
Args:
graph(GraphWrapper): The graph to be searched.
scope(fluid.core.Scope): The scope storing the parameters to be pruned.
params(list<str>): A list of parameter names to be pruned.
ratios(list<float>): A list of ratios used to prune the parameters.
place(fluid.Place): The device place of the filter parameters.
lazy(bool): True means setting the pruned elements to zero.
False means cutting out the pruned elements.
only_graph(bool): True means only modifying the graph.
False means modifying the graph and the variables in the scope.
"""
_logger.debug('\n################################')
_logger.debug('# pruning parameters #')
_logger.debug('################################\n')
_logger.debug(
'|----------------------------------------+----+------------------------------+------------------------------|'
)
_logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format('parameter', 'axis',
'from', 'to'))
assert len(params) == len(ratios)
self.pruned_list = [[], []]
for param, ratio in zip(params, ratios):
assert isinstance(param, str) or isinstance(param, unicode)
param = graph.var(param)
self._forward_pruning_ralated_params(
graph,
scope,
param,
place,
ratio=ratio,
lazy=lazy,
only_graph=only_graph)
ops = param.outputs()
for op in ops:
if op.type() == 'conv2d':
brother_ops = self._search_brother_ops(graph, op)
for brother in brother_ops:
for p in graph.get_param_by_op(brother):
self._forward_pruning_ralated_params(
graph,
scope,
p,
place,
ratio=ratio,
lazy=lazy,
only_graph=only_graph)
_logger.debug(
'|----------------------------------------+----+------------------------------+------------------------------|'
)
def _search_brother_ops(self, graph, op_node):
"""
Search brother operators that was affected by pruning of given operator.
Args:
graph(GraphWrapper): The graph to be searched.
op_node(OpWrapper): The start node for searching.
Returns:
list<VarWrapper>: A list of operators.
"""
visited = [op_node.idx()]
stack = []
brothers = []
for op in graph.next_ops(op_node):
if (op.type() != 'conv2d') and (op.type() != 'fc') and (
not op._is_bwd_op()):
stack.append(op)
visited.append(op.idx())
while len(stack) > 0:
top_op = stack.pop()
for parent in graph.pre_ops(top_op):
if parent.idx() not in visited and (not parent._is_bwd_op()):
if ((parent.type() == 'conv2d') or (parent.type() == 'fc')):
brothers.append(parent)
else:
stack.append(parent)
visited.append(parent.idx())
for child in graph.next_ops(top_op):
if (child.type() != 'conv2d') and (child.type() != 'fc') and (
child.idx() not in visited) and (
not child._is_bwd_op()):
stack.append(child)
visited.append(child.idx())
return brothers
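# Illustrative sketch (not part of the module): why brother convs matter. Both
# inputs of an elementwise_add must keep identical channel indexes, so the conv
# that produced the shortcut branch is pruned with the same index set.
import numpy as np
branch_a = np.random.rand(1, 8, 4, 4)   # output of the pruned conv
branch_b = np.random.rand(1, 8, 4, 4)   # output of its brother conv
keep = [0, 2, 3, 5, 6]                  # channels surviving the prune
summed = branch_a[:, keep] + branch_b[:, keep]
assert summed.shape == (1, 5, 4, 4)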
def _prune_graph(self, graph, target_graph):
"""
Prune the parameters of graph according to the target graph.
Args:
graph(GraphWrapper): The graph to be pruned.
target_graph(GraphWrapper): The reference graph.
Return: None
"""
count = 1
_logger.debug(
'|----+----------------------------------------+------------------------------+------------------------------|'
)
_logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format('id', 'parameter',
'from', 'to'))
for param in target_graph.all_parameters():
var = graph.var(param.name())
ori_shape = var.shape()
var.set_shape(param.shape())
_logger.debug(
'|----+----------------------------------------+------------------------------+------------------------------|'
)
_logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format(
str(count),
str(param.name()), str(ori_shape), str(param.shape())))
count += 1
_logger.debug(
'|----+----------------------------------------+------------------------------+------------------------------|'
)
class UniformPruneStrategy(PruneStrategy):
"""
The uniform pruning strategy. The parameters will be pruned by uniform ratio.
"""
def __init__(self,
pruner=None,
start_epoch=0,
end_epoch=0,
target_ratio=0.5,
metric_name=None,
pruned_params='conv.*_weights'):
"""
Args:
pruner(slim.Pruner): The pruner used to prune the parameters.
start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
target_ratio(float): The flops ratio to be pruned from current model.
metric_name(str): The metric used to evaluate the model.
It should be one of keys in out_nodes of graph wrapper.
pruned_params(str): The pattern str to match the parameter names to be pruned.
"""
super(UniformPruneStrategy, self).__init__(pruner, start_epoch,
end_epoch, target_ratio,
metric_name, pruned_params)
def _get_best_ratios(self, context):
"""
Search a group of ratios for pruning target flops.
"""
_logger.info('_get_best_ratios')
pruned_params = []
for param in context.eval_graph.all_parameters():
if re.match(self.pruned_params, param.name()):
pruned_params.append(param.name())
min_ratio = 0.
max_ratio = 1.
flops = context.eval_graph.flops()
model_size = context.eval_graph.numel_params()
while min_ratio < max_ratio:
ratio = (max_ratio + min_ratio) / 2
_logger.debug(
'-----------Try pruning ratio: {:.2f}-----------'.format(ratio))
ratios = [ratio] * len(pruned_params)
self._prune_parameters(
context.eval_graph,
context.scope,
pruned_params,
ratios,
context.place,
only_graph=True)
pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
pruned_size = 1 - (float(context.eval_graph.numel_params()) /
model_size)
_logger.debug('Pruned flops: {:.2f}'.format(pruned_flops))
_logger.debug('Pruned model size: {:.2f}'.format(pruned_size))
for param in self.param_shape_backup.keys():
context.eval_graph.var(param).set_shape(self.param_shape_backup[
param])
self.param_shape_backup = {}
if abs(pruned_flops - self.target_ratio) < 1e-2:
break
if pruned_flops > self.target_ratio:
max_ratio = ratio
else:
min_ratio = ratio
_logger.info('Get ratios: {}'.format([round(r, 2) for r in ratios]))
return pruned_params, ratios
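# Condensed sketch (not part of the module) of the binary search above, with a
# made-up flops_after(ratio) standing in for re-evaluating the pruned eval graph.
def search_uniform_ratio(flops_after, base_flops, target_ratio, tol=1e-2):
    lo, hi, ratio = 0.0, 1.0, 0.0
    while lo < hi:
        ratio = (lo + hi) / 2
        pruned_flops = 1 - float(flops_after(ratio)) / base_flops
        if abs(pruned_flops - target_ratio) < tol:
            break
        if pruned_flops > target_ratio:
            hi = ratio
        else:
            lo = ratio
    return ratio
# Toy model where FLOPs shrink quadratically with the channel pruning ratio:
base = 1000.0
ratio = search_uniform_ratio(lambda r: base * (1 - r) ** 2, base, 0.5)
assert abs((1 - (1 - ratio) ** 2) - 0.5) < 1e-2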
def on_epoch_begin(self, context):
if context.epoch_id == self.start_epoch:
params, ratios = self._get_best_ratios(context)
self._prune_parameters(context.optimize_graph, context.scope,
params, ratios, context.place)
model_size = context.eval_graph.numel_params()
flops = context.eval_graph.flops()
_logger.debug('\n################################')
_logger.debug('# pruning eval graph #')
_logger.debug('################################\n')
self._prune_graph(context.eval_graph, context.optimize_graph)
context.optimize_graph.update_groups_of_conv()
context.eval_graph.update_groups_of_conv()
_logger.info(
'------------------finish pruning--------------------------------'
)
_logger.info('Pruned size: {:.2f}'.format(1 - (float(
context.eval_graph.numel_params()) / model_size)))
_logger.info('Pruned flops: {:.2f}'.format(1 - (float(
context.eval_graph.flops()) / flops)))
# metric = self._eval_graph(context)
# _logger.info('Metric after pruning: {:.2f}'.format(metric))
_logger.info(
'------------------UniformPruneStrategy.on_compression_begin finish--------------------------------'
)
class SensitivePruneStrategy(PruneStrategy):
"""
Sensitive pruning strategy. A different pruning ratio is applied to each layer.
"""
def __init__(self,
pruner=None,
start_epoch=0,
end_epoch=0,
delta_rate=0.20,
target_ratio=0.5,
metric_name='top1_acc',
pruned_params='conv.*_weights',
sensitivities_file='./sensitivities.data',
sensitivities={},
num_steps=1,
eval_rate=None):
"""
Args:
pruner(slim.Pruner): The pruner used to prune the parameters.
start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0.
end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0.
delta_rate(float): The delta used to generate ratios when calculating sensitivities. default: 0.2
target_ratio(float): The flops ratio to be pruned from current model. default: 0.5
metric_name(str): The metric used to evaluate the model.
It should be one of keys in out_nodes of graph wrapper. default: 'top1_acc'
pruned_params(str): The pattern str to match the parameter names to be pruned. default: 'conv.*_weights'.
sensitivities_file(str): The sensitivities file. default: './sensitivities.data'
sensitivities(dict): The user-defined sensitivities. default: {}.
num_steps(int): The number of pruning steps. default: 1.
eval_rate(float): The rate of sampled data used to calculate sensitivities.
None means using all the data. default: None.
"""
super(SensitivePruneStrategy, self).__init__(pruner, start_epoch,
end_epoch, target_ratio,
metric_name, pruned_params)
self.delta_rate = delta_rate
self.pruned_list = []
self.sensitivities = sensitivities
self.sensitivities_file = sensitivities_file
self.backup = {}
self.param_shape_backup = {}
self.num_steps = num_steps
self.eval_rate = eval_rate
self.pruning_step = 1 - pow((1 - target_ratio), 1.0 / self.num_steps)
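# Quick check of the step formula above (assumed values): removing the same
# fraction of the remaining FLOPs at every step compounds to target_ratio.
target_ratio, num_steps = 0.5, 2
pruning_step = 1 - pow(1 - target_ratio, 1.0 / num_steps)   # ~0.293
remaining = 1.0
for _ in range(num_steps):
    remaining *= (1 - pruning_step)
assert abs((1 - remaining) - target_ratio) < 1e-12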
def _save_sensitivities(self, sensitivities, sensitivities_file):
"""
Save sensitivities into file.
"""
with open(sensitivities_file, 'wb') as f:
pickle.dump(sensitivities, f)
def _load_sensitivities(self, sensitivities_file):
"""
Load sensitivities from file.
"""
sensitivities = {}
if sensitivities_file and os.path.exists(sensitivities_file):
with open(sensitivities_file, 'rb') as f:
if sys.version_info < (3, 0):
sensitivities = pickle.load(f)
else:
sensitivities = pickle.load(f, encoding='bytes')
for param in sensitivities:
sensitivities[param]['pruned_percent'] = [
round(p, 2) for p in sensitivities[param]['pruned_percent']
]
self._format_sensitivities(sensitivities)
return sensitivities
def _format_sensitivities(self, sensitivities):
"""
Print formatted sensitivities at the debug log level.
"""
tb = pt.PrettyTable()
tb.field_names = ["parameter", "size"] + [
str(round(i, 2))
for i in np.arange(self.delta_rate, 1, self.delta_rate)
]
for param in sensitivities:
if len(sensitivities[param]['loss']) == (len(tb.field_names) - 2):
tb.add_row([param, sensitivities[param]['size']] + [
round(loss, 2) for loss in sensitivities[param]['loss']
])
_logger.debug('\n################################')
_logger.debug('# sensitivities table #')
_logger.debug('################################\n')
_logger.debug(tb)
def _compute_sensitivities(self, context):
"""
Computing the sensitivities of all parameters.
"""
_logger.info("calling _compute_sensitivities.")
self.param_shape_backup = {}
self.backup = {}
cached_id = np.random.randint(1000)
if self.start_epoch == context.epoch_id:
sensitivities_file = self.sensitivities_file
else:
sensitivities_file = self.sensitivities_file + ".epoch" + str(
context.epoch_id)
sensitivities = self._load_sensitivities(sensitivities_file)
for param in context.eval_graph.all_parameters():
if not re.match(self.pruned_params, param.name()):
continue
if param.name() not in sensitivities:
sensitivities[param.name()] = {
'pruned_percent': [],
'loss': [],
'size': param.shape()[0]
}
metric = None
for param in sensitivities.keys():
ratio = self.delta_rate
while ratio < 1:
ratio = round(ratio, 2)
if ratio in sensitivities[param]['pruned_percent']:
_logger.debug('{}, {} has been computed.'.format(param, ratio))
ratio += self.delta_rate
continue
if metric is None:
metric = self._eval_graph(context, self.eval_rate,
cached_id)
# prune parameter by ratio
self._prune_parameters(
context.eval_graph,
context.scope, [param], [ratio],
context.place,
lazy=True)
self.pruned_list[0]
# get accuracy after pruning and update self.sensitivities
pruned_metric = self._eval_graph(context, self.eval_rate,
cached_id)
loss = metric - pruned_metric
_logger.info("pruned param: {}; {}; loss={}".format(
param, ratio, loss))
for brother in self.pruned_list[0]:
if re.match(self.pruned_params, brother):
if brother not in sensitivities:
sensitivities[brother] = {
'pruned_percent': [],
'loss': []
}
sensitivities[brother]['pruned_percent'].append(ratio)
sensitivities[brother]['loss'].append(loss)
self._save_sensitivities(sensitivities, sensitivities_file)
# restore pruned parameters
for param_name in self.backup.keys():
param_t = context.scope.find_var(param_name).get_tensor()
param_t.set(self.backup[param_name], context.place)
# pruned_metric = self._eval_graph(context)
self.backup = {}
ratio += self.delta_rate
return sensitivities
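# Compressed sketch (not part of the module) of the scan above: prune each
# parameter alone at increasing ratios, record the accuracy drop, then restore
# it. prune/evaluate/restore are hypothetical stand-ins for the graph calls.
def scan_sensitivities(params, delta_rate, prune, evaluate, restore):
    baseline = evaluate()
    sensitivities = {p: {'pruned_percent': [], 'loss': []} for p in params}
    for p in params:
        ratio = delta_rate
        while ratio < 1:
            prune(p, ratio)                      # lazy pruning in the strategy
            sensitivities[p]['pruned_percent'].append(round(ratio, 2))
            sensitivities[p]['loss'].append(baseline - evaluate())
            restore(p)
            ratio += delta_rate
    return sensitivities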
def _get_best_ratios(self, context, sensitivities, target_ratio):
"""
Search a group of ratios for pruning target flops.
"""
_logger.info('_get_best_ratios for pruning ratio: {}'.format(
target_ratio))
self.param_shape_backup = {}
self.backup = {}
def func(params, x):
a, b, c, d = params
return a * x * x * x + b * x * x + c * x + d
def error(params, x, y):
return func(params, x) - y
def slove_coefficient(x, y):
init_coefficient = [10, 10, 10, 10]
coefficient, loss = leastsq(error, init_coefficient, args=(x, y))
return coefficient
min_loss = 0.
max_loss = 0.
# step 1: fit curve by sensitivities
coefficients = {}
for param in sensitivities:
losses = np.array([0] * 5 + sensitivities[param]['loss'])
precents = np.array([0] * 5 + sensitivities[param][
'pruned_percent'])
coefficients[param] = slove_coefficient(precents, losses)
loss = np.max(losses)
max_loss = np.max([max_loss, loss])
# step 2: Find a group of ratios by binary searching.
flops = context.eval_graph.flops()
model_size = context.eval_graph.numel_params()
ratios = []
while min_loss < max_loss:
loss = (max_loss + min_loss) / 2
_logger.info(
'-----------Try pruned ratios while acc loss={:.4f}-----------'.
format(loss))
ratios = []
# step 2.1: Get ratios according to current loss
for param in sensitivities:
coefficient = copy.deepcopy(coefficients[param])
coefficient[-1] = coefficient[-1] - loss
roots = np.roots(coefficient)
for root in roots:
min_root = 1
if np.isreal(root) and root > 0 and root < 1:
selected_root = min(root.real, min_root)
ratios.append(selected_root)
_logger.info('Pruned ratios={}'.format(
[round(ratio, 3) for ratio in ratios]))
# step 2.2: Pruning by current ratios
self._prune_parameters(
context.eval_graph,
context.scope,
sensitivities.keys(),
ratios,
context.place,
only_graph=True)
pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
pruned_size = 1 - (float(context.eval_graph.numel_params()) /
model_size)
_logger.info('Pruned flops: {:.4f}'.format(pruned_flops))
_logger.info('Pruned model size: {:.4f}'.format(pruned_size))
for param in self.param_shape_backup.keys():
context.eval_graph.var(param).set_shape(self.param_shape_backup[
param])
self.param_shape_backup = {}
# step 2.3: Check whether current ratios is enough
if abs(pruned_flops - target_ratio) < 0.015:
break
if pruned_flops > target_ratio:
max_loss = loss
else:
min_loss = loss
return sensitivities.keys(), ratios
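# The curve inversion above in isolation (not part of the module): fit
# loss = f(ratio) with a cubic and, for a candidate loss budget, keep the
# smallest real root in (0, 1). np.polyfit is used here instead of scipy's
# leastsq purely for brevity; the sensitivity numbers are made up.
import numpy as np
ratios = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
losses = np.array([0.0, 0.002, 0.01, 0.03, 0.08, 0.2])
coefficient = np.polyfit(ratios, losses, 3)         # a*x^3 + b*x^2 + c*x + d
def ratio_for_loss(coefficient, loss_budget):
    shifted = coefficient.copy()
    shifted[-1] -= loss_budget                       # solve f(x) = loss_budget
    candidates = [r.real for r in np.roots(shifted)
                  if np.isreal(r) and 0 < r.real < 1]
    return min(candidates) if candidates else None
print(ratio_for_loss(coefficient, 0.05))             # ratio predicted to cost 0.05 acc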
def _current_pruning_target(self, context):
'''
Get the target pruning rate in current epoch.
'''
_logger.info('Left number of pruning steps: {}'.format(self.num_steps))
if self.num_steps <= 0:
return None
if (self.start_epoch == context.epoch_id) or context.eval_converged(
self.metric_name, 0.005):
self.num_steps -= 1
return self.pruning_step
def on_epoch_begin(self, context):
current_ratio = self._current_pruning_target(context)
if current_ratio is not None:
sensitivities = self._compute_sensitivities(context)
params, ratios = self._get_best_ratios(context, sensitivities,
current_ratio)
self._prune_parameters(context.optimize_graph, context.scope,
params, ratios, context.place)
self.param_shape_backup = {}
self.backup = {}
model_size = context.eval_graph.numel_params()
flops = context.eval_graph.flops()
_logger.debug('################################')
_logger.debug('# pruning eval graph #')
_logger.debug('################################')
self._prune_graph(context.eval_graph, context.optimize_graph)
context.optimize_graph.update_groups_of_conv()
context.eval_graph.update_groups_of_conv()
context.optimize_graph.compile() # to update the compiled program
context.eval_graph.compile(
for_parallel=False,
for_test=True) # to update the compiled program
_logger.info(
'------------------finish pruning--------------------------------'
)
_logger.info('Pruned size: {:.3f}'.format(1 - (float(
context.eval_graph.numel_params()) / model_size)))
_logger.info('Pruned flops: {:.3f}'.format(1 - (float(
context.eval_graph.flops()) / flops)))
metric = self._eval_graph(context)
_logger.info('Metric after pruning: {:.2f}'.format(metric))
_logger.info(
'------------------SensitivePruneStrategy.on_epoch_begin finish--------------------------------'
)
...@@ -13,9 +13,10 @@
# limitations under the License.
import numpy as np
import collections
from .... import layers
__all__ = ['Pruner', 'StructurePruner']
class Pruner(object):
...@@ -30,54 +31,77 @@ class Pruner(object):
pass
class StructurePruner(Pruner):
"""
Pruner used to prune parameters by groups.
"""
def __init__(self, pruning_axis, criterions):
"""
Args:
pruning_axis(dict): The key is the name of the parameter to be pruned,
'*' means all the parameters.
The value is the axis to be used. Given a parameter
with shape [3, 4], the result of pruning 50% on axis 1
is a parameter with shape [3, 2].
criterions(dict): The key is the name of the parameter to be pruned,
'*' means all the parameters.
The value is the criterion used to sort groups for pruning.
It only supports 'l1_norm' currently.
"""
self.pruning_axis = pruning_axis
self.criterions = criterions
def cal_pruned_idx(self, name, param, ratio, axis=None):
"""
Calculate the indexes to be pruned on the given axis by the given pruning ratio.
Args:
name(str): The name of the parameter to be pruned.
param(np.array): The data of the parameter to be pruned.
ratio(float): The ratio to be pruned.
axis(int): The axis to be used for pruning the given parameter.
If it is None, the value in self.pruning_axis will be used.
default: None.
Returns:
list<int>: The indexes to be pruned on the axis.
"""
criterion = self.criterions[
name] if name in self.criterions else self.criterions['*']
if axis is None:
assert self.pruning_axis is not None, "pruning_axis should be set if axis is None."
axis = self.pruning_axis[
name] if name in self.pruning_axis else self.pruning_axis['*']
prune_num = int(round(param.shape[axis] * ratio))
reduce_dims = [i for i in range(len(param.shape)) if i != axis]
if criterion == 'l1_norm':
criterions = np.sum(np.abs(param), axis=tuple(reduce_dims))
pruned_idx = criterions.argsort()[:prune_num]
return pruned_idx
def prune_tensor(self, tensor, pruned_idx, pruned_axis, lazy=False):
"""
Prune an array by indexes on the given axis.
Args:
tensor(numpy.array): The target array to be pruned.
pruned_idx(list<int>): The indexes to be pruned.
pruned_axis(int): The axis of the given array to be pruned on.
lazy(bool): True means setting the pruned elements to zero.
False means removing the pruned elements from memory.
default: False.
Returns:
numpy.array: The pruned array.
"""
mask = np.zeros(tensor.shape[pruned_axis], dtype=bool)
mask[pruned_idx] = True
def func(data):
return data[~mask]
def lazy_func(data):
data[mask] = 0
return data
if lazy:
return np.apply_along_axis(lazy_func, pruned_axis, tensor)
else:
return np.apply_along_axis(func, pruned_axis, tensor)
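# Quick standalone check (not part of the module) of the computation above on a
# toy conv weight: rank the 4 filters by L1 norm, drop the weakest 50%, and use
# direct boolean indexing on axis 0 instead of np.apply_along_axis.
import numpy as np
param = np.arange(4 * 2 * 3 * 3, dtype='float32').reshape(4, 2, 3, 3)
ratio, axis = 0.5, 0
l1 = np.sum(np.abs(param), axis=(1, 2, 3))           # per-filter L1 norm
pruned_idx = l1.argsort()[:int(round(param.shape[axis] * ratio))]
mask = np.zeros(param.shape[axis], dtype=bool)
mask[pruned_idx] = True
pruned = param[~mask]                                 # filters 0 and 1 are removed
assert pruned.shape == (2, 2, 3, 3)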
...@@ -22,6 +22,7 @@ from ....framework import IrGraph
from ....framework import IrNode
from ....framework import Program
from ....initializer import Constant
from ....initializer import NumpyArrayInitializer
from .... import unique_name
__all__ = [
...@@ -54,14 +55,15 @@ class QuantizationTransformPass(object):
the bias is not quantized.
activation_bits (int): quantization bit number for activation.
activation_quantize_type (str): quantization type for activation,
now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
If use 'abs_max' mode, the quantization scale will be calculated
dynamically each step in both training and testing period. If use
'range_abs_max', a static quantization scale will be calculated
during training and used in inference.
weight_quantize_type (str): quantization type for weights,
support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max'
usually is not used for weight, since weights are fixed once the
model is well trained.
window_size (int): the window size for 'range_abs_max' quantization.
Examples:
...@@ -84,7 +86,11 @@ class QuantizationTransformPass(object):
self._weight_bits = weight_bits
self._activation_bits = activation_bits
quant_type = [
'abs_max', 'channel_wise_abs_max', 'range_abs_max',
'moving_average_abs_max'
]
assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'."
if activation_quantize_type not in quant_type:
raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be ",
...@@ -93,7 +99,7 @@ class QuantizationTransformPass(object):
if weight_quantize_type not in quant_type:
raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be ",
"'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
str(weight_quantize_type))
self._activation_quantize_type = activation_quantize_type
...@@ -103,6 +109,7 @@ class QuantizationTransformPass(object):
self._need_initialized = collections.OrderedDict()
self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
self._conv_ops = ['conv2d', 'depthwise_conv2d']
self._quantizable_grad_ops = [
'%s_grad' % (op) for op in self._quantizable_ops
]
...@@ -135,10 +142,26 @@ class QuantizationTransformPass(object):
else self._activation_bits
quant_type = self._weight_quantize_type if var_node.name() \
in persistable_vars else self._activation_quantize_type
if quant_type == 'channel_wise_abs_max':
assert var_node.name(
) in persistable_vars, "'channel_wise_abs_max' can only be applied on weights."
if op.name() in self._conv_ops:
quant_var_node, scale_var_node = self._insert_channel_quant_op(
graph, var_node, quant_bits)
dequant_var_node = self._insert_channel_dequant_op(
graph, quant_var_node, [scale_var_node],
[quant_bits])
else:
quant_var_node, scale_var_node = self._insert_quant_op(
graph, var_node, quant_bits, 'abs_max')
dequant_var_node = self._insert_dequant_op(
graph, quant_var_node, scale_var_node,
quant_bits)
else:
quant_var_node, scale_var_node = self._insert_quant_op(
graph, var_node, quant_bits, quant_type)
dequant_var_node = self._insert_dequant_op(
graph, quant_var_node, scale_var_node, quant_bits)
dequantized_vars[var_node.name()] = dequant_var_node
graph.update_input_link(var_node, dequant_var_node, op)
...@@ -244,7 +267,7 @@ class QuantizationTransformPass(object):
scale_var_node = graph.create_var_node(
name=self._quantized_scale_name(var_node.name()),
var_type=var_node.type(),
shape=[1],
var_dtype=var_node.dtype())
quant_op_node = graph.create_op_node(
op_type='fake_quantize_abs_max',
...@@ -384,6 +407,36 @@ class QuantizationTransformPass(object):
return quant_var_node, scale_out_node
def _insert_channel_quant_op(self, graph, var_node, quant_bits):
"""
Insert fake_channel_wise_quantize_abs_max op in the graph.
"""
assert var_node.is_var(), '{} is not a var'.format(var_node.name())
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(var_node.name()),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
scale_var_node = graph.create_var_node(
name=self._quantized_scale_name(var_node.name()),
var_type=var_node.type(),
shape=[var_node.shape()[0]],
var_dtype=var_node.dtype())
quant_op_node = graph.create_op_node(
op_type='fake_channel_wise_quantize_abs_max',
attrs={
'bit_length': quant_bits,
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
},
inputs={'X': var_node},
outputs={'Out': quant_var_node,
'OutScale': scale_var_node})
graph.link_to(var_node, quant_op_node)
graph.link_to(quant_op_node, quant_var_node)
graph.link_to(quant_op_node, scale_var_node)
return quant_var_node, scale_var_node
def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
"""
Insert fake_dequantize_op in the graph.
...@@ -410,6 +463,33 @@ class QuantizationTransformPass(object):
graph.link_to(dequant_op_node, dequant_var_node)
return dequant_var_node
def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes,
quant_bits):
"""
Insert fake_channel_wise_dequantize_max_abs in the graph.
"""
assert var_node.is_var(), '{} is not a var'.format(var_node.name())
dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(var_node.name()),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
dequant_op_node = graph.create_op_node(
op_type='fake_channel_wise_dequantize_max_abs',
attrs={
'quant_bits': quant_bits,
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
},
inputs={'X': var_node,
'Scales': scale_var_nodes},
outputs={'Out': dequant_var_node})
graph.link_to(var_node, dequant_op_node)
for scale_n in scale_var_nodes:
graph.link_to(scale_n, dequant_op_node)
graph.link_to(dequant_op_node, dequant_var_node)
return dequant_var_node
def _quantized_var_name(self, var_name):
"""
Return quantized variable name for the input `var_name`.
...@@ -442,7 +522,7 @@ class QuantizationFreezePass(object):
place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the weight tensors.
weight_bits (int): quantization bit number for weights.
activation_bits (int): quantization bit number for activation.
weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'.
The 'range_abs_max' usually is not used for weight, since weights are fixed once the
model is well trained.
"""
...@@ -463,11 +543,15 @@ class QuantizationFreezePass(object):
self._activation_bits = activation_bits
self._weight_quantize_type = weight_quantize_type
self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
self._conv_ops = ['conv2d', 'depthwise_conv2d']
self._fake_quant_op_names = [
'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
'fake_quantize_moving_average_abs_max',
'fake_channel_wise_quantize_abs_max'
]
self._fake_dequant_op_names = [
'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
]
self._op_input_rename_map = collections.OrderedDict()
self._op_output_rename_map = collections.OrderedDict()
self._var_scale_map = collections.OrderedDict()
...@@ -489,20 +573,27 @@ class QuantizationFreezePass(object):
if self._weight_quantize_type == 'abs_max':
param = self._load_var(input_arg_name)
scale_v = np.max(np.abs(param))
elif self._weight_quantize_type == 'channel_wise_abs_max':
param = self._load_var(input_arg_name)
if len(param.shape) == 4: # conv2d or depthwise_conv2d
scale_v = []
for i in range(param.shape[0]):
scale_v.append(np.max(np.abs(param[i])))
else:
scale_v = np.max(np.abs(param))
else:
scale_v = self._load_var(
op_node.output('OutScale')[0])[0]
self._var_scale_map[input_arg_name] = scale_v
self._remove_fake_quant_and_dequant_op(graph, op_node)
# quantize weight and restore
param_v = self._load_var(input_arg_name)
quantized_param_v = self._quant(param_v, scale_v,
self._weight_bits)
self._restore_var(input_arg_name, quantized_param_v)
else:
scale_v = graph.var_node(op_node.output('OutScale')[0])
self._var_scale_map[input_arg_name] = scale_v
ops = graph.all_op_nodes()
for op_node in ops:
...@@ -514,7 +605,10 @@ class QuantizationFreezePass(object):
for op_node in ops:
op_name = op_node.name()
if op_name in self._quantizable_ops:
if self._weight_quantize_type == 'channel_wise_abs_max' and op_name in self._conv_ops:
self._insert_post_channel_dequant_op(graph, op_node)
else:
self._insert_post_dequant_op(graph, op_node)
for op_node in ops:
# insert dequant_op after fc/conv, need to rename inputs of the followed ops
...@@ -538,9 +632,73 @@ class QuantizationFreezePass(object):
self._op_input_rename_map[k] = self._op_input_rename_map[v]
graph.safe_remove_nodes(op_node)
def _insert_post_channel_dequant_op(self, graph, op_node):
persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
for var_node in op_node.inputs:
name = var_node.name()
if name in self._op_input_rename_map:
old_in = graph.var_node(name)
new_in = graph.var_node(self._op_input_rename_map[name])
new_in.clear_outputs()
graph.update_input_link(old_in, new_in, op_node)
original_var_name = self._original_var_name(name)
scale_v = self._var_scale_map[original_var_name]
if original_var_name in persistable_vars:
assert isinstance(
scale_v,
list), 'The scale of parameter %s is not a list.' % (
original_var_name)
channel_scale = np.array(scale_v)
else:
assert isinstance(scale_v, IrNode)
scale_var_node = self._var_scale_map[original_var_name]
if len(op_node.outputs) != 1:
raise ValueError("Only support one output, but op %s has"
" more than one output." % (op_node.name()))
output_var_node = op_node.outputs[0]
weight_scale_node = graph.create_persistable_node(
name=unique_name.generate('channel_scale'),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=[channel_scale.shape[0]],
var_dtype=output_var_node.dtype())
init_program = Program()
weight_scale_var = init_program.global_block().create_var(
name=weight_scale_node.name(),
shape=weight_scale_node.shape(),
dtype=weight_scale_node.dtype(),
type=weight_scale_node.type(),
lod_level=weight_scale_node.var().lod_level(),
persistable=weight_scale_node.persistable())
initializer = NumpyArrayInitializer(value=channel_scale)
initializer(weight_scale_var, init_program.global_block())
exe = Executor(self._place)
exe.run(program=init_program, scope=self._scope)
dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(output_var_node.name()),
var_type=output_var_node.type(),
shape=output_var_node.shape(),
var_dtype=output_var_node.dtype())
dequant_op_node = graph.create_op_node(
op_type='fake_channel_wise_dequantize_max_abs',
attrs={
'quant_bits': [self._weight_bits, self._activation_bits],
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
},
inputs={
'X': output_var_node,
'Scales': [weight_scale_node, scale_var_node]
},
outputs={'Out': dequant_var_node})
graph.link_to(output_var_node, dequant_op_node)
graph.link_to(scale_var_node, dequant_op_node)
graph.link_to(weight_scale_node, dequant_op_node)
graph.link_to(dequant_op_node, dequant_var_node)
self._op_output_rename_map[output_var_node.name()] = dequant_var_node
return dequant_var_node
def _insert_post_dequant_op(self, graph, op_node):
max_range = None
scale_var_node = None
persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
for var_node in op_node.inputs:
name = var_node.name()
...@@ -637,7 +795,12 @@ class QuantizationFreezePass(object):
or isinstance(v, np.float64)
def _quant(self, x, scale, num_bits):
if isinstance(scale, list):
for i, s in enumerate(scale):
x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
return x
else:
return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
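# Small numpy illustration (not part of the pass) of the two weight
# quantization modes handled above, with 8 bits: one scale for the whole
# tensor versus one scale per output channel ('channel_wise_abs_max').
import numpy as np
w = np.array([[0.5, -1.0], [0.01, 0.02]], dtype='float32')   # 2 output channels
max_v = (1 << 7) - 1                                          # 127
per_tensor = np.round(w / np.max(np.abs(w)) * max_v)
per_channel = np.stack([
    np.round(w[i] / np.max(np.abs(w[i])) * max_v) for i in range(w.shape[0])
])
# per_tensor:  [[  64. -127.] [   1.    3.]]  -> the small channel loses precision
# per_channel: [[  64. -127.] [  64.  127.]]  -> each channel uses the full range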
class ConvertToInt8Pass(object):
...@@ -731,9 +894,13 @@ class TransformForMobilePass(object):
def __init__(self):
self._fake_quant_op_names = [
'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
'fake_quantize_moving_average_abs_max',
'fake_channel_wise_quantize_abs_max'
]
self._fake_dequant_op_names = [
'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
]
def apply(self, graph):
"""
......
version: 1.0
include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"]
pruners:
pruner_1:
class: 'RatioPruner'
ratios:
'conv1_1.w': 0.3
'conv1_2.w': 0.4
'*': 0.9
group_dims:
'*': [1, 2, 3]
criterions:
'*': 'l1-norm'
strategies:
strategy_1:
class: 'SensitivePruneStrategy'
pruner: 'pruner_2'
start_epoch: 0
end_epoch: 10
delta_rate: 0.20
acc_loss_threshold: 0.2
sensitivities:
'conv1_1.w': 0.4
compress_pass:
class: 'CompressPass'
epoch: 100
strategies:
- strategy_1
#start_epoch: The 'on_epoch_begin' function will be called in start_epoch. default: 0.
#end_epoch: The 'on_epoch_end' function will be called in end_epoch. default: 10.
#delta_rate: The delta used to generate ratios when calculating sensitivities.
#target_ratio: The flops ratio to be pruned from current model.
#metric_name: The metric used to evaluate the model.
#pruned_params: The pattern str to match the parameter names to be pruned.
#sensitivities_file: The sensitivities file.
#num_steps: The number of pruning steps.
#eval_rate: The rate of sampled data used to calculate sensitivities.
version: 1.0
pruners:
pruner_1:
class: 'StructurePruner'
pruning_axis:
'*': 0
criterions:
'*': 'l1_norm'
strategies:
sensitive_pruning_strategy:
class: 'SensitivePruneStrategy'
pruner: 'pruner_1'
start_epoch: 0
delta_rate: 0.1
target_ratio: 0.3
num_steps: 1
eval_rate: 0.5
pruned_params: '.*_sep_weights'
sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
metric_name: 'acc_top1'
compressor:
epoch: 120
checkpoint_path: './checkpoints/'
strategies:
- sensitive_pruning_strategy
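# A minimal sketch of consuming a config like the one above through the slim
# ConfigFactory (the same entry points exercised by the factory unit test later
# in this diff); the yaml path here is hypothetical.
from paddle.fluid.contrib.slim.core import ConfigFactory
factory = ConfigFactory('./configs/filter_pruning.yaml')
pruner = factory.instance('pruner_1')                       # StructurePruner
strategy = factory.instance('sensitive_pruning_strategy')   # SensitivePruneStrategy
print(pruner.pruning_axis['*'], strategy.start_epoch)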
version: 1.0
pruners:
pruner_2:
class: 'RatioPruner'
ratios:
'conv1_1.w': 0.5
'conv1_2.w': 0.2
'*': 0.7
group_dims:
'*': [1, 2, 3]
criterions:
'*': 'l1-norm'
version: 1.0
pruners:
pruner_3:
class: 'RatioPruner'
ratios:
'conv1_1.w': 0.5
'conv1_2.w': 0.2
'*': 0.7
group_dims:
'*': [1, 2, 3]
criterions:
'*': 'l1-norm'
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...@@ -11,29 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .compress_pass import CompressPass
from .config import ConfigFactory
__all__ = ['build_compressor']
def build_compressor(place=None,
data_reader=None,
data_feeder=None,
scope=None,
metrics=None,
epoch=None,
config=None):
if config is not None:
factory = ConfigFactory(config)
comp_pass = factory.get_compress_pass()
else:
comp_pass = CompressPass()
comp_pass.place = place
comp_pass.data_reader = data_reader
comp_pass.data_feeder = data_feeder
comp_pass.scope = scope
comp_pass.metrics = metrics
comp_pass.epoch = epoch
return comp_pass
#start_epoch: The 'on_epoch_begin' function will be called in start_epoch. default: 0.
#end_epoch: The 'on_epoch_end' function will be called in end_epoch. default: 10.
#delta_rate: The delta used to generate ratios when calculating sensitivities.
#target_ratio: The flops ratio to be pruned from current model.
#metric_name: The metric used to evaluate the model.
#pruned_params: The pattern str to match the parameter names to be pruned.
#sensitivities_file: The sensitivities file.
#num_steps: The number of pruning steps.
#eval_rate: The rate of sampled data used to calculate sensitivities.
version: 1.0
pruners:
pruner_1:
class: 'StructurePruner'
pruning_axis:
'*': 0
criterions:
'*': 'l1_norm'
strategies:
sensitive_pruning_strategy:
class: 'SensitivePruneStrategy'
pruner: 'pruner_1'
start_epoch: 1
delta_rate: 0.2
target_ratio: 0.08
num_steps: 1
eval_rate: 0.5
pruned_params: 'conv6_sep_weights'
sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
metric_name: 'acc_top1'
compressor:
epoch: 2
checkpoint_path: './checkpoints/'
strategies:
- sensitive_pruning_strategy
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = ['MobileNet']
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class MobileNet():
def __init__(self):
self.params = train_parameters
def net(self, input, class_dim=1000, scale=1.0):
# conv1: 112x112
input = self.conv_bn_layer(
input,
filter_size=3,
channels=3,
num_filters=int(32 * scale),
stride=2,
padding=1,
name="conv1")
# 56x56
input = self.depthwise_separable(
input,
num_filters1=32,
num_filters2=64,
num_groups=32,
stride=1,
scale=scale,
name="conv2_1")
input = self.depthwise_separable(
input,
num_filters1=64,
num_filters2=128,
num_groups=64,
stride=2,
scale=scale,
name="conv2_2")
# 28x28
input = self.depthwise_separable(
input,
num_filters1=128,
num_filters2=128,
num_groups=128,
stride=1,
scale=scale,
name="conv3_1")
input = self.depthwise_separable(
input,
num_filters1=128,
num_filters2=256,
num_groups=128,
stride=2,
scale=scale,
name="conv3_2")
# 14x14
input = self.depthwise_separable(
input,
num_filters1=256,
num_filters2=256,
num_groups=256,
stride=1,
scale=scale,
name="conv4_1")
input = self.depthwise_separable(
input,
num_filters1=256,
num_filters2=512,
num_groups=256,
stride=2,
scale=scale,
name="conv4_2")
# 14x14
for i in range(5):
input = self.depthwise_separable(
input,
num_filters1=512,
num_filters2=512,
num_groups=512,
stride=1,
scale=scale,
name="conv5" + "_" + str(i + 1))
# 7x7
input = self.depthwise_separable(
input,
num_filters1=512,
num_filters2=1024,
num_groups=512,
stride=2,
scale=scale,
name="conv5_6")
input = self.depthwise_separable(
input,
num_filters1=1024,
num_filters2=1024,
num_groups=1024,
stride=1,
scale=scale,
name="conv6")
input = fluid.layers.pool2d(
input=input,
pool_size=0,
pool_stride=1,
pool_type='avg',
global_pooling=True)
output = fluid.layers.fc(input=input,
size=class_dim,
act='softmax',
param_attr=ParamAttr(
initializer=MSRA(), name="fc7_weights"),
bias_attr=ParamAttr(name="fc7_offset"))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
act='relu',
use_cudnn=True,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=name + "_weights"),
bias_attr=False)
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def depthwise_separable(self,
input,
num_filters1,
num_filters2,
num_groups,
stride,
scale,
name=None):
depthwise_conv = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=int(num_filters1 * scale),
stride=stride,
padding=1,
num_groups=int(num_groups * scale),
use_cudnn=False,
name=name + "_dw")
pointwise_conv = self.conv_bn_layer(
input=depthwise_conv,
filter_size=1,
num_filters=int(num_filters2 * scale),
stride=1,
padding=0,
name=name + "_sep")
return pointwise_conv
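# Minimal usage sketch of the network above (assumed MNIST-sized single-channel
# input, matching the compression test further down in this diff).
import paddle.fluid as fluid
image = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = MobileNet().net(input=image, class_dim=10)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)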
...@@ -12,29 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.contrib.slim.core import ConfigFactory
import unittest
class TestFactory(unittest.TestCase):
def test_parse_pruning(self):
factory = ConfigFactory('./configs/filter_pruning.yaml')
pruner_1 = factory.instance('pruner_1')
self.assertEquals(pruner_1.pruning_axis['*'], 0)
self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
strategy = factory.instance('sensitive_pruning_strategy')
pruner_1 = strategy.pruner
self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
self.assertEquals(strategy.start_epoch, 0)
self.assertEquals(strategy.sensitivities_file,
'mobilenet_acc_top1_sensitive.data')
if __name__ == '__main__':
......
# copyright (c) 2019 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import paddle
import unittest
import paddle.fluid as fluid
from filter_pruning.mobilenet import MobileNet
from paddle.fluid.contrib.slim.core import Compressor
from paddle.fluid.contrib.slim.graph import GraphWrapper
class TestFilterPruning(unittest.TestCase):
def test_compression(self):
"""
Model: mobilenet_v1
data: mnist
step1: Training one epoch
step2: pruning flops
step3: fine-tune one epoch
step4: check top1_acc.
"""
if not fluid.core.is_compiled_with_cuda():
return
class_dim = 10
image_shape = [1, 28, 28]
image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
image.stop_gradient = False
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = MobileNet().net(input=image, class_dim=class_dim)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
val_program = fluid.default_main_program().clone(for_test=False)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Momentum(
momentum=0.9,
learning_rate=0.01,
regularization=fluid.regularizer.L2Decay(4e-5))
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
val_feed_list = [('img', image.name), ('label', label.name)]
val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
acc_top5.name)]
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128)
train_feed_list = [('img', image.name), ('label', label.name)]
train_fetch_list = [('loss', avg_cost.name)]
com_pass = Compressor(
place,
fluid.global_scope(),
fluid.default_main_program(),
train_reader=train_reader,
train_feed_list=train_feed_list,
train_fetch_list=train_fetch_list,
eval_program=val_program,
eval_reader=val_reader,
eval_feed_list=val_feed_list,
eval_fetch_list=val_fetch_list,
train_optimizer=optimizer)
com_pass.config('./filter_pruning/compress.yaml')
eval_graph = com_pass.run()
self.assertTrue(
abs((com_pass.context.eval_results['acc_top1'][-1] - 0.969) / 0.969)
< 0.02)
if __name__ == '__main__':
unittest.main()
# copyright (c) 2019 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import six
import numpy as np
from paddle.fluid.contrib.slim.graph import GraphWrapper
from paddle.fluid import core
def residual_block(num):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act)
data = fluid.layers.data(name='image', shape=[1, 8, 8], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
data.stop_gradient = False
hidden = data
for _ in six.moves.xrange(num):
conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
fc = fluid.layers.fc(input=hidden, size=10)
loss = fluid.layers.cross_entropy(input=fc, label=label)
loss = fluid.layers.mean(loss)
return data, label, loss
class TestGraphWrapper(unittest.TestCase):
def build_program(self):
place = fluid.CPUPlace()
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
image, label, self.loss = residual_block(2)
eval_program = main.clone()
opt = fluid.optimizer.SGD(learning_rate=0.001)
opt.minimize(self.loss)
self.scope = core.Scope()
exe = fluid.Executor(place)
exe.run(startup, scope=self.scope)
self.eval_graph = GraphWrapper(
program=eval_program,
in_nodes={'image': image.name,
'label': label.name},
out_nodes={'loss': self.loss.name})
self.train_graph = GraphWrapper(
program=main,
in_nodes={'image': image.name,
'label': label.name},
out_nodes={'loss': self.loss.name})
def test_all_parameters(self):
self.build_program()
self.assertEquals(len(self.train_graph.all_parameters()), 24)
def test_all_vars(self):
self.build_program()
self.assertEquals(len(self.train_graph.vars()), 90)
def test_numel_params(self):
self.build_program()
self.assertEquals(self.train_graph.numel_params(), 13258)
def test_compile(self):
self.build_program()
place = fluid.CPUPlace()
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
self.train_graph.compile()
exe.run(self.train_graph.compiled_graph,
scope=self.scope,
feed={
'image':
np.random.randint(0, 40, [16, 1, 8, 8]).astype('float32'),
'label': np.random.randint(0, 10, [16, 1]).astype('int64')
})
def test_pre_and_next_ops(self):
self.build_program()
for op in self.train_graph.ops():
for next_op in self.train_graph.next_ops(op):
self.assertTrue(op in self.train_graph.pre_ops(next_op))
def test_get_optimize_graph(self):
self.build_program()
place = fluid.CPUPlace()
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
opt = fluid.optimizer.SGD(learning_rate=0.001)
train_graph = self.eval_graph.get_optimize_graph(
opt, place, self.scope, no_grad_var_names=['image'])
self.assertEquals(len(self.train_graph.ops()), len(train_graph.ops()))
exe = fluid.Executor(place)
train_graph.compile()
image = np.random.randint(0, 225, [16, 1, 8, 8]).astype('float32')
label = np.random.randint(0, 10, [16, 1]).astype('int64')
exe.run(train_graph.compiled_graph,
scope=self.scope,
feed={'image': image,
'label': label})
def test_flops(self):
self.build_program()
self.assertEquals(self.train_graph.flops(), 354624)
if __name__ == '__main__':
unittest.main()
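A compact sketch of querying a wrapped program outside the unittest harness above, reusing the residual_block builder from this file; it only touches GraphWrapper query methods the test already exercises (all_parameters, numel_params, flops), so treat it as illustrative rather than an additional test.
import paddle.fluid as fluid
from paddle.fluid.contrib.slim.graph import GraphWrapper

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    image, label, loss = residual_block(2)  # builder defined earlier in this file

graph = GraphWrapper(
    program=main,
    in_nodes={'image': image.name,
              'label': label.name},
    out_nodes={'loss': loss.name})
# Quick structural statistics of the wrapped graph.
print(len(graph.all_parameters()), graph.numel_params(), graph.flops())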
...@@ -127,7 +127,7 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -127,7 +127,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
arg_name.endswith('.quantized.dequantized')) arg_name.endswith('.quantized.dequantized'))
self.assertTrue(arg_name in quantized_ops) self.assertTrue(arg_name in quantized_ops)
def linear_fc_quant(self, quant_type, for_ci=False): def linear_fc_quant(self, activation_quant_type, for_ci=False):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
...@@ -140,14 +140,15 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -140,14 +140,15 @@ class TestQuantizationTransformPass(unittest.TestCase):
transform_pass = QuantizationTransformPass( transform_pass = QuantizationTransformPass(
scope=fluid.global_scope(), scope=fluid.global_scope(),
place=place, place=place,
activation_quantize_type=quant_type) activation_quantize_type=activation_quant_type)
transform_pass.apply(graph) transform_pass.apply(graph)
if not for_ci: if not for_ci:
marked_nodes = set() marked_nodes = set()
for op in graph.all_op_nodes(): for op in graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
marked_nodes.add(op) marked_nodes.add(op)
graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes) graph.draw('.', 'quantize_fc_' + activation_quant_type,
marked_nodes)
program = graph.to_program() program = graph.to_program()
self.check_program(transform_pass, program) self.check_program(transform_pass, program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False) val_graph = IrGraph(core.Graph(program.desc), for_test=False)
...@@ -156,7 +157,8 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -156,7 +157,8 @@ class TestQuantizationTransformPass(unittest.TestCase):
for op in val_graph.all_op_nodes(): for op in val_graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
val_marked_nodes.add(op) val_marked_nodes.add(op)
val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes) val_graph.draw('.', 'val_fc_' + activation_quant_type,
val_marked_nodes)
def test_linear_fc_quant_abs_max(self): def test_linear_fc_quant_abs_max(self):
self.linear_fc_quant('abs_max', for_ci=True) self.linear_fc_quant('abs_max', for_ci=True)
...@@ -167,7 +169,7 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -167,7 +169,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
def test_linear_fc_quant_moving_average_abs_max(self): def test_linear_fc_quant_moving_average_abs_max(self):
self.linear_fc_quant('moving_average_abs_max', for_ci=True) self.linear_fc_quant('moving_average_abs_max', for_ci=True)
def residual_block_quant(self, quant_type, for_ci=False): def residual_block_quant(self, activation_quant_type, for_ci=False):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
...@@ -180,14 +182,15 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -180,14 +182,15 @@ class TestQuantizationTransformPass(unittest.TestCase):
transform_pass = QuantizationTransformPass( transform_pass = QuantizationTransformPass(
scope=fluid.global_scope(), scope=fluid.global_scope(),
place=place, place=place,
activation_quantize_type=quant_type) activation_quantize_type=activation_quant_type)
transform_pass.apply(graph) transform_pass.apply(graph)
if not for_ci: if not for_ci:
marked_nodes = set() marked_nodes = set()
for op in graph.all_op_nodes(): for op in graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
marked_nodes.add(op) marked_nodes.add(op)
graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes) graph.draw('.', 'quantize_residual_' + activation_quant_type,
marked_nodes)
program = graph.to_program() program = graph.to_program()
self.check_program(transform_pass, program) self.check_program(transform_pass, program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False) val_graph = IrGraph(core.Graph(program.desc), for_test=False)
...@@ -196,7 +199,8 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -196,7 +199,8 @@ class TestQuantizationTransformPass(unittest.TestCase):
for op in val_graph.all_op_nodes(): for op in val_graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
val_marked_nodes.add(op) val_marked_nodes.add(op)
val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes) val_graph.draw('.', 'val_residual_' + activation_quant_type,
val_marked_nodes)
def test_residual_block_abs_max(self): def test_residual_block_abs_max(self):
self.residual_block_quant('abs_max', for_ci=True) self.residual_block_quant('abs_max', for_ci=True)
...@@ -209,7 +213,12 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -209,7 +213,12 @@ class TestQuantizationTransformPass(unittest.TestCase):
class TestQuantizationFreezePass(unittest.TestCase): class TestQuantizationFreezePass(unittest.TestCase):
def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False): def freeze_graph(self,
use_cuda,
seed,
activation_quant_type,
weight_quant_type='abs_max',
for_ci=False):
def build_program(main, startup, is_test): def build_program(main, startup, is_test):
main.random_seed = seed main.random_seed = seed
startup.random_seed = seed startup.random_seed = seed
...@@ -243,7 +252,12 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -243,7 +252,12 @@ class TestQuantizationFreezePass(unittest.TestCase):
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
exe.run(startup) exe.run(startup)
transform_pass = QuantizationTransformPass( transform_pass = QuantizationTransformPass(
scope=scope, place=place, activation_quantize_type=quant_type) scope=scope,
place=place,
activation_quantize_type=activation_quant_type,
weight_quantize_type=weight_quant_type)
#transform_pass = QuantizationTransformPass(
# scope=scope, place=place, activation_quantize_type=activation_quant_type)
transform_pass.apply(main_graph) transform_pass.apply(main_graph)
transform_pass.apply(test_graph) transform_pass.apply(test_graph)
dev_name = '_gpu_' if use_cuda else '_cpu_' dev_name = '_gpu_' if use_cuda else '_cpu_'
...@@ -252,12 +266,14 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -252,12 +266,14 @@ class TestQuantizationFreezePass(unittest.TestCase):
for op in main_graph.all_op_nodes(): for op in main_graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
marked_nodes.add(op) marked_nodes.add(op)
main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes) main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_'
+ weight_quant_type, marked_nodes)
marked_nodes = set() marked_nodes = set()
for op in test_graph.all_op_nodes(): for op in test_graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
marked_nodes.add(op) marked_nodes.add(op)
test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes) test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_'
+ weight_quant_type, marked_nodes)
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False build_strategy.memory_optimize = False
...@@ -282,8 +298,9 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -282,8 +298,9 @@ class TestQuantizationFreezePass(unittest.TestCase):
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[loss]) fetch_list=[loss])
if not for_ci: if not for_ci:
print('{}: {}'.format('loss' + dev_name + quant_type, print('{}: {}'.format('loss' + dev_name +
loss_v)) activation_quant_type + '_' +
weight_quant_type, loss_v))
test_data = next(test_reader()) test_data = next(test_reader())
with fluid.program_guard(quantized_test_program): with fluid.program_guard(quantized_test_program):
...@@ -296,14 +313,17 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -296,14 +313,17 @@ class TestQuantizationFreezePass(unittest.TestCase):
fetch_list=[loss, w_var]) fetch_list=[loss, w_var])
# Freeze graph for inference, but the weight of fc/conv is still float type. # Freeze graph for inference, but the weight of fc/conv is still float type.
freeze_pass = QuantizationFreezePass(scope=scope, place=place) freeze_pass = QuantizationFreezePass(
scope=scope, place=place, weight_quantize_type=weight_quant_type)
#freeze_pass = QuantizationFreezePass(scope=scope, place=place)
freeze_pass.apply(test_graph) freeze_pass.apply(test_graph)
if not for_ci: if not for_ci:
marked_nodes = set() marked_nodes = set()
for op in test_graph.all_op_nodes(): for op in test_graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
marked_nodes.add(op) marked_nodes.add(op)
test_graph.draw('.', 'test_freeze' + dev_name + quant_type, test_graph.draw('.', 'test_freeze' + dev_name +
activation_quant_type + '_' + weight_quant_type,
marked_nodes) marked_nodes)
server_program = test_graph.to_program() server_program = test_graph.to_program()
...@@ -313,18 +333,20 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -313,18 +333,20 @@ class TestQuantizationFreezePass(unittest.TestCase):
fetch_list=[loss]) fetch_list=[loss])
self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3) self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
if not for_ci: if not for_ci:
print('{}: {}'.format('test_loss1' + dev_name + quant_type, print(
test_loss1)) '{}: {}'.format('test_loss1' + dev_name + activation_quant_type
print('{}: {}'.format('test_loss2' + dev_name + quant_type, + '_' + weight_quant_type, test_loss1))
test_loss2)) print(
'{}: {}'.format('test_loss2' + dev_name + activation_quant_type
+ '_' + weight_quant_type, test_loss2))
w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor()) w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
# Maybe failed, this is due to the calculation precision # Maybe failed, this is due to the calculation precision
# self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant)) # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
if not for_ci: if not for_ci:
print('{}: {}'.format('w_freeze' + dev_name + quant_type, print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
np.sum(w_freeze))) + '_' + weight_quant_type, np.sum(w_freeze)))
print('{}: {}'.format('w_quant' + dev_name + quant_type, print('{}: {}'.format('w_quant' + dev_name + activation_quant_type +
np.sum(w_quant))) '_' + weight_quant_type, np.sum(w_quant)))
# Convert parameter to 8-bit. # Convert parameter to 8-bit.
convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place) convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
...@@ -334,26 +356,28 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -334,26 +356,28 @@ class TestQuantizationFreezePass(unittest.TestCase):
for op in test_graph.all_op_nodes(): for op in test_graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
marked_nodes.add(op) marked_nodes.add(op)
test_graph.draw('.', 'test_int8' + dev_name + quant_type, test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type
marked_nodes) + '_' + weight_quant_type, marked_nodes)
server_program_int8 = test_graph.to_program() server_program_int8 = test_graph.to_program()
# Save the 8-bit parameter and model file. # Save the 8-bit parameter and model file.
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
fluid.io.save_inference_model('server_int8' + dev_name + quant_type, fluid.io.save_inference_model(
['image', 'label'], [loss], exe, 'server_int8' + dev_name + activation_quant_type + '_' +
server_program_int8) weight_quant_type, ['image', 'label'], [loss], exe,
server_program_int8)
# Test whether the 8-bit parameter and model file can be loaded successfully. # Test whether the 8-bit parameter and model file can be loaded successfully.
[infer, feed, fetch] = fluid.io.load_inference_model( [infer, feed, fetch] = fluid.io.load_inference_model(
'server_int8' + dev_name + quant_type, exe) 'server_int8' + dev_name + activation_quant_type + '_' +
weight_quant_type, exe)
# Check the loaded 8-bit weight. # Check the loaded 8-bit weight.
w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor()) w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
self.assertEqual(w_8bit.dtype, np.int8) self.assertEqual(w_8bit.dtype, np.int8)
self.assertEqual(np.sum(w_8bit), np.sum(w_freeze)) self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
if not for_ci: if not for_ci:
print('{}: {}'.format('w_8bit' + dev_name + quant_type, print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type +
np.sum(w_8bit))) '_' + weight_quant_type, np.sum(w_8bit)))
print('{}: {}'.format('w_freeze' + dev_name + quant_type, print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
np.sum(w_freeze))) + '_' + weight_quant_type, np.sum(w_freeze)))
mobile_pass = TransformForMobilePass() mobile_pass = TransformForMobilePass()
mobile_pass.apply(test_graph) mobile_pass.apply(test_graph)
...@@ -362,42 +386,103 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -362,42 +386,103 @@ class TestQuantizationFreezePass(unittest.TestCase):
for op in test_graph.all_op_nodes(): for op in test_graph.all_op_nodes():
if op.name().find('quantize') > -1: if op.name().find('quantize') > -1:
marked_nodes.add(op) marked_nodes.add(op)
test_graph.draw('.', 'test_mobile' + dev_name + quant_type, test_graph.draw('.', 'test_mobile' + dev_name +
activation_quant_type + '_' + weight_quant_type,
marked_nodes) marked_nodes)
mobile_program = test_graph.to_program() mobile_program = test_graph.to_program()
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type, fluid.io.save_inference_model(
['image', 'label'], [loss], exe, 'mobile_int8' + dev_name + activation_quant_type + '_' +
mobile_program) weight_quant_type, ['image', 'label'], [loss], exe,
mobile_program)
def test_freeze_graph_cuda_dynamic(self): def test_freeze_graph_cuda_dynamic(self):
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph( self.freeze_graph(
True, seed=1, quant_type='abs_max', for_ci=True) True,
seed=1,
activation_quant_type='abs_max',
weight_quant_type='abs_max',
for_ci=True)
with fluid.unique_name.guard():
self.freeze_graph(
True,
seed=1,
activation_quant_type='abs_max',
weight_quant_type='channel_wise_abs_max',
for_ci=True)
def test_freeze_graph_cpu_dynamic(self): def test_freeze_graph_cpu_dynamic(self):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph(False, seed=2, quant_type='abs_max', for_ci=True) self.freeze_graph(
False,
seed=2,
activation_quant_type='abs_max',
weight_quant_type='abs_max',
for_ci=True)
self.freeze_graph(
False,
seed=2,
activation_quant_type='abs_max',
weight_quant_type='channel_wise_abs_max',
for_ci=True)
def test_freeze_graph_cuda_static(self): def test_freeze_graph_cuda_static(self):
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph( self.freeze_graph(
True, seed=1, quant_type='range_abs_max', for_ci=True) True,
seed=1,
activation_quant_type='range_abs_max',
weight_quant_type='abs_max',
for_ci=True)
self.freeze_graph(
True,
seed=1,
activation_quant_type='moving_average_abs_max',
weight_quant_type='abs_max',
for_ci=True)
self.freeze_graph( self.freeze_graph(
True, True,
seed=1, seed=1,
quant_type='moving_average_abs_max', activation_quant_type='range_abs_max',
weight_quant_type='channel_wise_abs_max',
for_ci=True)
self.freeze_graph(
True,
seed=1,
activation_quant_type='moving_average_abs_max',
weight_quant_type='channel_wise_abs_max',
for_ci=True) for_ci=True)
def test_freeze_graph_cpu_static(self): def test_freeze_graph_cpu_static(self):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph( self.freeze_graph(
False, seed=2, quant_type='range_abs_max', for_ci=True) False,
seed=2,
activation_quant_type='range_abs_max',
weight_quant_type='abs_max',
for_ci=True)
self.freeze_graph(
False,
seed=2,
activation_quant_type='moving_average_abs_max',
weight_quant_type='abs_max',
for_ci=True)
self.freeze_graph(
False,
seed=2,
activation_quant_type='range_abs_max',
weight_quant_type='channel_wise_abs_max',
for_ci=True)
self.freeze_graph( self.freeze_graph(
False, seed=2, quant_type='moving_average_abs_max', for_ci=True) False,
seed=2,
activation_quant_type='moving_average_abs_max',
weight_quant_type='channel_wise_abs_max',
for_ci=True)
if __name__ == '__main__': if __name__ == '__main__':
......
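A condensed sketch of the configuration surface this diff adds, assuming `scope`, `place`, and IrGraph objects `main_graph`/`test_graph` already exist as in the test above; both passes now accept the weight quantization type ('abs_max' or 'channel_wise_abs_max') alongside the activation type.
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass

transform_pass = QuantizationTransformPass(
    scope=scope,
    place=place,
    activation_quantize_type='moving_average_abs_max',
    weight_quantize_type='channel_wise_abs_max')
transform_pass.apply(main_graph)
transform_pass.apply(test_graph)

# After training, freeze the test graph with the matching weight type.
freeze_pass = QuantizationFreezePass(
    scope=scope, place=place, weight_quantize_type='channel_wise_abs_max')
freeze_pass.apply(test_graph)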
...@@ -290,7 +290,7 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -290,7 +290,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
self.model, self.infer_iterations) self.model, self.infer_iterations)
(int8_throughput, int8_latency, (int8_throughput, int8_latency,
int8_acc1) = self.run_program("calibration_out") int8_acc1) = self.run_program("calibration_out")
delta_value = np.abs(fp32_acc1 - int8_acc1) delta_value = fp32_acc1 - int8_acc1
self.assertLess(delta_value, 0.01) self.assertLess(delta_value, 0.01)
print( print(
"FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}". "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
......
...@@ -644,10 +644,9 @@ class Operator(object): ...@@ -644,10 +644,9 @@ class Operator(object):
outputs={"Out": [var1]}) outputs={"Out": [var1]})
""" """
OP_WITHOUT_KERNEL_SET = { OP_WITHOUT_KERNEL_SET = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select', 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
'checkpoint_notify', 'gen_nccl_id'
} }
def __init__(self, def __init__(self,
......
...@@ -29,9 +29,13 @@ from .tracer import * ...@@ -29,9 +29,13 @@ from .tracer import *
from . import profiler from . import profiler
from .profiler import * from .profiler import *
from . import checkpoint
from .checkpoint import *
__all__ = [] __all__ = []
__all__ += layers.__all__ __all__ += layers.__all__
__all__ += base.__all__ __all__ += base.__all__
__all__ += nn.__all__ __all__ += nn.__all__
__all__ += tracer.__all__ __all__ += tracer.__all__
__all__ += profiler.__all__ __all__ += profiler.__all__
__all__ += checkpoint.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import collections
from .. import core
from ..framework import Variable, default_main_program
__all__ = ['save_persistables', 'load_persistables']
def save_persistables(vardict, dirname, filename=None):
"""
This function saves the persistable variables collected in `vardict` (for
example, a layer's `state_dict()`) to the folder `dirname` or the file
`filename`.
Use `dirname` to specify the folder where the variables will be saved. To
save variables in separate files, set `filename` to None; to save all
variables in a single file, use `filename` to specify the file name.
Args:
vardict(dict of Parameters): The parameters to be saved. If it is None,
nothing will be saved.
dirname(str): The directory path.
filename(str|None): The file in which to save all variables. If variables
should be saved in separate files, set it to None.
Default: None
Returns:
Examples:
.. code-block:: python
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
param_path = "./my_paddle_model"
fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(),
dirname=param_path)
"""
if isinstance(vardict, collections.OrderedDict):
_save_var_to_file(vardict, dirname, filename)
def load_persistables(vardict, dirname, filename=None):
"""
This function tries to load persistable variables from the folder
`dirname` or the file `filename`.
Use the `dirname` to specify the folder where persistable variables were
saved. If variables were saved in separate files, set `filename` None;
if all variables were saved in a single file, use `filename` to specify
the file name.
Args:
vardict(dict of Parameters): The parameters will be loaded.
dirname(str): The directory path.
filename(str|None): The file that saved all variables; this file path should end with '.npz'. If variables were
saved in different files, set it to None.
Default: None
Returns:
dict: The parameter-dict resumed from file
Examples:
.. code-block:: python
my_layer = layer(fluid.imperative.Layer)
param_path = "./my_paddle_model"
param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path)
param_1 = param_dict['PtbModel_0.w_1']
or:
my_layer = layer(fluid.imperative.Layer)
param_path = "./my_paddle_model"
filename = "model.file"
param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path,
filename=filename)
param_1 = param_dict['PtbModel_0.w_1']
"""
if isinstance(vardict, collections.OrderedDict):
return _load_var_from_file(vardict, dirname, filename)
return {}
def _save_var_to_file(stat_dict, file_dir, file_name):
save_block = default_main_program().global_block()
save_var_map = {}
for name, each_var in stat_dict.items():
save_var_map[each_var.name] = each_var
if file_name is None:
save_block.append_op(
type='save',
inputs={'X': [each_var]},
outputs={},
attrs={'file_path': os.path.join(file_dir, each_var.name)})
if file_name is not None:
save_var_list = []
for name in sorted(save_var_map.keys()):
save_var_list.append(save_var_map[name])
save_block.append_op(
type='save_combine',
inputs={'X': save_var_list},
outputs={},
attrs={'file_path': os.path.join(file_dir, file_name)})
def _load_var_from_file(stat_dict, file_dir, file_name):
load_block = default_main_program().global_block()
load_var_map = {}
for name, each_var in stat_dict.items():
assert isinstance(each_var, Variable)
if each_var.type == core.VarDesc.VarType.RAW:
continue
new_var = _clone_var_in_block_(load_block, each_var)
if file_name is None:
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [new_var]},
attrs={'file_path': os.path.join(file_dir, each_var.name)})
load_var_map[new_var.name] = new_var
if file_name is not None:
load_var_list = []
for name in sorted(load_var_map.keys()):
load_var_list.append(load_var_map[name])
load_block.append_op(
type='load_combine',
inputs={},
outputs={"Out": load_var_list},
attrs={'file_path': os.path.join(file_dir, file_name)})
for res_var in load_var_list:
load_var_map[res_var.name] = res_var
return load_var_map
def _clone_var_in_block_(block, var):
assert isinstance(var, Variable)
return block.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=True)
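A minimal round-trip sketch of the checkpoint helpers above, assuming a hypothetical fluid.imperative.Layer subclass named MyModel whose parameters have already been initialized inside an imperative guard; it only exercises the documented save_persistables/load_persistables entry points plus the load_dict method added below.
import paddle.fluid as fluid

model = MyModel()  # hypothetical fluid.imperative.Layer subclass, already trained
param_path = "./my_paddle_model"

# Save each persistable parameter to its own file under param_path.
fluid.imperative.checkpoint.save_persistables(
    model.state_dict(), dirname=param_path)

# Restore: load the variables back and push them into the layer.
restored = fluid.imperative.checkpoint.load_persistables(
    model.state_dict(), dirname=param_path)
model.load_dict(restored)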
...@@ -212,6 +212,34 @@ class Layer(core.Layer): ...@@ -212,6 +212,34 @@ class Layer(core.Layer):
else: else:
object.__delattr__(self, name) object.__delattr__(self, name)
def state_dict(self, destination=None, prefix='', include_sublayers=True):
if destination is None:
destination = collections.OrderedDict()
for name, data in self._parameters.items():
if data is not None:
destination[prefix + name] = data
if include_sublayers:
for layer_name, layer_item in self._sub_layers.items():
if layer_item is not None:
destination_temp = destination.copy()
destination_temp.update(
layer_item.state_dict(destination_temp, prefix +
layer_name + ".",
include_sublayers))
destination = destination_temp
return destination
def load_dict(self, stat_dict, include_sublayers=True):
for name, item in self.__dict__.get('_parameters', None).items():
if item.name in stat_dict:
self.__setattr__(name, stat_dict[item.name])
if include_sublayers:
for layer_name, layer_item in self._sub_layers.items():
if layer_item is not None:
layer_item.load_dict(stat_dict)
class PyLayer(core.PyLayer): class PyLayer(core.PyLayer):
"""Layers composed of user-defined python codes.""" """Layers composed of user-defined python codes."""
......
...@@ -896,7 +896,7 @@ def save_inference_model(dirname, ...@@ -896,7 +896,7 @@ def save_inference_model(dirname,
True is supported. True is supported.
Returns: Returns:
None target_var_name_list(list): The fetch variables' name list
Raises: Raises:
ValueError: If `feed_var_names` is not a list of basestring. ValueError: If `feed_var_names` is not a list of basestring.
...@@ -949,11 +949,13 @@ def save_inference_model(dirname, ...@@ -949,11 +949,13 @@ def save_inference_model(dirname,
# TODO(Superjomn) add an IR pass to remove 1-scale op. # TODO(Superjomn) add an IR pass to remove 1-scale op.
with program_guard(main_program): with program_guard(main_program):
uniq_target_vars = [] uniq_target_vars = []
for var in target_vars: for i, var in enumerate(target_vars):
if isinstance(var, Variable): if isinstance(var, Variable):
var1 = layers.scale(var, 1.) var = layers.scale(
uniq_target_vars.append(var1) var, 1., name="save_infer_model/scale_{}".format(i))
uniq_target_vars.append(var)
target_vars = uniq_target_vars target_vars = uniq_target_vars
target_var_name_list = [var.name for var in target_vars]
# when a pserver and a trainer running on the same machine, mkdir may conflict # when a pserver and a trainer running on the same machine, mkdir may conflict
try: try:
...@@ -1010,6 +1012,7 @@ def save_inference_model(dirname, ...@@ -1010,6 +1012,7 @@ def save_inference_model(dirname,
params_filename = os.path.basename(params_filename) params_filename = os.path.basename(params_filename)
save_persistables(executor, dirname, main_program, params_filename) save_persistables(executor, dirname, main_program, params_filename)
return target_var_name_list
def load_inference_model(dirname, def load_inference_model(dirname,
......
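A hedged sketch of picking up the changed return value shown in this hunk, assuming `out` is the network output variable and `exe` is an Executor that has already run the startup program; the directory and feed name are placeholders.
import paddle.fluid as fluid

# save_inference_model now returns the (renamed) fetch variables' names,
# e.g. "save_infer_model/scale_0", which can be recorded for later use
# with load_inference_model.
fetch_names = fluid.io.save_inference_model(
    dirname='./infer_model',
    feeded_var_names=['image'],
    target_vars=[out],
    executor=exe)
print(fetch_names)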
...@@ -33,6 +33,7 @@ from .detection import * ...@@ -33,6 +33,7 @@ from .detection import *
from . import metric_op from . import metric_op
from .metric_op import * from .metric_op import *
from .learning_rate_scheduler import * from .learning_rate_scheduler import *
from .collective import *
__all__ = [] __all__ = []
__all__ += nn.__all__ __all__ += nn.__all__
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from ..layer_helper import LayerHelper, unique_name
def _allreduce(x, out=None, reduce_type="sum"):
helper = LayerHelper("allreduce", **locals())
# Convert string reduce type to op int type
red_typ_int = 0
if reduce_type == "sum":
red_typ_int = 0
elif reduce_type == "prod":
red_typ_int = 1
elif reduce_type == "max":
red_typ_int = 2
elif reduce_type == "min":
red_typ_int = 3
else:
raise TypeError("reduce type can only be [sum|prod|max|min]")
if out is None:
out = helper.create_variable(
name=unique_name.generate(".".join([x.name, 'tmp'])),
shape=x.shape,
dtype=x.dtype,
type=x.type,
persistable=x.persistable,
stop_gradient=True)
helper.append_op(
type='allreduce',
inputs={'X': [x]},
outputs={'Out': [out]},
attrs={"reduce_type": red_typ_int})
return out
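A minimal sketch of wiring the helper above into a layer-level data-parallel optimizer step, mirroring what dist_allreduce_op.py does later in this change; `avg_cost` and `opt` are assumed to come from an ordinary model definition.
import paddle.fluid as fluid

params_grads = opt.backward(avg_cost)
allreduced_grads = []
for p, g in params_grads:
    # Sum each gradient across ranks; dividing by nranks is handled via the
    # loss scale elsewhere (see the note in dist_allreduce_op.py below).
    allreduced_grads.append([p, fluid.layers.collective._allreduce(g)])
opt.apply_gradients(allreduced_grads)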
...@@ -28,21 +28,9 @@ import six ...@@ -28,21 +28,9 @@ import six
from functools import reduce from functools import reduce
__all__ = [ __all__ = [
'While', 'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than',
'Switch', 'equal', 'array_read', 'array_length', 'IfElse', 'DynamicRNN', 'StaticRNN',
'increment', 'reorder_lod_tensor_by_rank', 'Print', 'is_empty'
'array_write',
'create_array',
'less_than',
'equal',
'array_read',
'array_length',
'IfElse',
'DynamicRNN',
'StaticRNN',
'reorder_lod_tensor_by_rank',
'Print',
'is_empty',
] ]
...@@ -1448,12 +1436,13 @@ class DynamicRNN(object): ...@@ -1448,12 +1436,13 @@ class DynamicRNN(object):
self.input_array = [] self.input_array = []
self.mem_link = [] self.mem_link = []
def step_input(self, x): def step_input(self, x, level=0):
""" """
Mark a sequence as a dynamic RNN input. Mark a sequence as a dynamic RNN input.
Args: Args:
x(Variable): The input sequence. x(Variable): The input sequence.
level(int): The level of lod used to split steps. Default: 0.
Returns: Returns:
The current timestep in the input sequence. The current timestep in the input sequence.
...@@ -1471,7 +1460,8 @@ class DynamicRNN(object): ...@@ -1471,7 +1460,8 @@ class DynamicRNN(object):
parent_block.append_op( parent_block.append_op(
type='lod_rank_table', type='lod_rank_table',
inputs={"X": x}, inputs={"X": x},
outputs={"Out": self.lod_rank_table}) outputs={"Out": self.lod_rank_table},
attrs={"level": level})
self.max_seq_len = parent_block.create_var( self.max_seq_len = parent_block.create_var(
name=unique_name.generate('dynamic_rnn_max_seq_len'), name=unique_name.generate('dynamic_rnn_max_seq_len'),
dtype='int64') dtype='int64')
......
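A small sketch of the new `level` argument on DynamicRNN.step_input, assuming `seq` is a LoD input such as words grouped into sentences; `level` chooses which LoD level splits the time steps, and the previous behaviour is kept as the default 0.
import paddle.fluid as fluid

seq = fluid.layers.data(name='seq', shape=[32], dtype='float32', lod_level=2)
drnn = fluid.layers.DynamicRNN()
with drnn.block():
    # Step over LoD level 0 (the default); pass level=1 to iterate over the
    # inner level instead.
    word = drnn.step_input(seq, level=0)
    drnn.output(word)
out = drnn()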
...@@ -174,6 +174,8 @@ def monkey_patch_variable(): ...@@ -174,6 +174,8 @@ def monkey_patch_variable():
("__rtruediv__", "elementwise_div", True), ("__rtruediv__", "elementwise_div", True),
("__pow__", "elementwise_pow", False), ("__pow__", "elementwise_pow", False),
("__rpow__", "elementwise_pow", True), ("__rpow__", "elementwise_pow", True),
("__floordiv__", "elementwise_floordiv", False),
("__mod__", "elementwise_mod", False),
# for logical compare # for logical compare
("__eq__", "equal", False), ("__eq__", "equal", False),
("__ne__", "not_equal", False), ("__ne__", "not_equal", False),
......
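A brief sketch of what the two new operator hooks enable, assuming integer tensors built with fill_constant; `//` and `%` on Variables now dispatch to elementwise_floordiv and elementwise_mod (defined in the nn.py hunk below).
import paddle.fluid as fluid

a = fluid.layers.fill_constant(shape=[2], dtype='int64', value=7)
b = fluid.layers.fill_constant(shape=[2], dtype='int64', value=3)
q = a // b   # lowers to elementwise_floordiv, value 2
r = a % b    # lowers to elementwise_mod, value 1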
...@@ -1348,7 +1348,7 @@ def dropout(x, ...@@ -1348,7 +1348,7 @@ def dropout(x,
1. downgrade_in_infer(default), downgrade the outcome at inference 1. downgrade_in_infer(default), downgrade the outcome at inference
- train: out = input * mask - train: out = input * mask
- inference: out = input * dropout_prob - inference: out = input * (1.0 - dropout_prob)
(mask is a tensor same shape with input, value is 0 or 1 (mask is a tensor same shape with input, value is 0 or 1
ratio of 0 is dropout_prob) ratio of 0 is dropout_prob)
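A tiny numpy-only check of the corrected doc line above: with dropout_prob=0.3 and downgrade_in_infer, training multiplies by a 0/1 mask while inference scales the whole input by 1.0 - dropout_prob, so the expected magnitudes match.
import numpy as np

x = np.ones([4], dtype='float32')
dropout_prob = 0.3
mask = np.array([1, 0, 1, 1], dtype='float32')  # an illustrative train-time mask
train_out = x * mask                   # train: out = input * mask
infer_out = x * (1.0 - dropout_prob)   # inference: out = input * (1.0 - dropout_prob)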
...@@ -4901,6 +4901,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): ...@@ -4901,6 +4901,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
if len(y_shape) > 2 and len(x_shape) > 2: if len(y_shape) > 2 and len(x_shape) > 2:
for i, dim_x in enumerate(x_shape[:-2]): for i, dim_x in enumerate(x_shape[:-2]):
# don't check neg shape
if dim_x < 0 or y_shape[i] < 0:
continue
if dim_x != y_shape[i]: if dim_x != y_shape[i]:
raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" % raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
(x.shape, y.shape)) (x.shape, y.shape))
...@@ -9228,9 +9231,24 @@ def elementwise_pow(x, y, axis=-1, act=None, name=None): ...@@ -9228,9 +9231,24 @@ def elementwise_pow(x, y, axis=-1, act=None, name=None):
return _elementwise_op(LayerHelper('elementwise_pow', **locals())) return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
def elementwise_mod(x, y, axis=-1, act=None, name=None):
return _elementwise_op(LayerHelper('elementwise_mod', **locals()))
def elementwise_floordiv(x, y, axis=-1, act=None, name=None):
return _elementwise_op(LayerHelper('elementwise_floordiv', **locals()))
for func in [ for func in [
elementwise_add, elementwise_div, elementwise_sub, elementwise_mul, elementwise_add,
elementwise_max, elementwise_min, elementwise_pow elementwise_div,
elementwise_sub,
elementwise_mul,
elementwise_max,
elementwise_min,
elementwise_pow,
elementwise_mod,
elementwise_floordiv,
]: ]:
op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) op_proto = OpProtoHolder.instance().get_op_proto(func.__name__)
func.__doc__ = _generate_doc_string_( func.__doc__ = _generate_doc_string_(
...@@ -9706,7 +9724,12 @@ def sequence_reverse(x, name=None): ...@@ -9706,7 +9724,12 @@ def sequence_reverse(x, name=None):
return out return out
def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): def affine_channel(x,
scale=None,
bias=None,
data_layout='NCHW',
name=None,
act=None):
""" """
Applies a separate affine transformation to each channel of the input. Applies a separate affine transformation to each channel of the input.
Useful for replacing spatial batch norm with its equivalent fixed Useful for replacing spatial batch norm with its equivalent fixed
...@@ -9725,6 +9748,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): ...@@ -9725,6 +9748,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
data_layout (string, default NCHW): NCHW or NHWC. If input is 2D data_layout (string, default NCHW): NCHW or NHWC. If input is 2D
tensor, you can ignore data_layout. tensor, you can ignore data_layout.
name (str, default None): The name of this layer. name (str, default None): The name of this layer.
act (str, default None): Activation to be applied to the output of this layer.
Returns: Returns:
out (Variable): A tensor of the same shape and data layout with x. out (Variable): A tensor of the same shape and data layout with x.
...@@ -9744,7 +9768,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): ...@@ -9744,7 +9768,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
'Bias': bias}, 'Bias': bias},
attrs={"data_layout": data_layout}, attrs={"data_layout": data_layout},
outputs={"Out": out}) outputs={"Out": out})
return out return helper.append_activation(out)
def similarity_focus(input, axis, indexes, name=None): def similarity_focus(input, axis, indexes, name=None):
......
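A short sketch of the new `act` argument on affine_channel, assuming a hypothetical NCHW input with 16 channels and per-channel scale/bias parameters; the activation is simply applied to the affine output via the append_activation call shown above.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[16, 8, 8], dtype='float32')
scale = fluid.layers.create_parameter(shape=[16], dtype='float32')
bias = fluid.layers.create_parameter(shape=[16], dtype='float32')
y = fluid.layers.affine_channel(
    x, scale=scale, bias=bias, data_layout='NCHW', act='relu')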
...@@ -25,10 +25,26 @@ from .layer_function_generator import templatedoc ...@@ -25,10 +25,26 @@ from .layer_function_generator import templatedoc
import numpy import numpy
__all__ = [ __all__ = [
'create_tensor', 'create_parameter', 'create_global_var', 'cast', 'create_tensor',
'tensor_array_to_tensor', 'concat', 'sums', 'assign', 'create_parameter',
'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', 'create_global_var',
'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite' 'cast',
'tensor_array_to_tensor',
'concat',
'sums',
'assign',
'fill_constant_batch_size_like',
'fill_constant',
'argmin',
'argmax',
'argsort',
'ones',
'zeros',
'reverse',
'has_inf',
'has_nan',
'isfinite',
'range',
] ]
...@@ -764,3 +780,50 @@ def isfinite(x): ...@@ -764,3 +780,50 @@ def isfinite(x):
out = helper.create_variable_for_type_inference(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out}) helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
return out return out
def range(start, end, step, dtype):
"""
Return evenly spaced values within a given interval.
Values are generated within the half-open interval [start, end) (in other words,
the interval including start but excluding end).
Args:
start(int|float|Variable): Start of interval. The interval includes this value.
end(int|float|Variable): End of interval. The interval does not include this
value, except in some cases where step is not an integer
and floating point round-off affects the length of out.
step(int|float|Variable): Spacing between values. For any output out, this is the
distance between two adjacent values, out[i+1] - out[i].
dtype(string): 'float32'|'int32'|..., the data type of the output tensor.
Returns:
Evenly spaced values within a given interval.
Examples:
.. code-block:: python
data = fluid.layers.range(0, 10, 2, 'int32')
"""
helper = LayerHelper("range", **locals())
if not isinstance(start, Variable):
start = fill_constant([1], dtype, start)
if not isinstance(end, Variable):
end = fill_constant([1], dtype, end)
if not isinstance(step, Variable):
step = fill_constant([1], dtype, step)
out = helper.create_variable_for_type_inference(dtype=start.dtype)
helper.append_op(
type='range',
inputs={'Start': start,
'End': end,
'Step': step},
outputs={'Out': [out]})
return out
...@@ -70,6 +70,10 @@ class Optimizer(object): ...@@ -70,6 +70,10 @@ class Optimizer(object):
# {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
self._accumulators = defaultdict(lambda: dict()) self._accumulators = defaultdict(lambda: dict())
self.helper = None self.helper = None
self._opti_name_list = []
def get_opti_var_name_list(self):
return self._opti_name_list
def _create_global_learning_rate(self): def _create_global_learning_rate(self):
lr = self._global_learning_rate() lr = self._global_learning_rate()
...@@ -166,8 +170,13 @@ class Optimizer(object): ...@@ -166,8 +170,13 @@ class Optimizer(object):
if shape == None: if shape == None:
shape = param.shape shape = param.shape
assert isinstance(self.helper, LayerHelper) assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_" + name
var_name = unique_name.generate(var_name)
self._opti_name_list.append(var_name)
var = self.helper.create_global_variable( var = self.helper.create_global_variable(
name=unique_name.generate(name), name=var_name,
persistable=True, persistable=True,
dtype=dtype or param.dtype, dtype=dtype or param.dtype,
type=param.type, type=param.type,
......
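A hedged sketch of the new accessor, assuming an ordinary network whose avg_cost is minimized as usual; the optimizer now records each accumulator it creates under a `<param_name>_<accumulator_name>`-style unique name, and get_opti_var_name_list exposes that list (useful, for example, when deciding which optimizer variables to persist).
import paddle.fluid as fluid

opt = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
opt.minimize(avg_cost)  # avg_cost comes from the surrounding model definition
for name in opt.get_opti_var_name_list():
    print(name)  # e.g. one velocity accumulator per parameter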
...@@ -70,6 +70,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_transpiler) ...@@ -70,6 +70,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl)
list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_dist_transformer)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
...@@ -95,14 +96,16 @@ if(WITH_DISTRIBUTE) ...@@ -95,14 +96,16 @@ if(WITH_DISTRIBUTE)
if(NOT APPLE) if(NOT APPLE)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000)
# FIXME(typhoonzero): add these tests back # FIXME(typhoonzero): add these tests back
# py_test_modules(test_dist_transformer MODULES test_dist_transformer) # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
endif(NOT APPLE) endif(NOT APPLE)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
endif() endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
...@@ -115,8 +118,8 @@ if(NOT APPLE) ...@@ -115,8 +118,8 @@ if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
endif() endif()
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# change the timeout from 600 to 1200, because in debug mode, this test need more time. # change the timeout from 600 to 2200, because in debug mode, this test need more time.
set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200) set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200)
endif() endif()
if (WITH_NGRAPH) if (WITH_NGRAPH)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.01)))
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01)))
return predict
class TestDistMnist2x2(TestDistRunnerBase):
def get_model(self, batch_size=2, single_device=False):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
# Optimization
# TODO(typhoonzero): fix distributed adam optimizer
# opt = fluid.optimizer.AdamOptimizer(
# learning_rate=0.001, beta1=0.9, beta2=0.999)
opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
if single_device:
opt.minimize(avg_cost)
else:
# multi device or distributed multi device
params_grads = opt.backward(avg_cost)
data_parallel_param_grads = []
for p, g in params_grads:
# NOTE: scale will be done on loss scale in multi_devices_graph_pass using nranks.
grad_reduce = fluid.layers.collective._allreduce(g)
data_parallel_param_grads.append([p, grad_reduce])
opt.apply_gradients(data_parallel_param_grads)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
if __name__ == "__main__":
runtime_main(TestDistMnist2x2)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
class TestConcatOp(OpTest):
def setUp(self):
self.op_type = "concat"
self.use_mkldnn = True
self._cpu_only = True
self.init_axis()
self.init_shape()
self.init_test_data()
self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
self.attrs = {'axis': self.axis, 'use_mkldnn': True}
self.output = np.concatenate(
(self.x0, self.x1, self.x2), axis=self.axis).astype('int')
self.outputs = {'Out': self.output}
def test_check_output(self):
self.check_output()
#--------------------test concat s8 in with axis 0--------------------
def init_test_data(self):
self.x0 = (np.random.randint(0, 100, self.x0_shape) - 50).astype('int8')
self.x1 = (np.random.randint(0, 80, self.x1_shape) - 30).astype('int8')
self.x2 = (np.random.randint(0, 110, self.x2_shape) - 80).astype('int8')
def init_axis(self):
self.axis = 0
def init_shape(self):
self.x0_shape = [2, 2, 1, 2]
self.x1_shape = [1, 2, 1, 2]
self.x2_shape = [3, 2, 1, 2]
#--------------------test concat u8 in with axis 0--------------------
class TestConcatOp2(TestConcatOp):
def init_test_data(self):
self.x0 = (np.random.randint(0, 100, self.x0_shape)).astype('uint8')
self.x1 = (np.random.randint(0, 50, self.x1_shape)).astype('uint8')
self.x2 = (np.random.randint(0, 80, self.x2_shape)).astype('uint8')
def init_axis(self):
self.axis = 0
def init_shape(self):
self.x0_shape = [2, 1, 5, 5]
self.x1_shape = [1, 1, 5, 5]
self.x2_shape = [3, 1, 5, 5]
def create_test_int8_class(parent):
#--------------------test concat s8/u8 in with axis 1--------------------
class TestAxis1Case(parent):
def init_axis(self):
self.axis = 1
def init_shape(self):
self.x0_shape = [1, 1, 5, 5]
self.x1_shape = [1, 2, 5, 5]
self.x2_shape = [1, 3, 5, 5]
#--------------------test concat s8/u8 in with axis 2--------------------
class TestAxis2Case(parent):
def init_axis(self):
self.axis = 2
def init_shape(self):
self.x0_shape = [2, 3, 4, 5]
self.x1_shape = [2, 3, 5, 5]
self.x2_shape = [2, 3, 6, 5]
#--------------------test concat s8/u8 in with axis 3--------------------
class TestAxis3Case(parent):
def init_axis(self):
self.axis = 3
def init_shape(self):
self.x0_shape = [2, 3, 5, 5]
self.x1_shape = [2, 3, 5, 6]
self.x2_shape = [2, 3, 5, 7]
cls_name_1 = "{0}_axis_{1}".format(parent.__name__, "1")
cls_name_2 = "{0}_axis_{1}".format(parent.__name__, "2")
cls_name_3 = "{0}_axis_{1}".format(parent.__name__, "3")
TestAxis1Case.__name__ = cls_name_1
TestAxis2Case.__name__ = cls_name_2
TestAxis3Case.__name__ = cls_name_3
globals()[cls_name_1] = TestAxis1Case
globals()[cls_name_2] = TestAxis2Case
globals()[cls_name_3] = TestAxis3Case
create_test_int8_class(TestConcatOp)
create_test_int8_class(TestConcatOp2)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -12,31 +12,9 @@ ...@@ -12,31 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
__all__ = ['GraphPass', 'PruneParameterPass'] from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_softmax_with_cross_entropy_op import TestSoftmaxWithCrossEntropyOp, TestSoftmaxWithCrossEntropyOp2, TestSoftmaxWithCrossEntropyOp3
if __name__ == "__main__":
class GraphPass(object): unittest.main()
"""
Base class for all graph pass.
"""
def __init__(self):
pass
def apply(self, graph):
pass
class PruneParameterPass(GraphPass):
"""
Generate a graph for pruning parameters from target graph.
"""
def __init__(self, pruned_params, thresholds):
super(PruneParameterPass, self).__init__()
self.pruned_params = pruned_params
self.thresholds = thresholds
self.default_threshold = thresholds['*']
def apply(self, graph):
pass
...@@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase):
use_ir_memory_optimize=True, use_ir_memory_optimize=True,
enable_inplace=True, enable_inplace=True,
fuse_elewise_add_act_ops=False, fuse_elewise_add_act_ops=False,
fuse_all_reduce_ops=False,
fuse_relu_depthwise_conv=False, fuse_relu_depthwise_conv=False,
optimizer=fluid.optimizer.Adam, optimizer=fluid.optimizer.Adam,
use_fast_executor=False, use_fast_executor=False,
...@@ -80,6 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -80,6 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
# python memory optimization is conflict with inplace pass. # python memory optimization is conflict with inplace pass.
# Use ir graph memory optimization after inplace pass is the correct way. # Use ir graph memory optimization after inplace pass is the correct way.
build_strategy.enable_inplace = False if memory_opt else enable_inplace build_strategy.enable_inplace = False if memory_opt else enable_inplace
......
@@ -11,39 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from __future__ import print_function
-import os
-import subprocess
-from ....framework import Program
-from ....framework import Block
-from .... import core
-__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
-class Graph(object):
-    """
-    Base class for all graph.
-    """
-    def __init__(self):
-        pass
-    def all_parameters(self):
-        """
-        Return all the parameters in current graph.
-        """
-        pass
-class ImitationGraph(Graph):
-    def __init__(self, program=None):
-        super(ImitationGraph, self).__init__()
-        self.program = Program() if program is None else program
-    def all_parameters(self):
-        return self.program.global_block().all_parameters()
-class IRGraph(Graph):
-    pass
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+class TestDistMnistNCCL2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+        self._nccl2_reduce_layer = True
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_allreduce_op.py", delta=1e-5)
+if __name__ == '__main__':
+    unittest.main()
@@ -33,7 +33,10 @@ DEFAULT_BATCH_SIZE = 2
class TestDistRunnerBase(object):
-    def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1):
+    def get_model(self,
+                  batch_size=DEFAULT_BATCH_SIZE,
+                  lr=0.1,
+                  single_device=False):
        raise NotImplementedError(
            "get_model should be implemented by child classes.")
@@ -76,8 +79,12 @@ class TestDistRunnerBase(object):
    def run_trainer(self, args):
        self.lr = args.lr
-        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-            self.get_model(batch_size=args.batch_size)
+        if args.nccl2_reduce_layer_local_run:
+            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+                self.get_model(batch_size=args.batch_size, single_device=True)
+        else:
+            test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+                self.get_model(batch_size=args.batch_size)
        if args.mem_opt:
            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
@@ -87,7 +94,7 @@ class TestDistRunnerBase(object):
                args.endpoints, args.trainers,
                args.sync_mode, args.dc_asgd)
            trainer_prog = t.get_trainer_program()
-        elif args.update_method == "nccl2":
+        elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
            # transpile for nccl2
            config = fluid.DistributeTranspilerConfig()
            config.mode = "nccl2"
@@ -103,16 +110,17 @@ class TestDistRunnerBase(object):
            trainer_prog = fluid.default_main_program()
        if args.use_cuda:
-            place = fluid.CUDAPlace(0)
+            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+            place = fluid.CUDAPlace(device_id)
        else:
            place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
-        strategy = fluid.ExecutionStrategy()
-        strategy.num_threads = 1
-        strategy.allow_op_delay = False
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.num_threads = 1
+        exec_strategy.allow_op_delay = False
        build_stra = fluid.BuildStrategy()
        # FIXME force disable enable_inplace and memory_optimize
@@ -124,23 +132,25 @@ class TestDistRunnerBase(object):
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        pass_builder = None
        if args.batch_merge_repeat > 1:
            pass_builder = build_stra._finalize_strategy_and_create_passes()
            mypass = pass_builder.insert_pass(
                len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass")
            mypass.set("num_repeats", args.batch_merge_repeat)
-        if args.update_method == "nccl2":
+        if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
            build_stra.num_trainers = len(args.endpoints.split(","))
            build_stra.trainer_id = args.trainer_id
        else:
+            # case args.update_method == "nccl2_reduce_layer":
            build_stra.num_trainers = 1
            build_stra.trainer_id = 0
        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
            loss_name=avg_cost.name,
            build_strategy=build_stra,
-            exec_strategy=strategy)
+            exec_strategy=exec_strategy)
        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
@@ -182,7 +192,7 @@ def runtime_main(test_class):
        '--update_method',
        type=str,
        default="local",
-        choices=["pserver", "nccl2", "local"])
+        choices=["pserver", "nccl2", "local", "nccl2_reduce_layer"])
    parser.add_argument('--trainer_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument(
@@ -198,6 +208,11 @@ def runtime_main(test_class):
    parser.add_argument('--lr', required=False, type=float, default=0.001)
    parser.add_argument(
        '--batch_merge_repeat', required=False, type=int, default=1)
+    parser.add_argument(
+        '--nccl2_reduce_layer_local_run',
+        required=False,
+        type=bool,
+        default=False)
    args = parser.parse_args()
@@ -242,6 +257,12 @@ class TestDistBase(unittest.TestCase):
        self._dc_asgd = False  # must use with async mode
        self._use_reader_alloc = True
        self._nccl2_mode = False
+        self._mp_mode = False
+        # FIXME(typhoonzero): I added this stupid argument to enable
+        # testing allreduce layers, which users can call layers.allreduce
+        # to accumulate tensors at anywhere. Find a better way to do this
+        # test, reduce check this argument everywhere.
+        self._nccl2_reduce_layer = False
        self._lr = 0.001
        self._setup_config()
        self._after_setup_config()
@@ -307,10 +328,16 @@ class TestDistBase(unittest.TestCase):
            cmd += " --batch_size %d" % batch_size
        if batch_merge_repeat > 1:
            cmd += " --batch_merge_repeat %d" % batch_merge_repeat
+        if self._nccl2_reduce_layer:
+            cmd += " --nccl2_reduce_layer_local_run 1"
        if self.__use_cuda:
            cmd += " --use_cuda"
-            env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+            env_local = {
+                "CUDA_VISIBLE_DEVICES": "0",
+                "PADDLE_TRAINERS_NUM": "1",
+                "PADDLE_TRAINER_ID": "0"
+            }
        else:
            env_local = {'CPU_NUM': '1'}
@@ -427,29 +454,30 @@ class TestDistBase(unittest.TestCase):
            sys.stderr.write("ps1 stderr: %s\n" % fn.read())
        # print log
-        if stat0 == 0:
-            sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
        with open("/tmp/tr0_err.log", "r") as fn:
            sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
-        if stat1 == 0:
-            sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
        with open("/tmp/tr1_err.log", "r") as fn:
            sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
        return pickle.loads(tr0_out), pickle.loads(tr1_out)
-    def _run_cluster_nccl2(self, model, envs, check_error_log):
+    def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer,
+                           check_error_log):
        # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
        worker_endpoints = self._ps_endpoints.split(",")
        w0_ep, w1_ep = worker_endpoints
+        if nccl2_reduce_layer:
+            update_method = "nccl2_reduce_layer"
+        else:
+            update_method = "nccl2"
-        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f"
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f"
        tr0_cmd = tr_cmd % \
                  (self._python_interp, model, self._ps_endpoints,
-                   0, w0_ep, self._lr)
+                   0, w0_ep, update_method, self._lr)
        tr1_cmd = tr_cmd % \
                  (self._python_interp, model, self._ps_endpoints,
-                   1, w1_ep, self._lr)
+                   1, w1_ep, update_method, self._lr)
        if self._mem_opt:
            tr0_cmd += " --mem_opt"
@@ -463,12 +491,25 @@ class TestDistBase(unittest.TestCase):
        if self.__use_cuda:
            tr0_cmd += " --use_cuda"
            tr1_cmd += " --use_cuda"
-            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+            env0 = {
+                "CUDA_VISIBLE_DEVICES": "0",
+                # for test nccl2 layer
+                "PADDLE_TRAINERS_NUM": "2",
+                "PADDLE_TRAINER_ID": "0"
+            }
+            env1 = {
+                "CUDA_VISIBLE_DEVICES": "1",
+                "PADDLE_TRAINERS_NUM": "2",
+                "PADDLE_TRAINER_ID": "1"
+            }
        else:
            env0 = {'CPU_NUM': '1'}
            env1 = {'CPU_NUM': '1'}
+        if self._mp_mode:
+            env0 = {"FLAGS_selected_gpus": "0"}
+            env1 = {"FLAGS_selected_gpus": "1"}
        env0.update(envs)
        env1.update(envs)
@@ -498,8 +539,6 @@ class TestDistBase(unittest.TestCase):
        # print log
        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
        sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
-        sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out)
-        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out)
        return pickle.loads(tr0_out), pickle.loads(tr1_out)
@@ -528,10 +567,14 @@ class TestDistBase(unittest.TestCase):
        local_losses\
            = self._run_local(model_file, required_envs,
                              check_error_log)
        if self._nccl2_mode:
-            tr0_losses, tr1_losses = self._run_cluster_nccl2(
-                model_file, required_envs, check_error_log)
+            if self._nccl2_reduce_layer:
+                tr0_losses, tr1_losses = self._run_cluster_nccl2(
+                    model_file, required_envs, True, check_error_log)
+            else:
+                tr0_losses, tr1_losses = self._run_cluster_nccl2(
+                    model_file, required_envs, False, check_error_log)
        else:
            tr0_losses, tr1_losses = self._run_cluster(
                model_file, required_envs, check_error_log)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
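# Decorator: skip the wrapped test when the SKIP_UNSTABLE_CI environment
# variable is set to a non-zero value (used to mute unstable CI cases).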
def skip_ci(func):
on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))
def __func__(*args, **kwargs):
if on_ci:
return
return func(*args, **kwargs)
return __func__
class TestDistSeResneXtNCCL(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reader_alloc = False
self._nccl2_mode = True
@skip_ci
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_se_resnext.py", delta=1e-5)
class TestDistSeResneXtNCCLMP(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reader_alloc = False
self._nccl2_mode = True
self._mp_mode = True
@skip_ci
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place(
"dist_se_resnext.py",
delta=1e-5,
need_envs={"NCCL_P2P_DISABLE": "1"})
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
import random
class TestElementwiseModOp(OpTest):
def init_kernel_type(self):
self.use_mkldnn = False
def setUp(self):
self.op_type = "elementwise_floordiv"
self.dtype = np.int32
self.axis = -1
self.init_dtype()
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out}
def test_check_output(self):
self.check_output()
def init_input_output(self):
self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
self.out = np.floor_divide(self.x, self.y)
def init_dtype(self):
pass
def init_axis(self):
pass
class TestElementwiseModOp_scalar(TestElementwiseModOp):
def init_input_output(self):
scale_x = random.randint(0, 100000000)
scale_y = random.randint(1, 100000000)
self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
self.out = np.floor_divide(self.x, self.y)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
import random
class TestElementwiseModOp(OpTest):
def init_kernel_type(self):
self.use_mkldnn = False
def setUp(self):
self.op_type = "elementwise_mod"
self.dtype = np.int32
self.axis = -1
self.init_dtype()
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out}
def test_check_output(self):
self.check_output()
def init_input_output(self):
self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
self.out = np.mod(self.x, self.y)
def init_dtype(self):
pass
def init_axis(self):
pass
class TestElementwiseModOp_scalar(TestElementwiseModOp):
def init_input_output(self):
scale_x = random.randint(0, 100000000)
scale_y = random.randint(1, 100000000)
self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
self.out = np.mod(self.x, self.y)
if __name__ == '__main__':
unittest.main()
@@ -31,15 +31,27 @@ def dequantize_max_abs(x, scale, max_range):
    return y
-def channel_wise_quantize_max_abs(x, quant_bit=8):
+def channel_wise_quantize_max_abs(x, quant_bit=8, use_second_dim=False):
    scales = []
-    for i in range(x.shape[0]):
-        scales.append(np.max(np.abs(x[i])).astype("float32"))
-    y = x.copy()
-    max_range = math.pow(2, quant_bit - 1) - 1
-    for i, scale in enumerate(scales):
-        y[i] = np.round(y[i] / scale * max_range)
+    if not use_second_dim:
+        for i in range(x.shape[0]):
+            scales.append(np.max(np.abs(x[i])).astype("float32"))
+        y = x.copy()
+        max_range = math.pow(2, quant_bit - 1) - 1
+        for i, scale in enumerate(scales):
+            y[i] = np.round(x[i] / scale * max_range)
+    else:
+        for i in range(x.shape[0]):
+            s = []
+            for j in range(x.shape[1]):
+                s.append(np.max(np.abs(x[i][j])).astype("float32"))
+            scales.append(s)
+        scales = np.amax(np.array(scales), axis=0)
+        y = x.copy()
+        max_range = math.pow(2, quant_bit - 1) - 1
+        for i in range(x.shape[0]):
+            for j, scale in enumerate(scales):
+                y[i][j] = np.round(x[i][j] / scale * max_range)
    return y, scales
@@ -47,10 +59,16 @@ def channel_wise_dequantize_max_abs(x,
                                    scales,
                                    quant_bits,
                                    activation_scale=None):
-    y = x.copy()
-    for i in range(x.shape[0]):
-        y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i]
-    if activation_scale is not None:
+    if activation_scale is None:
+        y = x.copy()
+        for i in range(x.shape[0]):
+            y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * x[i]
+    else:
+        y = x.copy()
+        for i in range(x.shape[0]):
+            for j in range(x.shape[1]):
+                y[i][j] = (scales[j] /
+                           (math.pow(2, quant_bits[0] - 1) - 1)) * x[i][j]
        y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
    return y
@@ -65,7 +83,8 @@ class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
        self.set_args()
        self.op_type = "fake_channel_wise_dequantize_max_abs"
        x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
+        yq, scales = channel_wise_quantize_max_abs(
+            x, self.quant_bits[0], use_second_dim=True)
        ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
                                              self.activation_scale)
......
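The reference helpers above collapse each channel to a single max-abs scale and map values onto a signed integer grid. A standalone NumPy sketch of the same round trip (8-bit, channel on axis 0; all names below are illustrative, not from this commit):

import numpy as np

np.random.seed(0)
x = np.random.randn(3, 4).astype("float32")    # 3 channels, 4 values each
max_range = 2 ** (8 - 1) - 1                   # 127 for 8-bit quantization

scales = np.abs(x).max(axis=1)                 # one max-abs scale per channel
q = np.round(x / scales[:, None] * max_range)  # quantize
x_hat = q * scales[:, None] / max_range        # dequantize

print(np.max(np.abs(x - x_hat)))               # reconstruction error stays small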
@@ -53,7 +53,7 @@ class TestFakeChannelWiseQuantizeOp(OpTest):
        self.outputs = {
            'Out': outputs,
-            'OutScales': np.array(scales).astype("float32"),
+            'OutScale': np.array(scales).astype("float32"),
        }
    def test_check_output(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
import os
def simple_fc_net(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(2):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
def _init_data(self, random=True):
np.random.seed(5)
if random:
img = np.random.random(size=[32, 784]).astype(np.float32)
else:
img = np.ones(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
def _compare_fuse_all_reduce_ops(self, model, use_cuda, random_data=True):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data(random_data)
def _optimizer(learning_rate=1e-6):
optimizer = fluid.optimizer.SGD(
learning_rate=learning_rate,
regularization=fluid.regularizer.L2Decay(1e-6))
return optimizer
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
fuse_all_reduce_ops=False,
memory_opt=False,
optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
fuse_all_reduce_ops=True,
memory_opt=False,
optimizer=_optimizer)
for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
def test_simple_fc_with_fuse_op(self):
self._compare_fuse_all_reduce_ops(simple_fc_net, True)
self._compare_fuse_all_reduce_ops(simple_fc_net, False)
def test_batchnorm_fc_with_fuse_op(self):
self._compare_fuse_all_reduce_ops(fc_with_batchnorm, True)
self._compare_fuse_all_reduce_ops(fc_with_batchnorm, False)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
from paddle.fluid.imperative.base import to_variable
class SimpleImgConvPool(fluid.imperative.Layer):
def __init__(self,
name_scope,
num_channels,
num_filters,
filter_size,
pool_size,
pool_stride,
pool_padding=0,
pool_type='max',
global_pooling=False,
conv_stride=1,
conv_padding=0,
conv_dilation=1,
conv_groups=1,
act=None,
use_cudnn=False,
param_attr=None,
bias_attr=None):
super(SimpleImgConvPool, self).__init__(name_scope)
self._conv2d = Conv2D(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=conv_stride,
padding=conv_padding,
dilation=conv_dilation,
groups=conv_groups,
param_attr=None,
bias_attr=None,
use_cudnn=use_cudnn)
self._pool2d = Pool2D(
self.full_name(),
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
pool_padding=pool_padding,
global_pooling=global_pooling,
use_cudnn=use_cudnn)
def forward(self, inputs):
x = self._conv2d(inputs)
x = self._pool2d(x)
return x
class MNIST(fluid.imperative.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
self._simple_img_conv_pool_1 = SimpleImgConvPool(
self.full_name(), 1, 20, 5, 2, 2, act="relu")
self._simple_img_conv_pool_2 = SimpleImgConvPool(
self.full_name(), 20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = self._fc(x)
return x
class TestImperativeCheckpoint(unittest.TestCase):
def save_load_persistables(self):
seed = 90
epoch_num = 1
with fluid.imperative.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
dy_param_init_value = {}
step = 0
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
dy_x_data = np.array(
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(128, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
label._stop_gradient = True
cost = mnist(img)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
dy_out = avg_loss._numpy()
avg_loss._backward()
sgd.minimize(avg_loss)
fluid.imperative.save_persistables(mnist, "save_dir")
mnist.clear_gradients()
for param in mnist.parameters():
dy_param_init_value[param.name] = param._numpy()
mnist.load_dict(
fluid.imperative.load_persistables(mnist, "save_dir"))
restore = mnist.parameters()
self.assertEqual(len(dy_param_init_value), len(restore))
for value in restore:
self.assertTrue(
np.allclose(value, dy_param_init_value[value.name]))
self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any()))
step += 1
if step > 20:
break
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import random
import sys
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable
NUM_USERS = 100
NUM_ITEMS = 1000
BATCH_SIZE = 32
NUM_BATCHES = 2
class MLP(fluid.imperative.Layer):
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._user_latent = fluid.imperative.FC(self.full_name(), 256)
self._item_latent = fluid.imperative.FC(self.full_name(), 256)
self._user_layers = []
self._item_layers = []
self._hid_sizes = [128, 64]
for i in range(len(self._hid_sizes)):
self._user_layers.append(
self.add_sublayer(
'user_layer_%d' % i,
fluid.imperative.FC(
self.full_name(), self._hid_sizes[i], act='relu')))
self._item_layers.append(
self.add_sublayer(
'item_layer_%d' % i,
fluid.imperative.FC(
self.full_name(), self._hid_sizes[i], act='relu')))
def forward(self, users, items):
users = self._user_latent(users)
items = self._item_latent(items)
for ul, il in zip(self._user_layers, self._item_layers):
users = ul(users)
items = il(items)
return fluid.layers.elementwise_mul(users, items)
class DMF(fluid.imperative.Layer):
def __init__(self, name_scope):
super(DMF, self).__init__(name_scope)
self._user_latent = fluid.imperative.FC(self.full_name(), 256)
self._item_latent = fluid.imperative.FC(self.full_name(), 256)
self._match_layers = []
self._hid_sizes = [128, 64]
for i in range(len(self._hid_sizes)):
self._match_layers.append(
self.add_sublayer(
'match_layer_%d' % i,
fluid.imperative.FC(
self.full_name(), self._hid_sizes[i], act='relu')))
self._mat
def forward(self, users, items):
users = self._user_latent(users)
items = self._item_latent(items)
match_vec = fluid.layers.concat(
[users, items], axis=len(users.shape) - 1)
for l in self._match_layers:
match_vec = l(match_vec)
return match_vec
class DeepCF(fluid.imperative.Layer):
def __init__(self, name_scope):
super(DeepCF, self).__init__(name_scope)
self._user_emb = fluid.imperative.Embedding(self.full_name(),
[NUM_USERS, 256])
self._item_emb = fluid.imperative.Embedding(self.full_name(),
[NUM_ITEMS, 256])
self._mlp = MLP(self.full_name())
self._dmf = DMF(self.full_name())
self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid')
def forward(self, users, items):
users_emb = self._user_emb(users)
items_emb = self._item_emb(items)
mlp_predictive = self._mlp(users_emb, items_emb)
dmf_predictive = self._dmf(users_emb, items_emb)
predictive = fluid.layers.concat(
[mlp_predictive, dmf_predictive],
axis=len(mlp_predictive.shape) - 1)
prediction = self._match_fc(predictive)
return prediction
def get_data():
user_ids = []
item_ids = []
labels = []
for uid in range(NUM_USERS):
for iid in range(NUM_ITEMS):
# 10% positive
label = float(random.randint(1, 10) == 1)
user_ids.append(uid)
item_ids.append(iid)
labels.append(label)
indices = np.arange(NUM_USERS * NUM_ITEMS)
np.random.shuffle(indices)
users_np = np.array(user_ids, dtype=np.int64)[indices]
items_np = np.array(item_ids, dtype=np.int64)[indices]
labels_np = np.array(labels, dtype=np.float32)[indices]
return np.expand_dims(users_np, -1), \
np.expand_dims(items_np, -1), \
np.expand_dims(labels_np, -1)
class TestImperativeDeepCF(unittest.TestCase):
def test_gan_float32(self):
seed = 90
users_np, items_np, labels_np = get_data()
startup = fluid.Program()
startup.random_seed = seed
main = fluid.Program()
main.random_seed = seed
scope = fluid.core.Scope()
with new_program_scope(main=main, startup=startup, scope=scope):
users = fluid.layers.data('users', [1], dtype='int64')
items = fluid.layers.data('items', [1], dtype='int64')
labels = fluid.layers.data('labels', [1], dtype='float32')
deepcf = DeepCF('deepcf')
prediction = deepcf(users, items)
loss = fluid.layers.reduce_sum(
fluid.layers.log_loss(prediction, labels))
adam = fluid.optimizer.AdamOptimizer(0.01)
adam.minimize(loss)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
exe.run(startup)
for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
static_loss = exe.run(
main,
feed={
users.name: users_np[slice:slice + BATCH_SIZE],
items.name: items_np[slice:slice + BATCH_SIZE],
labels.name: labels_np[slice:slice + BATCH_SIZE]
},
fetch_list=[loss])[0]
sys.stderr.write('static loss %s\n' % static_loss)
with fluid.imperative.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
deepcf = DeepCF('deepcf')
for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
prediction = deepcf(
to_variable(users_np[slice:slice + BATCH_SIZE]),
to_variable(items_np[slice:slice + BATCH_SIZE]))
loss = fluid.layers.reduce_sum(
fluid.layers.log_loss(prediction,
to_variable(labels_np[slice:slice +
BATCH_SIZE])))
loss._backward()
adam = fluid.optimizer.AdamOptimizer(0.01)
adam.minimize(loss)
deepcf.clear_gradients()
dy_loss = loss._numpy()
self.assertEqual(static_loss, dy_loss)
if __name__ == '__main__':
unittest.main()
@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer):
        return self._fc3(x)
-class TestImperativeMnist(unittest.TestCase):
+class TestImperativeGAN(unittest.TestCase):
    def test_gan_float32(self):
        seed = 90
......
@@ -1240,6 +1240,14 @@ class TestBook(unittest.TestCase):
            print(str(program))
+    def test_range(self):
+        program = Program()
+        with program_guard(program):
+            layers.range(0, 10, 2, 'int32')
+            layers.range(0.1, 10.0, 0.2, 'float32')
+            print(str(program))
    def test_spectral_norm(self):
        program = Program()
        with program_guard(program):
......
@@ -13,6 +13,9 @@
# limitations under the License.
from __future__ import print_function
+import os
+os.environ['FLAGS_fuse_parameter_memory_size'] = "131072"
+os.environ['FLAGS_fuse_parameter_groups_size'] = "3"
import paddle.fluid as fluid
import paddle.fluid.layers.ops as ops
@@ -22,7 +25,6 @@ import paddle.fluid.core as core
from parallel_executor_test_base import TestParallelExecutorBase
import unittest
import math
-import os
import numpy as np
# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
@@ -312,17 +314,59 @@ class TestResnet(TestParallelExecutorBase):
        self.assertAlmostEquals(
            np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
+    def _compare_with_fused_all_reduce(self,
+                                       model,
+                                       use_cuda,
+                                       iter=20,
+                                       delta2=1e-5):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        global remove_bn
+        remove_bn = True
+        img, label = self._init_data(batch_size=batch_size)
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            fuse_all_reduce_ops=False,
+            optimizer=optimizer)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            fuse_all_reduce_ops=True,
+            optimizer=optimizer)
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
    def test_seresnext_with_learning_rate_decay(self):
        self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
        self._check_resnet_convergence(
            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
-    def test_seresnext_with_new_strategy(self):
+    def test_seresnext_with_reduce(self):
        self._compare_reduce_and_allreduce(
            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
        self._compare_reduce_and_allreduce(
            model=SE_ResNeXt50Small, use_cuda=False, iter=5)
+    def test_seresnext_with_fused_all_reduce(self):
+        self._compare_with_fused_all_reduce(
+            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3)
+        self._compare_with_fused_all_reduce(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestRangeOp(OpTest):
def setUp(self):
self.op_type = "range"
self.init_config()
self.inputs = {
'Start': np.array([self.case[0]]).astype(self.dtype),
'End': np.array([self.case[1]]).astype(self.dtype),
'Step': np.array([self.case[2]]).astype(self.dtype)
}
self.outputs = {
'Out': np.arange(self.case[0], self.case[1],
self.case[2]).astype(self.dtype)
}
def init_config(self):
self.dtype = np.float32
self.case = (0, 1, 0.2)
def test_check_output(self):
self.check_output()
class TestFloatRangeOpCase0(TestRangeOp):
def init_config(self):
self.dtype = np.float32
self.case = (0, 5, 1)
class TestInt32RangeOpCase0(TestRangeOp):
def init_config(self):
self.dtype = np.int32
self.case = (0, 5, 2)
class TestInt32RangeOpCase1(TestRangeOp):
def init_config(self):
self.dtype = np.int32
self.case = (10, 1, -2)
class TestInt32RangeOpCase2(TestRangeOp):
def init_config(self):
self.dtype = np.int32
self.case = (-1, -10, -2)
if __name__ == "__main__":
unittest.main()
@@ -12,3 +12,4 @@ six
funcsigs
pyyaml
decorator
+prettytable
@@ -52,7 +52,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o
    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
-RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
+RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
    cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]