diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9a6d0d1c083dcaa1bee570455ad46caddd2a3d18..70a4d7b40b154ff80ff6d30adaa147556749e905 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -56,7 +56,7 @@ paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_pr paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95')) paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2')) paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2')) -paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '582d87b8df75a5a639a107db8ff86f9c')) +paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d')) paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb')) paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) 
paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -109,7 +109,7 @@ paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3')) paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce')) paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6')) -paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'dc7042734c6d8b8ce97321f017f01d6f')) +paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'f1dd22f7351f7f9853212958e0d8aa7a')) paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2')) paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) @@ -205,7 +205,7 @@ paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, 
paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e')) paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed')) paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f')) -paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)), ('document', '2f46f1ff39a13ab00857e7b9f44b2fa7')) +paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9')) paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '70e3b5182a18b40b47ecabd7c8490a35')) paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '9bb77f8dc002dd2ce75d4769eaaf5007')) paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d')) @@ -255,6 +255,7 @@ paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords= paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc')) paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8')) paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 
'0a437011c3906079fd8947ed3e52d292')) +paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb')) paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -277,7 +278,7 @@ paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywo paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', 'b9174d4e91505b0c8ecc193eb51e248d')) paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a')) paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'f29ad2478b6b2ad4f413d2936a331ea0')) -paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '169d694d2224f62b4f3afdc3dbc19e95')) +paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655')) paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, 
keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7')) @@ -296,7 +297,7 @@ paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9')) paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) -paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27')) +paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb')) paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd')) paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) @@ -376,23 +377,9 @@ paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args', paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958')) paddle.fluid.contrib.Calibrator.save_int8_model 
(ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab')) -paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0')) -paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6')) -paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) 
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d')) -paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645')) +paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 
'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], './checkpoints', None, None)), ('document', '31ae143830c9bf6b43547dd546c5ba80')) +paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0')) +paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9')) paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67')) paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f')) paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b')) @@ -432,48 +419,59 @@ paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'poo paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, 
None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], 
varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdamaxOptimizer.__init__ 
(ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 
'6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 
'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) 
paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715')) paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe')) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) 
+paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24')) paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9f06455ea5410bcab081ed212a34960f8fe6f0bf..7a371af510b8050aec3708d82923c707fd9d3a90 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -9,6 +9,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) +cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -22,6 +23,8 @@ endif() if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) + nv_library(fused_all_reduce_op_handle SRCS 
fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + dynload_cuda variable_visitor) if(WITH_DISTRIBUTE) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda selected_rows_functor sendrecvop_rpc) @@ -35,6 +38,8 @@ if(WITH_GPU) else() cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory variable_visitor) + cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + variable_visitor) if(WITH_DISTRIBUTE) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim selected_rows_functor sendrecvop_rpc) @@ -46,9 +51,7 @@ else() cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) endif() -cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) -cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) if(WITH_GPU) cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) @@ -69,7 +72,9 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle - scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle) + +cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper 
fused_all_reduce_op_handle) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass) if (WITH_GPU) @@ -98,5 +103,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass - fuse_relu_depthwise_conv_pass - memory_optimize_pass lock_free_optimize_pass) + fuse_relu_depthwise_conv_pass + memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index c1f9c2b60c915370df7793f26fe83812a7ced96d..fdaff08e53755dc43df01e4734d355a286bb5863 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -11,9 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" @@ -56,6 +55,7 @@ void AllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); WaitInputVarGenerated(); + auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbc8bbf56b0d1ce75fbb459038c63571d0a16cd3 --- /dev/null +++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc @@ -0,0 +1,393 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB + "fuse_parameter_memory_size is up limited memory size " + "of one group parameters' gradient which is the input " + "of communication calling(e.g NCCLAllReduce). " + "The default value is 0, it means that " + "not set group according to memory_size."); +DEFINE_int32( + fuse_parameter_groups_size, 3, + "fuse_parameter_groups_size is the size of one group parameters' gradient. " + "The default value is a experimental result. If the " + "fuse_parameter_groups_size is 1, it means that the groups size is " + "the number of parameters' gradient. If the fuse_parameter_groups_size is " + "-1, it means that there are only one group. The default value is 3, it is " + "an experimental value."); + +namespace paddle { +namespace framework { +namespace details { + +static const char kUnKnow[] = "@UNKNOW@"; +static framework::proto::VarType::Type kDefaultDtype = + framework::proto::VarType::Type::VarType_Type_BOOL; + +class AllocContinuousSpaceForGradPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + ir::Graph &result = *graph; + + auto &places = Get>(kPlaces); + auto &local_scopes = Get>(kLocalScopes); + + ResetAttribute(kParamsAndGrads, &result); + ResetAttribute(kGroupGradsAndParams, &result); + + // NOTE: The operator nodes should be in topology order. 
+ std::vector topo_nodes = ir::TopologySortOperations(result); + auto ¶ms_grads = result.Get(kParamsAndGrads); + for (auto &node : topo_nodes) { + RecordParamsAndGrads(node, ¶ms_grads); + } + + if (params_grads.size() == 0) { + VLOG(10) << "Doesn't find gradients"; + return std::move(graph); + } + + std::unordered_map vars; + for (ir::Node *node : result.Nodes()) { + if (node->IsVar() && node->Var()) { + // Note: The graph may have the same name node. For example, parameter + // is the input of operator and it also is the output of optimizer; + vars.emplace(node->Var()->Name(), node); + } + } + + auto &group_grads_params = + result.Get(kGroupGradsAndParams); + + // Note: the order of params_grads may be changed by SetGroupGradsAndParams. + SetGroupGradsAndParams(vars, params_grads, &group_grads_params); + + params_grads.clear(); + for (auto &group_p_g : group_grads_params) { + params_grads.insert(params_grads.begin(), group_p_g.begin(), + group_p_g.end()); + } + for (auto &p_g : params_grads) { + std::swap(p_g.first, p_g.second); + } + + // Set Gradients as Persistable to prevent this var becoming reusable. + auto dtype = kDefaultDtype; + for (auto &p_g : params_grads) { + // Get gradient var + auto iter = vars.find(p_g.second); + PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second); + iter->second->Var()->SetPersistable(true); + + PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType())); + + // Get Dtype + auto ele_dtype = iter->second->Var()->GetDataType(); + if (dtype == kDefaultDtype) { + dtype = ele_dtype; + PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype); + } + PADDLE_ENFORCE_EQ(ele_dtype, dtype); + } + + // Create the fused variable name. + if (!result.Has(kFusedVars)) { + result.Set(kFusedVars, new FusedVars); + } + const std::string prefix(kFusedVarNamePrefix); + // The fused_var_name should be unique. 
+ auto fused_var_name = prefix + "GRAD@" + params_grads[0].second; + auto &fused_var_set = result.Get(kFusedVars); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + fused_var_set.insert(fused_var_name); + + InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, + fused_var_name, params_grads); + + return std::move(graph); + } + + template + void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const { + if (graph->Has(attr_name)) { + VLOG(10) << attr_name << " is reset."; + graph->Erase(attr_name); + } + graph->Set(attr_name, new AttrType); + } + + void SetGroupGradsAndParams( + const std::unordered_map &var_nodes, + const ParamsAndGrads ¶ms_grads, + GroupGradsAndParams *group_grads_params) const { + SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params); + SetGroupAccordingToMemorySize(var_nodes, group_grads_params); + SetGroupAccordingToGroupSize(var_nodes, group_grads_params); + } + + void SetGroupAccordingToLayers( + const std::unordered_map &var_nodes, + const ParamsAndGrads ¶ms_grads, + GroupGradsAndParams *group_grads_params) const { + std::unordered_map> layer_params; + + for (size_t i = 0; i < params_grads.size(); ++i) { + auto pos = params_grads[i].first.find_first_of("."); + if (pos == std::string::npos) { + layer_params[std::string(kUnKnow)].emplace_back(i); + } else { + layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i); + } + } + + group_grads_params->reserve(layer_params.size()); + for (size_t i = 0; i < params_grads.size(); ++i) { + auto pos = params_grads[i].first.find_first_of("."); + std::string key = kUnKnow; + if (pos != std::string::npos) { + key = params_grads[i].first.substr(0, pos); + } + auto iter = layer_params.find(key); + if (iter == layer_params.end()) continue; + + group_grads_params->emplace_back(); + auto &local_group_grads_params = group_grads_params->back(); + for (auto &idx : iter->second) { + local_group_grads_params.emplace_back( + 
std::make_pair(params_grads[idx].second, params_grads[idx].first)); + } + layer_params.erase(iter); + } + + VLOG(10) << "SetGroupAccordingToLayers: "; + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &p_g : group_grads_params->at(i)) { + out << "(" << p_g.second << ", " << p_g.first << "), "; + } + VLOG(10) << out.str(); + } + } + + void SetGroupAccordingToMemorySize( + const std::unordered_map &var_nodes, + GroupGradsAndParams *group_grads_params) const { + if (FLAGS_fuse_parameter_memory_size == 0) { + return; + } + size_t group_memory_size = + static_cast(FLAGS_fuse_parameter_memory_size); + GroupGradsAndParams local_group_grads_params; + + size_t j = 0; + while (j < group_grads_params->size()) { + local_group_grads_params.emplace_back(); + auto &group_p_g = local_group_grads_params.back(); + size_t local_group_memory_size = 0; + while (j < group_grads_params->size()) { + std::for_each( + group_grads_params->at(j).begin(), group_grads_params->at(j).end(), + [&local_group_memory_size, + &var_nodes](const std::pair &g_p) { + auto iter = var_nodes.find(g_p.second); + PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", + g_p.second); + auto shape = iter->second->Var()->GetShape(); + size_t size = + framework::SizeOfType(iter->second->Var()->GetDataType()); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + local_group_memory_size += size; + }); + group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), + group_grads_params->at(j).end()); + ++j; + if (local_group_memory_size >= group_memory_size) { + break; + } + } + } + + std::swap(*group_grads_params, local_group_grads_params); + + VLOG(10) << string::Sprintf( + "SetGroupAccordingToMemorySize(memory_size: %d):", + FLAGS_fuse_parameter_memory_size); + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &g_p : 
group_grads_params->at(i)) { + auto iter = var_nodes.find(g_p.second); + PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second); + auto shape = iter->second->Var()->GetShape(); + size_t size = framework::SizeOfType(iter->second->Var()->GetDataType()); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first); + } + VLOG(10) << out.str(); + } + } + + void SetGroupAccordingToGroupSize( + const std::unordered_map &var_nodes, + GroupGradsAndParams *group_grads_params) const { + if (FLAGS_fuse_parameter_groups_size == 1) { + return; + } + size_t group_size = static_cast(FLAGS_fuse_parameter_groups_size); + if (FLAGS_fuse_parameter_groups_size == -1) { + group_size = group_grads_params->size(); + } + PADDLE_ENFORCE_GT(group_size, 1); + size_t groups = (group_grads_params->size() + group_size - 1) / group_size; + GroupGradsAndParams local_group_grads_params; + local_group_grads_params.reserve(groups); + + size_t j = 0; + for (size_t i = 0; i < groups; ++i) { + local_group_grads_params.emplace_back(); + auto &group_p_g = local_group_grads_params.back(); + group_p_g.reserve(group_size); + while (j < group_grads_params->size()) { + group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), + group_grads_params->at(j).end()); + ++j; + if (j % group_size == 0) break; + } + } + std::swap(*group_grads_params, local_group_grads_params); + + VLOG(10) << "SetGroupAccordingToGroupSize(group_size: " << group_size + << "): "; + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &p_g : group_grads_params->at(i)) { + out << "(" << p_g.second << ", " << p_g.first << "), "; + } + VLOG(10) << out.str(); + } + } + + private: + bool IsSupportedVarType(const proto::VarType::Type &type) const { + // Current only support LOD_TENSOR. 
+ return type == proto::VarType::LOD_TENSOR; + } + + void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, + const std::vector &grads_name, + const std::string &fused_var_name, + BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", params_name); + op_desc->SetOutput("Output", grads_name); + op_desc->SetOutput("FusedOutput", {fused_var_name}); + } + + void RecordParamsAndGrads(ir::Node *node, + ParamsAndGrads *params_grads) const { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) return; + + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast(0)); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + VLOG(10) << "Trainable parameter: " << backward_vars[i] + << ", gradient: " << backward_vars[i + 1]; + + params_grads->emplace_back(std::make_pair( + backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/)); + } + } catch (boost::bad_get e) { + } + } + + void InitFusedVarsAndAllocSpaceForVars( + const std::vector &places, + const std::vector &local_scopes, + const std::unordered_map &vars, + const std::string &fused_var_name, + const ParamsAndGrads ¶ms_grads) const { + // Init Gradients and FusedVars + VLOG(10) << "Init FusedVars and Gradients."; + for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) { + auto &scope = *it; + + PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, + "%s has existed in scope.", fused_var_name); + scope->Var(fused_var_name)->GetMutable(); + + for (auto &p_g : params_grads) { + auto iter = vars.find(p_g.second); + PADDLE_ENFORCE(iter != 
vars.end()); + PADDLE_ENFORCE_NOT_NULL(iter->second->Var()); + PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(), + proto::VarType::LOD_TENSOR); + scope->Var(p_g.second)->GetMutable(); + } + } + + std::vector grads_name; + std::vector params_name; + grads_name.reserve(params_grads.size()); + params_name.reserve(params_grads.size()); + for (auto &p_g : params_grads) { + params_name.emplace_back(p_g.first); + grads_name.emplace_back(p_g.second); + } + framework::ProgramDesc program_desc; + AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, + program_desc.MutableBlock(0)); + + // Run Only Once Programs + for (size_t i = 0; i < local_scopes.size(); ++i) { + for (auto &op_desc : program_desc.Block(0).AllOps()) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_scopes[i], places[i]); + } + } + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(alloc_continuous_space_for_grad_pass, + paddle::framework::details::AllocContinuousSpaceForGradPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 0c75e05f861636565ae855ddd534c1082d40d237..0b4d33513506d41a63db8316abaa5cd0458ff352 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::string Name() const override; - bool IsMultiDeviceTransfer() override { return false; }; + bool IsMultiDeviceTransfer() override { return true; }; protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 932d0b4538eb2ec5df97d0bde806c33f825b6f68..5d9db237538599ec9a6887317b61af73f1113b97 100644 --- a/paddle/fluid/framework/details/build_strategy.cc 
+++ b/paddle/fluid/framework/details/build_strategy.cc @@ -46,7 +46,16 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { public: explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) : ir::PassBuilder(), strategy_(strategy) { + // Add a graph viz pass to record a graph. + if (!strategy_.debug_graphviz_path_.empty()) { + auto viz_pass = AppendPass("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + } + if (strategy_.enable_sequential_execution_) { + VLOG(10) << "Add sequential_execution_pass"; AppendPass("sequential_execution_pass"); } @@ -57,6 +66,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add op fusion. if (strategy.fuse_relu_depthwise_conv_) { + VLOG(10) << "Add fuse_relu_depthwise_conv_pass"; AppendPass("fuse_relu_depthwise_conv_pass"); } @@ -68,29 +78,30 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add automatically inplace. if (strategy_.enable_inplace_) { + VLOG(10) << "Add inplace_pass"; AppendPass("inplace_pass"); } + if (strategy.fuse_elewise_add_act_ops_) { + VLOG(10) << "Add fuse_elewise_add_act_pass"; + AppendPass("fuse_elewise_add_act_pass"); + } + + // for single card training, fuse_all_reduce_ops is unnecessary. + // alloc_continuous_space_for_grad_pass should be before of MultiDevPass. + if (strategy.fuse_all_reduce_ops_) { + VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; + AppendPass("alloc_continuous_space_for_grad_pass"); + } + // Add a graph viz pass to record a graph. 
- if (!strategy_.debug_graphviz_path_.empty()) { + if (!strategy.debug_graphviz_path_.empty()) { auto viz_pass = AppendPass("graph_viz_pass"); const std::string graph_path = string::Sprintf( - "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); + "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); viz_pass->Set("graph_viz_path", new std::string(graph_path)); } - if (strategy.fuse_elewise_add_act_ops_) { - auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); - // Add a graph viz pass to record a graph. - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = AppendPass("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); - viz_pass->Set("graph_viz_path", - new std::string(graph_path)); - } - } - CollectiveContext *context = CollectiveContext::GetInstance(); context->endpoints_ = strategy_.trainers_endpoints_; context->trainer_id_ = strategy_.trainer_id_; @@ -108,11 +119,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. if (strategy.memory_optimize_) { - auto memory_optimize_pass = AppendPass("memory_optimize_pass"); + VLOG(10) << "Add memory_optimize_pass"; + AppendPass("memory_optimize_pass"); } AppendMultiDevPass(strategy); + if (strategy.fuse_all_reduce_ops_) { + // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator + // first, if the number is zero, fuse_all_reduce_ops will do nothing. + VLOG(10) << "Add fuse_all_reduce_op_pass"; + AppendPass("fuse_all_reduce_op_pass"); + } + // Add a graph print pass to record a graph with device info. 
if (!strategy_.debug_graphviz_path_.empty()) { auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); @@ -128,28 +147,34 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Verify that the graph is correct for multi-device executor. AppendPass("multi_devices_check_pass"); + if (VLOG_IS_ON(2)) { + AppendPass("all_reduce_deps_pass"); + } + if (SeqOnlyAllReduceOps(strategy)) { + VLOG(10) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } if (strategy_.remove_unnecessary_lock_) { + VLOG(10) << "Add modify_op_lock_and_record_event_pass"; AppendPass("modify_op_lock_and_record_event_pass"); } } // Convert graph to run on multi-devices. void AppendMultiDevPass(const BuildStrategy &strategy) { - ir::Pass *multi_devices_pass; + ir::Pass *multi_devices_pass = nullptr; if (strategy_.is_distribution_) { - VLOG(3) << "multi device parameter server mode"; + VLOG(10) << "Add dist_multi_devices_pass"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - VLOG(3) << "multi devices collective mode with allreduce"; + VLOG(10) << "Add all_reduce_mode_multi_devices_pass"; multi_devices_pass = - AppendPass("allreduce_mode_multi_devices_pass").get(); + AppendPass("all_reduce_mode_multi_devices_pass").get(); } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - VLOG(3) << "multi deivces collective mode with reduce"; + VLOG(10) << "Add reduce_mode_multi_devices_pass"; multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); } else { PADDLE_THROW("Unknown reduce strategy."); @@ -206,9 +231,26 @@ std::unique_ptr BuildStrategy::Apply( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? 
nccl_ctxs : nullptr; - pass->Erase("nccl_ctxs"); - pass->SetNotOwned("nccl_ctxs", nctx); + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); +#endif + } else if (pass->Type() == "fuse_all_reduce_op_pass") { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, + &local_scopes); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); #endif + } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, + &local_scopes); } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; @@ -239,7 +281,7 @@ USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); USE_PASS(reduce_mode_multi_devices_pass); -USE_PASS(allreduce_mode_multi_devices_pass); +USE_PASS(all_reduce_mode_multi_devices_pass); USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); @@ -249,4 +291,6 @@ USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); +USE_PASS(alloc_continuous_space_for_grad_pass); USE_PASS(graph_to_program_pass); +USE_PASS(fuse_all_reduce_op_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 122411641dacde57ef3851f05bc92d86c1f83866..4b599fb914dc7c35a0524ea62ba8d458b8dccf8f 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/ir/pass_builder.h" @@ -75,6 +76,8 @@ struct 
BuildStrategy { bool fuse_elewise_add_act_ops_{false}; + bool fuse_all_reduce_ops_{false}; + bool fuse_relu_depthwise_conv_{false}; bool sync_batch_norm_{false}; diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc deleted file mode 100644 index c9b52b68205ade000e21a3d06b80af86cbe01f34..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/data_balance_op_handle.h" -#include -#include "paddle/fluid/framework/details/container_cast.h" - -namespace paddle { -namespace framework { -namespace details { - -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -DataBalanceOpHandle::DataBalanceOpHandle( - ir::Node *node, const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap *ctxs) - : OpHandleBase(node), local_scopes_(local_scopes), places_(places) { - if (ctxs) { - for (auto &p : places_) { - this->SetDeviceContext(p, ctxs->DevCtx(p)); - } - } -} -#else -DataBalanceOpHandle::DataBalanceOpHandle( - ir::Node *node, const std::vector &local_scopes, - const std::vector &places) - : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} -#endif - -std::string DataBalanceOpHandle::Name() const { return "data balance"; } - -std::vector> DataBalanceOpHandle::GetBalancePlan( - const std::vector &device_sizes) { - int device_num = device_sizes.size(); - int total_size = 0; - int empty_num = 0; - std::vector> size_device_vec; - size_device_vec.reserve(device_num); - for (int i = 0; i < device_num; ++i) { - if (device_sizes[i] == 0) { - ++empty_num; - } - total_size += device_sizes[i]; - size_device_vec.push_back({{device_sizes[i], i}}); - } - std::vector> res; - if (empty_num == 0) { - // No need to do data balance. - return res; - } - if (total_size < device_num) { - // No enough data. 
- PADDLE_THROW_EOF(); - } - std::sort(size_device_vec.begin(), size_device_vec.end(), - [](const std::array &a, const std::array &b) { - return a[0] > b[0]; - }); - int expected_device_size = total_size / device_num; - int src_idx = 0; - for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) { - if (size_device_vec[src_idx][0] <= expected_device_size) { - ++src_idx; - PADDLE_ENFORCE_LT( - src_idx, device_num - empty_num, - "In current srategy an empty tensor should not be copy source."); - } - size_device_vec[src_idx][0] -= expected_device_size; - size_device_vec[dst_idx][0] += expected_device_size; - res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1], - expected_device_size}}); - } - return res; -} - -void DataBalanceOpHandle::RunImpl() { - PADDLE_ENFORCE_GT(places_.size(), 1UL, - "Data balance can only be enabled when the number of " - "places to run larger than 1."); - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); - int data_num = in_var_handles.size() / places_.size(); - WaitInputVarGenerated(); - std::vector> lod_tensors(data_num); - std::vector device_sizes; - for (int i = 0; i < static_cast(in_var_handles.size()); ++i) { - PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), - "The name of input and output should be equal."); - int place_idx = i / data_num; - int data_idx = i % data_num; - auto *local_scope = - local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get(); - auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name()); - PADDLE_ENFORCE(tensor_var->IsType()); - auto *tensor = tensor_var->GetMutable(); - lod_tensors[data_idx].push_back(tensor); - int ins_size = - tensor->lod().empty() ? 
tensor->dims()[0] : tensor->NumElements(); - if (data_idx == 0) { - device_sizes.emplace_back(ins_size); - } else { - PADDLE_ENFORCE_EQ( - ins_size, device_sizes.at(place_idx), - "All data on the same device shall have the same batch size."); - } - } - const auto &balance_plan = GetBalancePlan(device_sizes); - - for (const auto &trans : balance_plan) { - for (int data_idx = 0; data_idx < data_num; ++data_idx) { - LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]]; - LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]]; - int trans_ins_size = trans[2]; - LoD src_lod = src_tensor->lod(); - int src_ins_size = - src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements(); - int cut_point = src_ins_size - trans_ins_size; - if (!src_lod.empty()) { - for (auto &level : src_lod) { - cut_point = level[cut_point]; - } - } - TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]), - dst_tensor->place(), dst_tensor); - src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point)); - if (!src_lod.empty()) { - dst_tensor->set_lod(SliceInLevel( - src_lod, 0, src_ins_size - trans_ins_size, src_ins_size)); - src_tensor->set_lod( - SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size)); - } - } - } -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h deleted file mode 100644 index 2db18a1a7203f85aac6338576f2e68c7b37d7c69..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/data_balance_op_handle.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#include "paddle/fluid/platform/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { -namespace details { - -struct DataBalanceOpHandle : public OpHandleBase { - public: -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap *ctxs); -#else - DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, - const std::vector &places); -#endif - - std::string Name() const override; - - bool IsMultiDeviceTransfer() override { return false; }; - - protected: - void RunImpl() override; - - private: - // std::vector<(src_dev_id, dst_dev_id, trans_size)> - std::vector> GetBalancePlan( - const std::vector &batch_size_per_device); - - const std::vector local_scopes_; - const std::vector places_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index bbf81e1b8e49cae133858f7aa121701fb0f5456f..232d82a5da596a78d2999c4a4c4f7dda0c7cad7e 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place 
&place) { } } +bool FetchOpHandle::IsMultiDeviceTransfer() { return true; } + std::string FetchOpHandle::Name() const { return "Fetch"; } } // namespace details diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h index 6ce42f92d7f1e81eeafd1eb5c28ce3564a5ffebc..dbb7f4f6582f6e0f0b9b5702533852d12da1051c 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.h +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase { std::string Name() const override; + bool IsMultiDeviceTransfer() override; + protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f226491c9f5355d0418d672069b19a78ef53c595 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc @@ -0,0 +1,195 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseAllReduceOpPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + ir::Graph &result = *graph; + + auto &places = Get>(kPlaces); + auto &local_scopes = Get>(kLocalScopes); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto *nccl_ctxs = &Get(kNCCLCtxs); +#endif + + std::unordered_set grads; + auto ¶ms_grads = result.Get(kParamsAndGrads); + size_t num_of_all_reduce = params_grads.size(); + grads.reserve(num_of_all_reduce); + for (auto p_g : params_grads) { + grads.insert(p_g.second); + } + + size_t num_place = places.size(); + std::unordered_map all_reduce_ops; + all_reduce_ops.reserve(grads.size()); + for (auto &node : result.Nodes()) { + if (node->IsOp()) { + PADDLE_ENFORCE(node->IsWrappedBy()); + auto *all_reduce_op_handle = + dynamic_cast(&node->Wrapper()); + if (all_reduce_op_handle) { + auto inputs = DynamicCast(all_reduce_op_handle->Inputs()); + PADDLE_ENFORCE_EQ(inputs.size(), num_place); + // The inputs' name should be the same. 
+ auto &grad_name = inputs[0]->name(); + for (size_t i = 1; i < inputs.size(); ++i) { + PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name, + "The input name should be the same."); + } + PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast(0)); + all_reduce_ops.emplace(grad_name, node); + } + } + } + + VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size(); + if (all_reduce_ops.size() == 0) { + return std::move(graph); + } + + PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(), + "The number of all_reduce OpHandle is not equal to the " + "number of grads. Maybe some gradients are sparse type, " + "it is not supported currently."); + VLOG(10) << "Insert fused_all_reduce"; + + auto &group_grads_params = + graph->Get(kGroupGradsAndParams); + + for (auto &group_g_p : group_grads_params) { + size_t group_size = group_g_p.size(); + PADDLE_ENFORCE_GT(group_size, static_cast(0)); + std::vector group_all_reduce_ops; + group_all_reduce_ops.reserve(group_size); + for (auto &g_p : group_g_p) { + group_all_reduce_ops.emplace_back(all_reduce_ops.at(g_p.first)); + } +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + InsertFusedAllReduce(places, local_scopes, group_size, + group_all_reduce_ops, nccl_ctxs, &result); +#else + InsertFusedAllReduce(places, local_scopes, group_size, + group_all_reduce_ops, &result); +#endif + } + return std::move(graph); + } + + void InsertFusedAllReduce(const std::vector &places, + const std::vector &local_scopes, + const size_t num_of_all_reduce, + const std::vector &all_reduce_ops, +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const platform::NCCLContextMap *nccl_ctxs, +#endif + ir::Graph *result) const { + std::vector inputs; + std::vector outputs; + for (auto &op : all_reduce_ops) { + auto &op_handle = op->Wrapper(); + inputs.insert(inputs.end(), op_handle.Inputs().begin(), + op_handle.Inputs().end()); + // Remove output + for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(), + [&op_handle](VarHandleBase *var_handle) { + 
var_handle->RemoveOutput(&op_handle, op_handle.Node()); + }); + + outputs.insert(outputs.end(), op_handle.Outputs().begin(), + op_handle.Outputs().end()); + // Remove Input + for_each( + op_handle.Outputs().begin(), op_handle.Outputs().end(), + [](VarHandleBase *var_handle) { var_handle->ClearGeneratedOp(); }); + + result->RemoveNode(op_handle.Node()); + } + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, + local_scopes, nccl_ctxs, result); +#else + CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, + local_scopes, result); +#endif + } + + private: + void CreateFusedAllReduceOp(const std::vector &inputs, + const std::vector &outputs, + const size_t num_of_all_reduce, + const std::vector &places, + const std::vector &local_scopes, +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const platform::NCCLContextMap *nccl_ctxs, +#endif + ir::Graph *result) const { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto *op_handle = new FusedAllReduceOpHandle( + result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), + local_scopes, places, num_of_all_reduce, nccl_ctxs); +#else + auto *op_handle = new FusedAllReduceOpHandle( + result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), + local_scopes, places, num_of_all_reduce); +#endif + + for (auto in : inputs) { + op_handle->AddInput(in); + } + + for (auto out : outputs) { + op_handle->AddOutput(out); + } + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (!nccl_ctxs) { + SetCommunicationContext(places, op_handle); + } +#else + SetCommunicationContext(places, op_handle); +#endif + } + + void SetCommunicationContext(const std::vector &places, + FusedAllReduceOpHandle *op_handle) const { + for (size_t i = 0; i < places.size(); ++i) { + op_handle->SetDeviceContext( + places[i], platform::DeviceContextPool::Instance().Get(places[i])); + } + } +}; + +} // namespace details +} // namespace 
framework +} // namespace paddle + +REGISTER_PASS(fuse_all_reduce_op_pass, + paddle::framework::details::FuseAllReduceOpPass); diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc deleted file mode 100644 index 14292c0a5d06aa3ff12b46b5768b136fa925752d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/fuse_vars_op_handle.h" - -namespace paddle { -namespace framework { -namespace details { - -void FuseVarsOpHandle::RunImpl() { - WaitInputVarGenerated(place_); - - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL); - PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); - - auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); - - auto out_var_handle = out_var_handles[0]; - auto out_var = scope->Var(out_var_handle->name()); - - auto out_tensor = out_var->GetMutable(); - out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_); - - int64_t s = 0; - for (size_t i = 1; i < out_var_handles.size(); ++i) { - auto out_name = out_var_handles[i]->name(); - auto out_t = scope->Var(out_name)->GetMutable(); - auto numel = this->inputs_numel_.at(out_name); - out_t->ShareDataWith(out_tensor->Slice(s, s + numel)); - s += numel; - } - this->RunAndRecordEvent([] {}); -} - -std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h deleted file mode 100644 index b40b01df36479543e8b2779762210ae144d7d9be..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/details/container_cast.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -namespace details { - -struct FuseVarsOpHandle : public OpHandleBase { - public: - FuseVarsOpHandle(ir::Node *node, Scope *local_scope, - const platform::Place &place, - const std::unordered_map &inputs_numel, - const proto::VarType::Type var_type) - : OpHandleBase(node), - local_scope_(local_scope), - place_(place), - inputs_numel_(inputs_numel), - type_(var_type) { - total_numel_ = 0; - for (auto in_numel : inputs_numel) { - PADDLE_ENFORCE_GT(in_numel.second, 0); - total_numel_ += in_numel.second; - } - } - - std::string Name() const override; - - bool IsMultiDeviceTransfer() override { return false; }; - - protected: - void RunImpl() override; - - private: - Scope *local_scope_; - const platform::Place place_; - const std::unordered_map inputs_numel_; - const proto::VarType::Type type_; - int64_t total_numel_; -}; -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..644cd4e15083519d6c685ae3e6a0737692018a07 --- /dev/null +++ 
b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -0,0 +1,249 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" +#include +#include +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool(skip_fused_all_reduce_check, false, ""); +namespace paddle { +namespace framework { +namespace details { + +typedef std::vector>> + GradientAndLoDTensor; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +FusedAllReduceOpHandle::FusedAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, const size_t num_of_all_reduce, + const platform::NCCLContextMap *ctxs) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + num_of_all_reduce_(num_of_all_reduce), + nccl_ctxs_(ctxs) { + if (nccl_ctxs_) { + for (auto &p : places_) { + this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); + } + } + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} +#else + +FusedAllReduceOpHandle::FusedAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, const size_t num_of_all_reduce) + : OpHandleBase(node), + local_scopes_(local_scopes), + 
places_(places), + num_of_all_reduce_(num_of_all_reduce) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} + +#endif + +void FusedAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + VLOG(4) << this->DebugString(); + + WaitInputVarGenerated(); + // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... + // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + + size_t place_num = places_.size(); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), place_num * num_of_all_reduce_, + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + GradientAndLoDTensor grads_tensor; + grads_tensor.resize(place_num); + + int64_t numel = -1; + auto dtype = static_cast(0); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + auto &g_tensor = grads_tensor.at(scope_idx); + g_tensor.reserve(num_of_all_reduce_); + + GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor); + + int64_t element_num = 0; + framework::proto::VarType::Type ele_dtype = + static_cast(0); + GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num); + + if (numel == -1) { + numel = element_num; + } + if (dtype == static_cast(0)) { + dtype = ele_dtype; + PADDLE_ENFORCE_NE(ele_dtype, + static_cast(0)); + } + PADDLE_ENFORCE_EQ(ele_dtype, dtype); + + // Check whether the address space is contiguous. 
+ std::sort( + g_tensor.begin(), g_tensor.end(), + [](const std::pair &grad1, + const std::pair &grad2) -> bool { + return grad1.second->data() < grad2.second->data(); + }); + + for (size_t k = 1; k < g_tensor.size(); ++k) { + const void *cur_address = g_tensor.at(k - 1).second->data(); + int64_t len = g_tensor.at(k - 1).second->numel(); + auto offset = len * framework::SizeOfType(dtype); + void *infer_next_address = reinterpret_cast( + reinterpret_cast(cur_address) + offset); + const void *next_address = g_tensor.at(k).second->data(); + + VLOG(10) << string::Sprintf( + "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer " + "input[%d] address: 0X%02x. The offset: %d", + k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k, + next_address, k, infer_next_address, offset); + PADDLE_ENFORCE_EQ(infer_next_address, next_address, + "The address is not consistent."); + } + } + + if (!FLAGS_skip_fused_all_reduce_check) { + for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { + for (size_t j = 1; j < num_of_all_reduce_; ++j) { + PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first, + grads_tensor.at(scope_idx).at(j).first); + } + } + } + + std::vector lod_tensor_data; + for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { + auto data = grads_tensor.at(scope_idx).at(0).second->data(); + lod_tensor_data.emplace_back(data); + } + + if (platform::is_gpu_place(places_[0])) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int nccl_dtype = platform::ToNCCLDataType(dtype); + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + void *buffer = const_cast(lod_tensor_data.at(i)); + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + 
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(nccl_dtype), + ncclSum, comm, stream)); + }); + } + + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } + }); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif + } else { + // Special handle CPU only Operator's gradient. Like CRF + auto grad_name = grads_tensor.at(0).at(0).first; + auto &trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(grad_name) + ->GetMutable(); + + // Reduce All data to trg in CPU + ReduceBufferData func(lod_tensor_data, trg.data(), numel); + VisitDataType(trg.type(), func); + + for (size_t i = 1; i < local_scopes_.size(); ++i) { + auto &scope = + *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); + auto &p = places_[i]; + auto *var = scope.FindVar(grad_name); + auto *dev_ctx = dev_ctxes_.at(p); + size_t size = numel * SizeOfType(trg.type()); + RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] { + auto dst_ptr = var->GetMutable()->data(); + platform::CPUPlace cpu_place; + memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data(), size); + }); + } + } +} + +void FusedAllReduceOpHandle::GetGradLoDTensor( + const size_t &scope_idx, const std::vector &in_var_handles, + const std::vector &out_var_handles, + std::vector> *grad_tensor) const { + auto *local_scope = + local_scopes_.at(scope_idx)->FindVar(kLocalExecScopeName)->Get(); + size_t place_num = places_.size(); + + for (size_t j = 0; j < in_var_handles.size(); j += place_num) { + auto var_name = in_var_handles[j]->name(); + PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name()); + auto &lod_tensor = local_scope->FindVar(var_name)->Get(); + PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx)); + grad_tensor->emplace_back(std::make_pair(var_name, 
&lod_tensor)); + } +} + +void FusedAllReduceOpHandle::GetDTypeAndNumel( + const std::vector> &grad_tensor, + proto::VarType::Type *dtype, int64_t *numel) const { + *numel = 0; + for (size_t i = 0; i < grad_tensor.size(); ++i) { + // Get element number + int64_t len = grad_tensor.at(i).second->numel(); + PADDLE_ENFORCE_GT(len, 0); + *numel += len; + + // Get dtype + auto ele_type = grad_tensor.at(i).second->type(); + if (i == 0) { + *dtype = ele_type; + } + PADDLE_ENFORCE_EQ(ele_type, *dtype); + } +} + +std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..79772c61f8c8b7abe3cf26dd8a94c2acdc0872a0 --- /dev/null +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct FusedAllReduceOpHandle : public OpHandleBase { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + FusedAllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const size_t num_of_all_reduce, + const platform::NCCLContextMap *ctxs); +#else + FusedAllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const size_t num_of_all_reduce); +#endif + std::string Name() const override; + + // Delay and buffer nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. + bool IsMultiDeviceTransfer() override { return true; }; + + protected: + void RunImpl() override; + + private: + std::vector local_scopes_; + std::vector places_; + size_t num_of_all_reduce_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const platform::NCCLContextMap *nccl_ctxs_; +#endif + + // Check the dtype of the input + void GetDTypeAndNumel( + const std::vector> &g_tensor, + proto::VarType::Type *dtype, int64_t *total_num) const; + + // Get gradient's name and LoDTensor + void GetGradLoDTensor(const size_t &scope_idx, + const std::vector &in_var_handles, + const std::vector &out_var_handles, + std::vector> + *grad_tensor) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 478d2ffbcf2988487893984284d4597f018f0ca0..125dbf746c3880e142af4d4bffd3ccda8654c0a1 100644 --- 
a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -11,18 +11,19 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include #include +#include #include +#include +#include #include #include - #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/data_balance_op_handle.h" #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" @@ -134,21 +135,26 @@ void AddOutputToLeafOps(ir::Graph *graph) { } } // namespace +void MultiDevSSAGraphBuilderBase::CheckGraph(const ir::Graph &graph) const {} + void MultiDevSSAGraphBuilderBase::Init() const { all_vars_.clear(); loss_var_name_ = Get(kLossVarName); + VLOG(10) << "Init MultiDevSSAGraphBuilder, loss name: " << loss_var_name_; places_ = Get>(kPlaces); local_scopes_ = Get>(kLocalScopes); strategy_ = Get(kStrategy); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - nccl_ctxs_ = &Get("nccl_ctxs"); + nccl_ctxs_ = &Get(kNCCLCtxs); #endif + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); } std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( std::unique_ptr graph) const { Init(); + CheckGraph(*graph); std::vector sorted_ops = SortOperations(*graph); auto nodes = graph->ReleaseNodes(); @@ -166,7 +172,6 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( result.Set(kGraphOps, new GraphOps); 
bool is_forwarding = true; - bool insert_collection_ops = NeedCollectiveOps(); for (ir::Node *node : sorted_ops) { if (DealWithSpecialOp(&result, node)) { @@ -185,8 +190,8 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - // Insert collection ops - if (!is_forwarding && insert_collection_ops) { + // Insert collective ops if nranks > 1 + if (!is_forwarding && Get(kNRanks) > 1) { try { bool is_bk_op = static_cast(boost::get(node->Op()->GetAttr( @@ -200,13 +205,13 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( boost::get>(node->Op()->GetNullableAttr( OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - for (size_t i = 0; i < backward_vars.size(); i += 2) { auto &p_name = backward_vars[i]; auto &g_name = backward_vars[i + 1]; VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - InsertCollectiveOp(&result, p_name, g_name); + if (NeedCollectiveForGrad(g_name, sorted_ops)) { + InsertCollectiveOp(&result, p_name, g_name); + } } } catch (boost::bad_get e) { } @@ -226,6 +231,7 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( * Only variables should be the leaves of graph. 
*/ AddOutputToLeafOps(&result); + result.Erase(kGraphOps); return graph; } @@ -258,6 +264,11 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( } } +bool MultiDevSSAGraphBuilderBase::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + return false; +} + std::vector MultiDevSSAGraphBuilderBase::SortOperations( const ir::Graph &graph) const { return ir::TopologySortOperations(graph); @@ -271,8 +282,20 @@ bool MultiDevSSAGraphBuilderBase::UseGPU() const { return use_gpu; } -bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { - return Get(kNRanks) > 1; +bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad( + const std::string &grad_name, std::vector ops) const { + // if we have allreduce_op for current gradient variable in the graph, + // then we don't need to add allreduce_op_handle for this gradient + // NOTE: This is for the case that all gradients should add collective ops + for (auto *node : ops) { + if (node->Op()->Type() != "allreduce") continue; + for (auto in_name : node->Op()->InputArgumentNames()) { + if (in_name == grad_name) { + return false; + } + } + } + return true; } void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, @@ -496,20 +519,17 @@ VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, } bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { - return boost::get( + return !loss_var_name_.empty() && node->Op() && + boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss)) && - !loss_var_name_.empty(); // If loss_var is empty. 
This is test mode + static_cast(OpRole::kLoss)); } bool MultiDevSSAGraphBuilderBase::IsSparseGradient( const std::string &og) const { PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; - } - return false; + return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS; } void AllReduceSSAGraphBuilder::InsertCollectiveOp( @@ -995,7 +1015,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, paddle::framework::details::ReduceSSAGraphBuilder); REGISTER_MULTI_DEVICES_PASS( - allreduce_mode_multi_devices_pass, + all_reduce_mode_multi_devices_pass, paddle::framework::details::AllReduceSSAGraphBuilder); REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, paddle::framework::details::DistSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 6d4386538ea7d0cc318647c92282af9d598fa699..0ee3a0606291797b8c238ccbface591e30bfe502 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -14,7 +14,10 @@ #pragma once +#include #include +#include +#include #include #include @@ -31,12 +34,6 @@ namespace framework { class Scope; namespace details { -constexpr char kLossVarName[] = "loss_var_name"; -constexpr char kPlaces[] = "places"; -constexpr char kLocalScopes[] = "local_scopes"; -constexpr char kStrategy[] = "strategy"; -constexpr char kNRanks[] = "nranks"; - class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: std::unique_ptr ApplyImpl( @@ -44,18 +41,21 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { virtual void Init() const; + virtual void CheckGraph(const ir::Graph &graph) const; + virtual std::vector SortOperations(const ir::Graph &graph) const; virtual void InsertCollectiveOp(ir::Graph *result, const 
std::string &p_name, const std::string &g_name) const = 0; - virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; virtual void InsertPostprocessOps(ir::Graph *result) const = 0; bool UseGPU() const; - bool NeedCollectiveOps() const; + bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const; bool IsScaleLossOp(ir::Node *node) const; @@ -109,10 +109,6 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const; - virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { - return false; - } - virtual void InsertPostprocessOps(ir::Graph *result) const {} }; diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 9afbb91005c9c3a9d2e185f4dfa901ebf812ee19..ab5e099023382c4e28a9613d321ea8dc182d3534 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -16,6 +16,9 @@ #include #include +#include +#include +#include #include #include "paddle/fluid/framework/details/op_handle_base.h" @@ -44,6 +47,26 @@ const char kGraphVars[] = "vars"; typedef std::unordered_set GraphDepVars; const char kGraphDepVars[] = "dep_vars"; +constexpr char kNCCLCtxs[] = "nccl_ctxs"; + +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kPlaces[] = "places"; +constexpr char kLocalScopes[] = "local_scopes"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + +typedef std::unordered_set FusedVars; +constexpr char kFusedVars[] = "fused_vars"; + +typedef std::vector> ParamsAndGrads; +constexpr char kParamsAndGrads[] = "params_grads"; + +typedef std::vector>> + GroupGradsAndParams; +constexpr char kGroupGradsAndParams[] = "group_grads_params"; + +constexpr 
char kFusedVarNamePrefix[] = "@FUSEDVAR@"; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4822627ac3b65972f41d9a23d9fe3dba3de3f97d..158da6f606f3f5a7062a4aaed7cf7e3fe71c817a 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/op_handle_base.h" #include +#include namespace paddle { namespace framework { @@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_cuda) { + if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); } + if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) { + for (auto &out_var : outputs_) { + auto *out_var_handle = dynamic_cast(out_var); + if (out_var_handle) { + int dev_id = + boost::get(out_var_handle->place()).device; + out_var_handle->SetGenerateEvent(events_[dev_id]); + } + } + } else { + PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL, + "%s should have only one dev_ctx.", Name()); + auto &place = dev_ctxes_.begin()->first; + int dev_id = boost::get(place).device; + for (auto &out_var : outputs_) { + auto *out_var_handle = dynamic_cast(out_var); + if (out_var_handle) { + PADDLE_ENFORCE( + platform::is_same_place(place, out_var_handle->place()), + "The place of input(%s) is not consistent with the " + "place of current op(%s).", + out_var_handle->Name(), Name()); + out_var_handle->SetGenerateEvent(events_[dev_id]); + } + } + } } #else + PADDLE_ENFORCE(!use_cuda); #endif @@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) { void 
OpHandleBase::WaitInputVarGenerated() { for (auto in_var : inputs_) { if (NeedWait(in_var)) { - for (auto &pair : dev_ctxes_) { - in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second); + // Dummy Variable is used to represent dependencies between operators, so + // there doesn't add event for it. + auto *in_var_handle = dynamic_cast(in_var); + if (in_var_handle) { + auto &place = in_var_handle->place(); + if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA + auto stream = + static_cast(dev_ctxes_.at(place)) + ->stream(); + PADDLE_ENFORCE( + cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#else + PADDLE_THROW("Doesn't compile the GPU."); +#endif + } + // There are nothing to do when the place is CPUPlace. } } } } void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { - for (auto *in : inputs_) { - if (NeedWait(in)) { - in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place)); + for (auto in_var : inputs_) { + if (NeedWait(in_var)) { + // Dummy Variable is used to represent dependencies between operators, so + // there doesn't add event for it. + auto *in_var_handle = dynamic_cast(in_var); + if (in_var_handle) { + if (platform::is_gpu_place(in_var_handle->place())) { +#ifdef PADDLE_WITH_CUDA + auto stream = static_cast( + dev_ctxes_.at(in_var_handle->place())) + ->stream(); + PADDLE_ENFORCE( + cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#else + PADDLE_THROW("Doesn't compile the GPU."); +#endif + } + // There are nothing to do when the place is CPUPlace. 
+ } } } } diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 2e5256fbd49a3f8c72840cd55dada4301cb04eb9..0de8e436518ea353a185087b0e4668b5d200c966 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -53,6 +53,31 @@ struct ReduceLoDTensor { } }; +struct ReduceBufferData { + const std::vector &src_data_; + void *dst_data_; + int64_t numel_; + + ReduceBufferData(const std::vector &src, void *dst, + int64_t numel) + : src_data_(src), dst_data_(dst), numel_(numel) {} + + template + void apply() const { + T *dst_data = reinterpret_cast(dst_data_); + for (size_t i = 0; i < src_data_.size(); ++i) { + auto srd_data = reinterpret_cast(src_data_[i]); + VLOG(10) << "dst: " << dst_data_ << ", " << srd_data; + if (srd_data == dst_data_) { + continue; + } + + std::transform(srd_data, srd_data + numel_, dst_data, dst_data, + [](T a, T b) -> T { return a + b; }); + } + } +}; + inline void GatherLocalSelectedRows( const std::vector &src_selecte_rows_, const std::vector &in_places, diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 9ba295a2b06a5ee9c3069e95fa688595fe72d6fd..c4254bbadfa17682f437f46f02adc9c884d24304 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -27,62 +26,49 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( : graph_(graph), pool_(strategy.num_threads_ >= 2 ? 
new ::ThreadPool(strategy.num_threads_) : nullptr), + prepare_pool_(1), local_scopes_(local_scopes), places_(places), fetch_ctxs_(places), - running_ops_(0), - strategy_(strategy) {} + strategy_(strategy) { + PrepareOpDeps(); + CopyOpDeps(); +} FeedFetchList ThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { std::unique_ptr event( new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare")); - std::unordered_map pending_ops; - std::unordered_set pending_vars; - auto ready_vars = std::make_shared>(); - std::unordered_set ready_ops; + std::unique_ptr op_deps = op_deps_futures_.get(); + CopyOpDeps(); + VLOG(10) << "ThreadedSSAGraphExecutor::Run"; + std::shared_ptr> ready_vars( + new BlockingQueue); + auto &pending_ops = op_deps->pending_ops_; + auto &pending_vars = op_deps->pending_vars_; + auto &ready_ops = op_deps->ready_ops_; + // For ops (e.g. nccl_all_reduce) that need to coordinate multiple // streams from multiple GPUs, it's faster to buffer them and schedule // together since we currently cannot overlap computation and memcpy streams. // Should revisit it if overlapping is available. std::unordered_set delayed_ops; - // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->Get(details::kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - InsertPendingVar(&pending_vars, ready_vars.get(), version_pair); - } - } - } - for (auto &var : graph_->Get(details::kGraphDepVars)) { - InsertPendingVar(&pending_vars, ready_vars.get(), var); - } - - for (auto &op : ir::FilterByNodeWrapper(*graph_)) { - if (op->Inputs().empty()) { // Special case, Op has no input. - ready_ops.insert(op); - } else { - InsertPendingOp(&pending_ops, op); - } - } - // Step 2. 
Insert FetchOps std::vector fetch_ops; std::unordered_set fetch_dependencies; FeedFetchList fetch_data(fetch_tensors.size()); - InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, - &pending_vars, ready_vars.get(), &fetch_data); + InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops, + &pending_ops, &pending_vars, &fetch_data); auto run_all_ops = [&](std::unordered_set &set) { for (auto *op : set) { - running_ops_++; RunOp(ready_vars, op); } set.clear(); }; - + auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); }; // Clean run context run_op_futures_.clear(); exception_holder_.Clear(); @@ -91,19 +77,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( while (!pending_vars.empty()) { // 1. Run All Ready ops // Keep loop until all vars are ready. - // - // NOTE: DelayedOps have a lower priority. It will be scheduled after all - // ready_ops have been performed. - if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) { - run_all_ops(delayed_ops); - } else { - run_all_ops(ready_ops); - } + run_all_ops(ready_ops); // 2. Find ready variable bool timeout; auto cur_ready_vars = ready_vars->PopAll(1, &timeout); - if (timeout) { if (exception_holder_.IsCaught()) { for (auto &run_op_future : run_op_futures_) { @@ -115,6 +93,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( continue; } } + // 3. Remove the dependency of ready_var. // Find the ready_ops after the ready_var. 
for (auto ready_var : cur_ready_vars) { @@ -123,11 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto &deps = pending_ops[op]; --deps; if (deps == 0) { - if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) { - delayed_ops.insert(op); - } else { - ready_ops.insert(op); - } + run_all_op(op); } } } @@ -143,16 +118,17 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, std::vector *fetch_ops, std::unordered_set *fetch_dependencies, + std::unordered_set *ready_ops, std::unordered_map *pending_ops, std::unordered_set *pending_vars, - BlockingQueue *ready_vars, FeedFetchList *fetch_data) { + FeedFetchList *fetch_data) { std::unordered_map> fetched_vars; - + std::unordered_set local_ready_vars; for (auto &fetch_var_name : fetch_tensors) { for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { - fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); + fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin()); } } } @@ -161,8 +137,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( auto &var_name = fetch_tensors[i]; auto fetched_var_it = fetched_vars.find(var_name); PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), - "Cannot find fetched variable.(Perhaps the main_program " - "is not set to ParallelExecutor)"); + "Cannot find fetched variable(%s).(Perhaps the main_program " + "is not set to ParallelExecutor)", + var_name); auto &vars = fetched_var_it->second; @@ -184,9 +161,23 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( auto *fetch_dummy = new DummyVarHandle(fetch_var); op->AddOutput(fetch_dummy); fetch_dependencies->emplace(fetch_dummy); - this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy); - this->InsertPendingOp(pending_ops, op); + + this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy); + + size_t wait_input_num = 0; + std::unordered_set input_set(vars.begin(), vars.end()); + for (auto *var : input_set) { 
+ if (pending_vars->count(var)) { + ++wait_input_num; + } + } + if (wait_input_num) { + pending_ops->insert({op, wait_input_num}); + } else { + ready_ops->insert(static_cast(op)); + } } + PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0); } void ThreadedSSAGraphExecutor::InsertPendingOp( @@ -197,11 +188,63 @@ void ThreadedSSAGraphExecutor::InsertPendingOp( void ThreadedSSAGraphExecutor::InsertPendingVar( std::unordered_set *pending_vars, - BlockingQueue *ready_vars, VarHandleBase *var) const { + std::unordered_set *ready_vars, VarHandleBase *var) const { pending_vars->insert(var); if (var->GeneratedOp() == nullptr) { - ready_vars->Push(var); + ready_vars->insert(var); + } +} + +void ThreadedSSAGraphExecutor::PrepareOpDeps() { + op_deps_.reset(new OpDependentData()); + std::unordered_map &pending_ops = + op_deps_->pending_ops_; + std::unordered_set &pending_vars = op_deps_->pending_vars_; + std::unordered_set &ready_ops = op_deps_->ready_ops_; + std::unordered_set ready_vars; + + // Transform SSAGraph to pending_ops & pending_vars + for (auto &var_map : graph_->Get(details::kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + InsertPendingVar(&pending_vars, &ready_vars, version_pair); + } + } + } + for (auto &var : graph_->Get(details::kGraphDepVars)) { + InsertPendingVar(&pending_vars, &ready_vars, var); + } + + for (auto &op : ir::FilterByNodeWrapper(*graph_)) { + if (op->Inputs().empty()) { // Special case, Op has no input. 
+ ready_ops.insert(op); + } else { + InsertPendingOp(&pending_ops, op); + } } + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = pending_ops[op]; + --deps; + if (deps == 0) { + ready_ops.insert(op); + } + } + } +} + +void ThreadedSSAGraphExecutor::CopyOpDeps() { + op_deps_futures_ = prepare_pool_.enqueue([&] { + auto *op_deps = new OpDependentData(); + op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(), + op_deps_->pending_ops_.end()); + op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(), + op_deps_->pending_vars_.end()); + op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(), + op_deps_->ready_ops_.end()); + return std::unique_ptr(op_deps); + }); } void ThreadedSSAGraphExecutor::RunOp( @@ -216,7 +259,6 @@ void ThreadedSSAGraphExecutor::RunOp( op->Run(strategy_.use_cuda_); } VLOG(10) << op << " " << op->Name() << " Done "; - running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << " Signal posted"; } catch (...) 
{ diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 0867f6210480ec405e7cc4ea42c74b750133ea4e..b9bccba8fa2fa13d99a9a39a5135106101daa903 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -15,18 +15,20 @@ #pragma once #include +#include #include +#include #include +#include #include #include #include - -#include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" @@ -36,6 +38,12 @@ class Scope; namespace details { +struct OpDependentData { + std::unordered_map pending_ops_; + std::unordered_set pending_vars_; + std::unordered_set ready_ops_; +}; + class ThreadedSSAGraphExecutor : public SSAGraphExecutor { public: ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, @@ -57,29 +65,35 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { private: ir::Graph *graph_; std::unique_ptr<::ThreadPool> pool_; + ::ThreadPool prepare_pool_; std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; ExceptionHolder exception_holder_; - std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, OpHandleBase *op_instance) const; void InsertPendingVar(std::unordered_set *pending_vars, - BlockingQueue *ready_vars, + std::unordered_set *ready_vars, VarHandleBase *var) const; void InsertFetchOps(const std::vector &fetch_tensors, std::vector *fetch_ops, std::unordered_set *fetch_dependencies, + std::unordered_set *ready_ops, 
std::unordered_map *pending_ops, std::unordered_set *pending_vars, - BlockingQueue *ready_vars, FeedFetchList *fetch_data); + void PrepareOpDeps(); + void CopyOpDeps(); + private: + std::future> op_deps_futures_; + ExecutionStrategy strategy_; + std::unique_ptr op_deps_; // use std::list because clear(), push_back, and for_each are O(1) std::list> run_op_futures_; }; diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 8321c32f8b1d73bf5e6080b4b314abc9fd20536d..93060ef2593cbc032a382b617f9690e392a15b63 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -43,6 +43,7 @@ struct VarHandleBase { virtual ~VarHandleBase(); virtual std::string DebugString() const = 0; + virtual const std::string& Name() const = 0; void AddInput(OpHandleBase* in, ir::Node* node) { node_->inputs.clear(); @@ -95,8 +96,6 @@ struct VarHandleBase { // // NOTE: runtime variables have place. struct VarHandle : public VarHandleBase { - explicit VarHandle(ir::Node* node) : VarHandleBase(node) {} - virtual ~VarHandle(); std::string DebugString() const override; @@ -109,6 +108,20 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} +#ifdef PADDLE_WITH_CUDA + bool HasEvent() { return has_event_; } + + const cudaEvent_t& GetEvent() { + PADDLE_ENFORCE(HasEvent(), "The event is not set."); + return event_; + } + + void SetGenerateEvent(const cudaEvent_t& event) { + has_event_ = true; + event_ = event; + } +#endif + // version field currently is not used, however, just store the version to // debug easily. private: @@ -116,6 +129,11 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; +#ifdef PADDLE_WITH_CUDA + // Only when this event is triggered, var is generated. 
+ cudaEvent_t event_; + bool has_event_{false}; +#endif public: bool IsTheSameVar(const VarHandle& o) const { @@ -125,6 +143,7 @@ struct VarHandle : public VarHandleBase { size_t version() const { return version_; } size_t scope_idx() const { return scope_idx_; } + const std::string& Name() const override { return name_; } const std::string& name() const { return name_; } const platform::Place& place() const { return place_; } }; @@ -136,6 +155,10 @@ struct DummyVarHandle : public VarHandleBase { virtual ~DummyVarHandle(); std::string DebugString() const override; + + public: + const std::string& Name() const override { return name_; } + std::string name_{"DummyVar"}; }; } // namespace details diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_pass.cc index edfaf47f018a61d72aa3764185f2c185722b553f..ed80f9cae347cfb2bf23859daea2f1f47dba599b 100644 --- a/paddle/fluid/framework/ir/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc @@ -224,8 +224,8 @@ std::unique_ptr CPUQuantizePass::ApplyImpl( PADDLE_ENFORCE(param_scope()); + QuantizeConv(graph.get(), false /* with_residual_data */); QuantizeConv(graph.get(), true /* with_residual_data */); - QuantizeConv(graph.get()); QuantizePool(graph.get()); return graph; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index b653e5a521eeb81d1ac3cb5cca1dc86025837ecd..d0d72127f08f4a83cca5daed57ae6d72c33ae1e3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -599,10 +599,19 @@ bool VarLinksToOp(Node *node, const std::string &op_type) { bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) { PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(op->IsOp()); - if (op->Op()->Input(argument).size() <= nth) return false; + if (!HasInput(op, argument) || op->Op()->Input(argument).size() <= nth) + return false; 
return var->Name() == op->Op()->Input(argument)[nth]; } +bool HasInput(Node *op, const std::string &argument) { + PADDLE_ENFORCE(op->IsOp()); + auto const &names = op->Op()->InputNames(); + if (std::find(names.begin(), names.end(), argument) == names.end()) + return false; + return true; +} + bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) { PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(op->IsOp()); @@ -1082,8 +1091,15 @@ PDNode *patterns::Conv::operator()() { PDNode *patterns::ConvResidual::operator()(bool with_residual_data) { auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); - if (!with_residual_data) - conv_op->assert_op_attr("fuse_residual_connection", false); + if (!with_residual_data) { + conv_op->assert_more([&](Node *x) { + auto node_names = x->Op()->InputNames(); + if (!HasInput(x, "ResidualData") || + x->Op()->Input("ResidualData").size() == 0) + return true; + return false; + }); + } auto input_var = pattern->NewNode(conv_input_repr()) ->AsInput() diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index fc30b5b21c580afdede64421bb4a1f4174bbad03..bac23b651305419a5bcc4fc1efacb721e6e5d0ad 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -305,6 +305,9 @@ bool VarLinksFromOp(Node* node, const std::string& op_type); // Check whether a var node is a op node's nth input. bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth); +// Check whether the op node has input of given name. +bool HasInput(Node* op, const std::string& argument); + // Tell whether a var node is a op node's nth output. 
bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h index 3d4dc9e2b6ecccddea4d63e45710c80d55ef2772..c071d9aed20bd40f5c1076d2dc5d3098a4e65495 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h @@ -14,12 +14,16 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { +/* + * Specifies which operators should use MKLDNN. + */ class MKLDNNPlacementPass : public Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ab96201b3399c9daf3cf7e132d3f088a5ab41e7d..1ba2bed886beb04d05856ac1235b7164e80f3676 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -874,23 +874,24 @@ std::vector* OperatorWithKernel::GetKernelConfig( return kernel_configs; } -RuntimeContext* OperatorWithKernel::GetRuntimeContext( - const Scope& scope) const { +void OperatorWithKernel::RunImpl(const Scope& scope, + const platform::Place& place) const { if (!HasAttr(kEnableCacheRuntimeContext)) { - return new RuntimeContext(Inputs(), Outputs(), scope); + RuntimeContext ctx(Inputs(), Outputs(), scope); + RunImpl(scope, place, &ctx); } else { const Scope* cur_scope = &scope; if (!runtime_ctx_ || pre_scope_ != cur_scope) { runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); pre_scope_ = cur_scope; } - return runtime_ctx_.get(); + RunImpl(scope, place, runtime_ctx_.get()); } } void OperatorWithKernel::RunImpl(const Scope& scope, - const platform::Place& place) const { - auto runtime_ctx = GetRuntimeContext(scope); + const platform::Place& place, + RuntimeContext* runtime_ctx) const { platform::DeviceContextPool& pool = 
platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 38bac8acde589eabdce6c7a47aeed6125115ebe4..77271334569e2fe65eabaaeccc4ce78a746fe732 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -461,7 +461,8 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; - RuntimeContext* GetRuntimeContext(const Scope& scope) const; + void RunImpl(const Scope& scope, const platform::Place& place, + RuntimeContext* runtime_ctx) const; /** * Transfer data from scope to a transfered scope. If there is no data need to diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 56f108cea2e5d7dadbea2e2cbec39dbe7f4ba094..20a8c47d5d85f5962d697b48ec1bdaad74cbe4d7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -254,18 +254,29 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, member_->places_, nccl_id, build_strategy.num_trainers_, build_strategy.trainer_id_)); - std::unique_ptr dev_nccl_ctxs; - dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_)); - // Initialize device context's nccl comm - // Note, more than one ParallelExecutor with same place, the nccl comm will + // Initialize device context's nccl comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. 
+ std::unique_ptr dev_nccl_ctxs; + if (nccl_id == nullptr) { + dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_)); + } for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto &nccl_ctx = dev_nccl_ctxs->at(dev_id); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast( pool.Get(member_->places_[dev_id])); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); + if (nccl_id != nullptr) { + auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } else { + auto &nccl_ctx = dev_nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } } #else PADDLE_THROW("Not compiled with CUDA"); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 87f0f307d30bc90a43a698c3766b16c975f0635e..d79bf25518bbe624f2913839ec7d7d80816b3b69 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -34,7 +34,7 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); -DEFINE_bool(fast_eager_deletion_mode, false, +DEFINE_bool(fast_eager_deletion_mode, true, "Fast eager deletion mode. 
If enabled, memory would release " "immediately without waiting GPU kernel ends."); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 321deccf86718aad013c106b5a783161f96cbcb9..997f3575f457b67d4df5000705724b46cd8b951d 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -131,6 +131,15 @@ struct Argument { // Pass a set of op types to enable its mkldnn kernel DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, std::unordered_set); + + // A set of op types to enable their quantized kernels + DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes, + std::unordered_set); + + // A set of op IDs to exclude from enabling their quantized kernels + DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds, + std::unordered_set); + // Scales for variables to be quantized DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 8fd86b2cc56c4af50e735be2d660ec3db23e1547..1556caa46412c8a2dacd44f2187666c6a1fda6bf 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include #include +#include #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -60,6 +61,13 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("mkldnn_enabled_op_types", new std::unordered_set( argument->mkldnn_enabled_op_types())); + } else if (pass_name == "cpu_quantize_placement_pass") { + pass->Set("quantize_enabled_op_types", + new std::unordered_set( + argument->quantize_enabled_op_types())); + pass->Set( + "quantize_excluded_op_ids", + new std::unordered_set(argument->quantize_excluded_op_ids())); } else if 
(pass_name == "cpu_quantize_pass") { pass->Set("quant_var_scales", new VarQuantScale(argument->quant_var_scales())); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 4cad8a9dfc3fc6ba06a28a1ad3a5e4d43ec38395..1be25de497346913f24eec147a2db58b0f7065f4 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -118,9 +118,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(serialized_info_cache_); - // framework related. - CP_MEMBER(enable_runtime_context_cache_); - if (use_gpu_) { pass_builder_.reset(new GpuPassStrategy( *static_cast(other.pass_builder()))); @@ -205,6 +202,7 @@ void AnalysisConfig::Update() { // Append after the Affine_channel_conv_fuse pass. pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } + pass_builder()->DeletePass("runtime_context_cache_pass"); } if (use_mkldnn_) { @@ -235,10 +233,6 @@ void AnalysisConfig::Update() { if (ir_debug_) { pass_builder()->TurnOnDebug(); } - - if (enable_runtime_context_cache_) { - pass_builder()->AppendPass("runtime_context_cache_pass"); - } } std::string AnalysisConfig::SerializeInfoCache() { @@ -272,7 +266,6 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << specify_input_name_; ss << cpu_math_library_num_threads_; - ss << enable_runtime_context_cache_; return ss.str(); } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 3b7faa54000a26310d0117fe6f1e68cc404c461a..9b05c335047d7f9a0c50004e4ff6817ddd53d80f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -194,23 +194,6 @@ struct AnalysisConfig { /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; - // framework related - /** \brief Control whether to perform runtime context cache optimization. 
- * - * If turned off, in Op's every execution, RuntimeContext would be called to - * relate input/output names of this Op with the corresponding variables in - * Scope. - */ - void SwitchRuntimeContextCache(int x = true) { - enable_runtime_context_cache_ = x; - } - /** A boolean state tell whether the runtime context cache optimization is - * actived. - */ - bool runtime_context_cache_enabled() const { - return enable_runtime_context_cache_; - } - friend class ::paddle::AnalysisPredictor; /** NOTE just for developer, not an official API, easily to be broken. @@ -271,15 +254,6 @@ struct AnalysisConfig { int cpu_math_library_num_threads_{1}; - // framework related - // RuntimeContext is used to relate input/output names of Operator with - // the corresponding variables in Scope. - // If enable_runtime_context_cache_ is true, it means that in a same Scope, - // since the input/output names of this Op do not change in the execution, - // RuntimeContext could be created only at the first iteration of this Op's - // execution to save the elapsed time. - bool enable_runtime_context_cache_{false}; - // A runtime cache, shouldn't be transferred to others. 
std::string serialized_info_cache_; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 92c24647e87a096e7cfbbf69876b678fe48842a4..d413a418c88241a15808474f753a3900e0a5293e 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -80,6 +80,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add_fuse_pass", // + "runtime_context_cache_pass", // #endif }); @@ -90,6 +91,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { use_gpu_ = true; } +void GpuPassStrategy::EnableQuantizer() { + LOG(ERROR) << "GPU not support quantization yet"; +} + void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { analysis_passes_.push_back(pass); } @@ -115,6 +120,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // "identity_scale_op_clean_pass", // + "runtime_context_cache_pass", // }); use_gpu_ = false; } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 2524d89fcd1322e105ad2217347aa2380448f2bc..84645fef018ce41ee2cba7ae25d2b0c13e49dfc0 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -84,6 +84,10 @@ class PassStrategy : public PaddlePassBuilder { */ virtual void EnableMKLDNN() {} + /** Enable quantize optimization + */ + virtual void EnableQuantizer() {} + bool use_gpu() const { return use_gpu_; } virtual ~PassStrategy() = default; @@ -124,6 +128,16 @@ class CpuPassStrategy : public PassStrategy { use_mkldnn_ = false; #endif } + + void EnableQuantizer() override { + if (!use_quantizer_) { + passes_.push_back("cpu_quantize_placement_pass"); + } + use_quantizer_ = true; + } + + protected: + bool use_quantizer_{false}; }; /** The 
GPU passes strategy, it is used in AnalysisPredictor with GPU mode. @@ -138,6 +152,7 @@ class GpuPassStrategy : public PassStrategy { } void EnableMKLDNN() override; + void EnableQuantizer() override; virtual ~GpuPassStrategy() = default; }; diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index e1787a71775207d4d94f4005cffb82c2b24274e6..5157bd280d0f3ee327d5cee7799477b5e6fd3f71 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -107,7 +107,6 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); - cfg->SwitchRuntimeContextCache(); if (FLAGS_zero_copy) { cfg->SwitchUseFeedFetchOps(false); } diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index b7b39d4dd4675dd1bebec608914c2fe3153b360b..b0c23fbd534847c8aad244749761e9c072148796 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -72,8 +72,7 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { } os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; - os << GenSpaces(num_spaces) - << "use_runtime_context_cache: " << config.runtime_context_cache_enabled() + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; os << GenSpaces(num_spaces) << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index c43eaf7f9849ee4a88ed95bdb8b6966da8760435..2104e4ac7222258ee025bd5acd60b1db251df654 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,4 +1,2 @@ cc_library(benchmark SRCS benchmark.cc DEPS 
enforce) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) -cc_binary(visualizer SRCS visualizer.cc DEPS analysis - paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes) diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc deleted file mode 100644 index 7c0dd64dea88e51b24c4bc04818d633ee0d2f722..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/utils/visualizer.cc +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/utils/visualizer.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/ir/graph_viz_pass.h" -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" -#include "paddle/fluid/platform/init.h" - -DEFINE_string(model_dir, "", "model directory"); -DEFINE_string(model_program_path, "", "model program path"); -DEFINE_string(model_params_path, "", "model params path"); - -using paddle::inference::analysis::Argument; - -namespace paddle { -namespace inference { -namespace utils { - -void Visualizer::SetArgument(Argument *argument) { argument_ = argument; } - -bool Visualizer::Run() { - paddle::framework::InitDevices(false); - paddle::inference::analysis::Analyzer().Run(argument_); - return true; -} - -} // namespace utils -} // namespace inference -} // namespace paddle - -// Generate a dot file describing the structure of graph. -// To use this tool, run command: ./visualizer [options...] 
-// Options: -// --model_dir: the directory of model -// --model_program_path: the path of program -// --model_params_path: the path of params -int main(int argc, char *argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - google::InitGoogleLogging(argv[0]); - - paddle::inference::analysis::Argument argument; - argument.SetUseGPU(false); - argument.SetUseTensorRT(false); - - if (FLAGS_model_dir.empty()) { - if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) { - LOG(ERROR) << "Please set model_dir" - " or model_program_path and model_params_path"; - return -1; - } else { - argument.SetModelProgramPath(FLAGS_model_program_path); - argument.SetModelParamsPath(FLAGS_model_params_path); - } - } else { - argument.SetModelDir(FLAGS_model_dir); - } - - // Only 1 pass, default filename is 0_ir_origin.dot - // For more details, looking for paddle::inference::analysis::IRPassManager - argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"}); - - std::unique_ptr scope{ - new paddle::framework::Scope()}; - argument.SetScopeNotOwned( - const_cast(scope.get())); - - paddle::inference::utils::Visualizer visualizer; - visualizer.SetArgument(&argument); - visualizer.Run(); - - return 0; -} - -USE_PASS(infer_clean_graph_pass); -USE_PASS(graph_viz_pass); -USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/inference/utils/visualizer.h b/paddle/fluid/inference/utils/visualizer.h deleted file mode 100644 index be532f92cf60e06094bfcf8cc2be85085795fcf4..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/utils/visualizer.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/fluid/inference/analysis/argument.h" - -namespace paddle { -namespace inference { -namespace utils { - -using paddle::inference::analysis::Argument; - -class Visualizer final { - public: - Visualizer() = default; - ~Visualizer() = default; - Visualizer(const Visualizer &) = delete; - Visualizer &operator=(const Visualizer &) = delete; - - void SetArgument(Argument *); - bool Run(); - - private: - Argument *argument_; -}; - -} // namespace utils -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 96cda01f7b6bd2358428002cb36cb1b0ef7cb500..09328aded58cb0cccd9de0aba399f5c49313042f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -128,7 +128,7 @@ class ChunkedAllocator : public Allocator { allocator = WrapRetryAllocator(allocator, retry_time_); - return std::make_shared>(std::move(allocator)); + return std::make_shared>(std::move(allocator)); } bool IsAllocThreadSafe() const override { return true; } diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..67905973ff620a7e0fb863fef80778aceba7aeb2 --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include +#include + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_int64(gpu_allocator_retry_time); +#endif + +namespace paddle { +namespace memory { +namespace allocation { + +//! 
Run allocate test cases for different places +void AllocateTestCases() { + auto &instance = AllocatorFacade::Instance(); + platform::Place place; + size_t size = 1024; + + { + place = platform::CPUPlace(); + size = 1024; + auto cpu_allocation = instance.Alloc(place, size); + ASSERT_NE(cpu_allocation, nullptr); + ASSERT_NE(cpu_allocation->ptr(), nullptr); + ASSERT_EQ(cpu_allocation->place(), place); + ASSERT_EQ(cpu_allocation->size(), size); + } + +#ifdef PADDLE_WITH_CUDA + { + place = platform::CUDAPlace(0); + size = 1024; + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + // Allocate 2GB gpu memory + place = platform::CUDAPlace(0); + size = 2 * static_cast(1 << 30); + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + place = platform::CUDAPinnedPlace(); + size = (1 << 20); + auto cuda_pinned_allocation = + instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); + ASSERT_NE(cuda_pinned_allocation, nullptr); + ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); + ASSERT_EQ(cuda_pinned_allocation->place(), place); + ASSERT_GE(cuda_pinned_allocation->size(), size); + } +#endif +} + +TEST(Allocator, SpecifyGpuMemory) { +#ifdef PADDLE_WITH_CUDA + // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and + // FLAGS_reallocate_gpu_memory_in_mb + FLAGS_fraction_of_gpu_memory_to_use = 0.0; + // 512 MB + FLAGS_initial_gpu_memory_in_mb = 512; + // 4 MB + FLAGS_reallocate_gpu_memory_in_mb = 4; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + + AllocateTestCases(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git 
a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc similarity index 92% rename from paddle/fluid/memory/allocation/allocator_facade_test.cc rename to paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc index 802d79e15de253d4e67e35046bdf1d689258da6d..decdc62f1361a9c159b8ccb09910e0f164b35210 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -19,6 +19,8 @@ #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_int64(gpu_allocator_retry_time); #endif @@ -26,13 +28,8 @@ namespace paddle { namespace memory { namespace allocation { -TEST(allocator, allocator) { -#ifdef PADDLE_WITH_CUDA - FLAGS_fraction_of_gpu_memory_to_use = 0.01; - FLAGS_gpu_allocator_retry_time = 500; - FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; -#endif - +//! 
Run allocate test cases for different places +void AllocateTestCases() { auto &instance = AllocatorFacade::Instance(); platform::Place place; size_t size = 1024; @@ -82,6 +79,16 @@ TEST(allocator, allocator) { #endif } +TEST(Allocator, Allocator) { +#ifdef PADDLE_WITH_CUDA + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + + AllocateTestCases(); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index eac9fce58f981b63aab5c05e8dee0c715e7e2b19..0dc2de37467b7e7d23c88b4a255c14795db4c275 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -37,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false, "that initializing the allocated memory with a small value " "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(benchmark); namespace paddle { @@ -148,6 +150,7 @@ class GPUBuddyAllocatorList { std::unique_ptr( new detail::GPUAllocator(dev_id)), platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index c725dba5e98c200c2542d97cb8f53a938f6b614a..a555b6b299228720c7559e610f4d6f31167e1555 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -9,3 +9,5 @@ endif(${WITH_GPU}) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog) + +cc_test(buddy_allocator_test SRCS 
buddy_allocator_test.cc DEPS buddy_allocator) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 26ef27c3caafadb4801b0ae52133f6175655ce0a..edd6ea4adec2e080d294fdb207d8dd4880fdcf79 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/memory/detail/buddy_allocator.h" + +#include +#include + #include "glog/logging.h" DEFINE_bool(free_idle_memory, false, @@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() { "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; + VLOG(10) << "Free from block (" << block << ", " << block->size(cache_) + << ")"; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); cache_.invalidate(block); pool_.erase(pool_.begin()); } @@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // refill the pool if failure if (it == pool_.end()) { - it = RefillPool(); + it = RefillPool(size); // if still failure, fail fatally if (it == pool_.end()) { return nullptr; @@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) { return static_cast(p)->data(); } -BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( + size_t request_bytes) { + size_t allocate_bytes = max_chunk_size_; + size_t index = 0; + #ifdef PADDLE_WITH_CUDA if (system_allocator_->UseGpu()) { if ((total_used_ + total_free_) == 0) { - // Compute the maximum allocation size for the first allocation. 
- max_chunk_size_ = platform::GpuMaxChunkSize(); + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes); + } else { + // Reallocation size + if (realloc_size_ == 0) { + realloc_size_ = platform::GpuReallocSize(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); } } #endif - // Allocate a new maximum sized block - size_t index = 0; - void* p = system_allocator_->Alloc(&index, max_chunk_size_); + // Allocate a new block + void* p = system_allocator_->Alloc(&index, allocate_bytes); if (p == nullptr) return pool_.end(); @@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { << " from system allocator"; static_cast(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, - max_chunk_size_, nullptr, nullptr); + allocate_bytes, nullptr, nullptr); // gpu fallback allocation if (system_allocator_->UseGpu() && @@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { fallback_alloc_count_++; } - total_free_ += max_chunk_size_; + total_free_ += allocate_bytes; // dump the block into pool - return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; + return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first; } BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { @@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { VLOG(10) << "Return block " << block << " to fallback allocator."; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); cache_.invalidate(block); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); - total_free_ -= max_chunk_size_; + total_free_ -= block->size(cache_); fallback_alloc_count_--; // If no fall allocation exists, return directly @@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() { VLOG(10) << "Return block " 
<< block << " to base allocator."; - system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); cache_.invalidate(block); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); - total_free_ -= max_chunk_size_; + total_free_ -= block->size(cache_); if (!shall_free_alloc()) return; } diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 3f86a51f0d0b8504bbc4b0477f123093b343e9cf..bdc8cca4b55e6fe67618fb13cd8bf40c2c24858b 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -60,7 +60,7 @@ class BuddyAllocator { void* SystemAlloc(size_t size); /*! \brief If existing chunks are not suitable, refill pool */ - PoolSet::iterator RefillPool(); + PoolSet::iterator RefillPool(size_t request_bytes); /** * \brief Find the suitable chunk from existing pool and split @@ -89,6 +89,8 @@ class BuddyAllocator { size_t min_chunk_size_; // the minimum size of each chunk size_t max_chunk_size_; // the maximum size of each chunk + size_t realloc_size_ = 0; // the size of re-allocated chunk + private: /** * \brief A list of free allocation diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1edc9f2034c87d4dbd655135c557bdb86ec4354d --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/buddy_allocator.h" + +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/gpu_info.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +#endif + +namespace paddle { +namespace memory { +namespace detail { + +constexpr static int test_gpu_id = 0; + +void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) { + bool freed = false; + size_t used_bytes = allocator->Used(); + + if (size_bytes > 0) { + void* p = allocator->Alloc(size_bytes); + + EXPECT_NE(p, nullptr); +#ifdef PADDLE_WITH_CUDA + if (size_bytes < platform::GpuMaxChunkSize()) { +#else + if (size_bytes < platform::CpuMaxChunkSize()) { +#endif + // Not allocate from SystemAllocator + EXPECT_GE(allocator->Used(), used_bytes + size_bytes); + } else { + // Allocate from SystemAllocator doesn't count in Used() + EXPECT_EQ(allocator->Used(), used_bytes); + } + + int* intp = static_cast(p); + std::shared_ptr ptr(intp, [&](void* p) { + allocator->Free(intp); + freed = true; + }); + } else { + freed = true; + } + + EXPECT_EQ(used_bytes, allocator->Used()); + EXPECT_TRUE(freed); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BuddyAllocator, GpuFraction) { + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + 
TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 10 << 20); + TestBuddyAllocator(&buddy_allocator, 2 * static_cast(1 << 30)); +} + +TEST(BuddyAllocator, InitRealloc) { + FLAGS_initial_gpu_memory_in_mb = 100; + FLAGS_reallocate_gpu_memory_in_mb = 50; + + EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast(100 << 20)); + + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less then initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 10 << 20); + // Between initial size and reallocate size and not exceed pool + TestBuddyAllocator(&buddy_allocator, 80 << 20); + // Less then reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 40 << 20); + // Greater then reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 80 << 20); + // Greater then initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 2 * static_cast(1 << 30)); +} + +TEST(BuddyAllocator, ReallocSizeGreaterThanInit) { + FLAGS_initial_gpu_memory_in_mb = 5; + FLAGS_reallocate_gpu_memory_in_mb = 10; + + EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast(10 << 20)); + + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less then initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 1 << 20); + // Between initial size and reallocate size and not exceed pool + TestBuddyAllocator(&buddy_allocator, 3 << 20); + // Less then initial size and exceed pool + TestBuddyAllocator(&buddy_allocator, 3 << 20); + // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7 + // MB) + TestBuddyAllocator(&buddy_allocator, 7 << 20); + // Less then reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 8 << 20); + // Greater then initial size 
and reallocate size + TestBuddyAllocator(&buddy_allocator, 2 * static_cast(1 << 30)); +} +#endif + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 197d1c2f21fd818879aafe17599bc87d33caa198..41d79c5beb1367907a401b572d3d0eaf3a8ac67b 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -32,6 +32,9 @@ limitations under the License. */ DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); + namespace paddle { namespace memory { namespace detail { @@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { gpu_alloc_size_ += size; return p; } else { - LOG(WARNING) - << "Cannot malloc " << size / 1024.0 / 1024.0 - << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use " - "environment variable to a lower value. Current value is " - << FLAGS_fraction_of_gpu_memory_to_use; + LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink " + "FLAGS_fraction_of_gpu_memory_to_use or " + "FLAGS_initial_gpu_memory_in_mb or " + "FLAGS_reallocate_gpu_memory_in_mb" + "environment variable to a lower value. " + << "Current FLAGS_fraction_of_gpu_memory_to_use value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current FLAGS_initial_gpu_memory_in_mb value is " + << FLAGS_initial_gpu_memory_in_mb + << ". 
Current FLAGS_reallocate_gpu_memory_in_mb value is " + << FLAGS_reallocate_gpu_memory_in_mb; return nullptr; } } diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index f79960317aa1bac7ae9f8d80e4886dde8fe8ebcb..c87e4b22b37027efd1293e74f72598283946e62d 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -76,12 +76,16 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; framework::DataLayout layout = framework::DataLayout::kAnyLayout; -#ifdef PADDLE_WITH_CUDA - auto it1 = oper.Attrs().find("use_cudnn"); - if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { - library = framework::LibraryType::kCUDNN; - } -#endif +// FIXME(liuwei1031) temporarily disable the code to unblock users +// TODO(liuwei1031) figure out the reason behind +// https://github.com/PaddlePaddle/Paddle/issues/16096 +// and re-enable this in the future +// #ifdef PADDLE_WITH_CUDA +// auto it1 = oper.Attrs().find("use_cudnn"); +// if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { +// library = framework::LibraryType::kCUDNN; +// } +// #endif #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && @@ -188,6 +192,9 @@ $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ UNUSED constexpr char SqrtDoc[] = R"DOC( Sqrt Activation Operator. +Please make sure legal input, when input a negative value closed to zero, +you should add a small epsilon(1e-12) to avoid negative number caused by numerical errors. 
+ $out = \sqrt{x}$ )DOC"; diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index 8944a749674c3ba6c83526e4d66f449075716f43..268a5b894a95df8e27730879473b457a31e18cd6 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -67,6 +67,22 @@ class AffineChannelOp : public framework::OperatorWithKernel { "Input(Bias) of AffineChannelOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of AffineChannelOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto scale_dims = ctx->GetInputDim("Scale"); + auto b_dims = ctx->GetInputDim("Bias"); + const framework::DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + const int64_t C = (data_layout == framework::DataLayout::kNCHW + ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL); + PADDLE_ENFORCE_EQ(scale_dims[0], C); + PADDLE_ENFORCE_EQ(b_dims.size(), 1UL); + PADDLE_ENFORCE_EQ(b_dims[0], C); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", "Out"); } @@ -97,6 +113,27 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel { } }; +class AffineChannelGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("affine_channel_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Scale", Input("Scale")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + return std::unique_ptr(op); + } +}; + template using EigenArrayMap = Eigen::Map>; @@ -244,8 +281,7 @@ namespace 
ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp, - ops::AffineChannelOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::AffineChannelOpMaker, ops::AffineChannelGradMaker); REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad); REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel, diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 194f9cf5033a3a73afeb8e92ddbdcc7b316fcd35..6e3c9f28649b9f15a2a78fc832ab5e52986fcf46 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -50,9 +50,19 @@ class ConcatOp : public framework::OperatorWithKernel { if (j == axis) { out_dims[axis] += ins[i][j]; } else { - PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], - "Input tensors should have the same " - "elements except the specify axis."); + if (ctx->IsRuntime()) { + // check all shape in run time + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); + } else { + // not check -1 with other in compile time + if (out_dims[j] > 0 && ins[i][j] > 0) { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); + } + } } } } diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index c994c6f642d286d9b52ada667058b064ff242ce6..baa39c0f9926efc233f9a228e055e2eb2116dbcc 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" +#include #include #include @@ -344,6 +345,28 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( ctx.GetPlace(), layout_, library_); } +class ConvTransposeGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType(ForwardOp().Type() + "_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetInput("Bias", Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -352,7 +375,7 @@ namespace ops = paddle::operators; // conv2d_transpose REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvTransposeGradOpDescMaker); REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( @@ -368,7 +391,7 @@ REGISTER_OP_CPU_KERNEL( // conv3d_transpose REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvTransposeGradOpDescMaker); REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( @@ -384,7 +407,7 @@ REGISTER_OP_CPU_KERNEL( // depthwise conv2d_transpose REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ConvTransposeGradOpDescMaker); 
REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index 8f3644039f9950a8a70e2fd66c20837a5f52bd7f..30ec74d8442d2f42510220b825988b340f79d0a2 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -74,6 +74,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { "Norm of the second input, reduced along the 1st " "dimension.") .AsIntermediate(); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") + .SetDefault(true); AddComment(R"DOC( **Cosine Similarity Operator** diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h index 76cfc680518a3caaa68abc48cedf82ce7d21c8b8..0b4e3f774674112ddc268ba911e1df317d5edcca 100644 --- a/paddle/fluid/operators/cos_sim_op.h +++ b/paddle/fluid/operators/cos_sim_op.h @@ -28,17 +28,21 @@ class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { // get Tensor - auto* in_x = context.Input("X"); + auto* in_x = context.Input("X"); auto* in_y = context.Input("Y"); - auto* out_z = context.Output("Out"); + auto* out_z = context.Output("Out"); auto* out_x_norm = context.Output("XNorm"); auto* out_y_norm = context.Output("YNorm"); - out_z->mutable_data(context.GetPlace()); - out_x_norm->mutable_data(context.GetPlace()); - out_y_norm->mutable_data(context.GetPlace()); int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; + out_z->Resize({rows_x, 1}); + out_x_norm->Resize({rows_x, 1}); + out_y_norm->Resize({rows_y, 1}); + out_z->mutable_data(context.GetPlace()); + out_x_norm->mutable_data(context.GetPlace()); + out_y_norm->mutable_data(context.GetPlace()); + out_z->set_lod(in_x->lod()); int cols = framework::product(in_x->dims()) / rows_x; @@ -81,6 +85,7 @@ class CosSimGradKernel : public 
framework::OpKernel { if (rows_x == rows_y) { if (out_grad_x) { + out_grad_x->Resize(in_x->dims()); math::CosSimGradFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), @@ -91,6 +96,7 @@ class CosSimGradKernel : public framework::OpKernel { for_range(functor); } if (out_grad_y) { + out_grad_y->Resize(in_y->dims()); math::CosSimGradFunctor functor( in_y_norm->data(), in_x_norm->data(), in_y->data(), in_x->data(), in_z->data(), in_grad_z->data(), @@ -102,6 +108,7 @@ class CosSimGradKernel : public framework::OpKernel { } } else { if (out_grad_x) { + out_grad_x->Resize(in_x->dims()); math::CosSimDxFunctor functor( in_x_norm->data(), in_y_norm->data(), in_x->data(), in_y->data(), in_z->data(), in_grad_z->data(), @@ -112,6 +119,7 @@ class CosSimGradKernel : public framework::OpKernel { for_range(functor); } if (out_grad_y) { + out_grad_y->Resize(in_y->dims()); out_grad_y->mutable_data(context.GetPlace()); math::SetConstant set_zero; auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cc b/paddle/fluid/operators/distributed_ops/allreduce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0fbc27515cec9f7982852954055aa929f678a096 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/allreduce_op.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +struct MutableDataFunctor { + MutableDataFunctor(void** data, framework::LoDTensor* tensor, + const platform::Place& place) + : data_(data), tensor_(tensor), place_(place) {} + + template + void apply() { + *data_ = tensor_->mutable_data(place_); + } + + void** data_; + framework::LoDTensor* tensor_; + platform::Place place_; +}; + +class AllReduceOp : public framework::OperatorBase { + using OperatorBase::OperatorBase; + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE(is_gpu_place(place), + "AllReduce op can run on gpu place only for now."); +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* ctx = pool.Get(place); + auto in_names = Inputs("X"); + auto out_names = Outputs("Out"); + PADDLE_ENFORCE_EQ(in_names.size(), 1, "Only support one input"); + PADDLE_ENFORCE_EQ(out_names.size(), 1, "Only support one output"); + + auto* in = scope.FindVar(in_names[0]); + auto* out = scope.FindVar(out_names[0]); + + PADDLE_ENFORCE(in->IsType() || + out->IsType(), + "Only support allreduce LoDTensors"); + + int dtype = -1; + auto in_tensor = in->Get(); + dtype = platform::ToNCCLDataType(in_tensor.type()); + + int64_t numel = in_tensor.numel(); + auto* sendbuff = in_tensor.data(); + auto* out_tensor = out->GetMutable(); + out_tensor->Resize(in_tensor.dims()); + void* recvbuff = nullptr; + framework::VisitDataType(in_tensor.type(), + MutableDataFunctor(&recvbuff, out_tensor, place)); + + auto cuda_ctx = static_cast(ctx); + auto* comm = cuda_ctx->nccl_comm(); + // FIXME(typhoonzero): should use nccl stream here. 
+ auto stream = cuda_ctx->stream(); + + int reduce_type = Attr("reduce_type"); + ncclRedOp_t red_type = ncclSum; + switch (reduce_type) { + case 0: + red_type = ncclSum; + break; + case 1: + red_type = ncclProd; + break; + case 2: + red_type = ncclMax; + break; + case 3: + red_type = ncclMin; + break; + } + + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, static_cast(dtype), red_type, + comm, stream)); +#endif + } +}; + +class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor), tensor to be allreduced."); + AddOutput("Out", "(Tensor) the result of allreduced."); + AddAttr("reduce_type", "(int) determin the reduce type.") + .SetDefault(0); + AddComment(R"DOC( +***AllReduce Operator*** + +Call NCCL AllReduce internally. Note that this op must be used when one +thread is managing one GPU device. + +For speed reasons, reduce_type should be an integer: + +0: sum +1: prod +2: max +3: min + +If input and output are the same variable, in-place allreduce will be used. +)DOC"); + } +}; + +class AllReduceOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(allreduce, ops::AllReduceOp, + paddle::framework::EmptyGradOpMaker, ops::AllReduceOpMaker, + ops::AllReduceOpShapeInference); diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 2ccc86c1dc04a3afeb02b24677e6ebce40cca4fa..65c2ff6415c1d51fdc05d6014da589678761b676 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/dropout_op.h" +#include #include namespace paddle { @@ -70,7 +71,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { "1. downgrade_in_infer(default), downgrade the outcome at inference " "time" " train: out = input * mask" - " inference: out = input * dropout_prob" + " inference: out = input * (1.0 - dropout_prob)" "2. upscale_in_train, upscale the outcome at training time, do nothing " "in inference" " train: out = input * mask / ( 1.0 - dropout_prob )" @@ -106,21 +107,31 @@ class DropoutOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, "GradOp is only callable when is_test is false"); - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) must not be null."); - auto x_dims = ctx->GetInputDim("X"); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(x_dims, out_dims, - "Dimensions of Input(X) and Out@Grad must be the same."); - auto mask_dims = ctx->GetInputDim("Mask"); - PADDLE_ENFORCE_EQ(x_dims, mask_dims, - "Dimensions of Input(X) and Mask must be the same."); - - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + + ctx->SetOutputDim(framework::GradVarName("X"), out_dims); + ctx->ShareLoD(framework::GradVarName("Out"), + /*->*/ framework::GradVarName("X")); + } +}; + +class DropoutGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("dropout_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Mask", Output("Mask")); + op->SetOutput(framework::GradVarName("X"), 
InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -129,7 +140,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::DropoutGradOpDescMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( dropout, ops::CPUDropoutKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..66c56da417487e3b2ee94ad572d83a971958ab62 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h" +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "FloorDiv"; } + std::string GetEquation() const override { return "Out = X // Y"; } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, + ops::ElementwiseFloorDivOpMaker); + +REGISTER_OP_CPU_KERNEL( + elementwise_floordiv, + ops::ElementwiseFloorDivKernel, + ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..60846d1e8fee1c7f68ac101f18355750c2c15a4d --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + elementwise_floordiv, + ops::ElementwiseFloorDivKernel, + ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2d24e394d5c823dbd22c837210e46cefeceba1be --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +template +struct FloorDivFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a / b; } +}; + +template +void elementwise_floor_div(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, framework::Tensor *z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, FloorDivFunctor(), z); +} + +template +class ElementwiseFloorDivKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *z = ctx.Output("Out"); + + z->mutable_data(ctx.GetPlace()); + + // dtype of x and y is int64 or int32 + elementwise_floor_div(ctx, x, y, z); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d63a7df03d0de7489a507825b066ab365e1ef8b9 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwiseModOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Mod"; } + std::string GetEquation() const override { return "Out = X % Y"; } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp, + ops::ElementwiseModOpMaker); + +REGISTER_OP_CPU_KERNEL( + elementwise_mod, + ops::ElementwiseModKernel, + ops::ElementwiseModKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..da3304a83952d448ffcad61f1878b06d354168b9 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + elementwise_mod, ops::ElementwiseModKernel, + ops::ElementwiseModKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5b139fd4b33152b4a340c6c5a0f094338bbdffc8 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +template +struct ModFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a % b; } +}; + +template +void elementwise_mod(const framework::ExecutionContext &ctx, + const framework::Tensor *x, const framework::Tensor *y, + framework::Tensor *z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + ModFunctor(), z); +} + +template +class ElementwiseModKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *z = ctx.Output("Out"); + + z->mutable_data(ctx.GetPlace()); + + // dtype of x and y is int64 or int32 + elementwise_mod(ctx, x, y, z); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 68c7227e5a7123e1e751dd55e243ee481bf36540..4a8937ba1c7ef9827ecc9bf575d9893c95a3b22b 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -33,8 +33,51 @@ struct DequantizeFunctor { } }; +template +struct ChannelDequantizeFunctor { + void operator()(const platform::CPUDeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor** scales, + const int scale_num, T max_range, framework::Tensor* out) { + if (scale_num == 1) { + const int channel = in->dims()[0]; + const T* scale_factor = scales[0]->data(); + for (int i = 0; i < channel; i++) { + T s = scale_factor[i]; + framework::Tensor one_channel_in = in->Slice(i, i + 1); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto in_e = 
framework::EigenVector::Flatten(one_channel_in); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = (s / max_range) * in_e; + } + } else if (scale_num == 2) { + int batch_size = in->dims()[0]; + int channel = in->dims()[1]; + const T* scale_one = scales[0]->data(); + const T* scale_two = scales[1]->data(); + for (int i = 0; i < batch_size; i++) { + framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize( + framework::slice_ddim(in->dims(), 1, in->dims().size())); + framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize( + framework::slice_ddim(out->dims(), 1, out->dims().size())); + for (int j = 0; j < channel; j++) { + T s = scale_one[j]; + framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1); + framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1); + auto in_e = framework::EigenVector::Flatten(one_channel_in); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = (s * scale_two[0] / max_range) * in_e; + } + } + } + } +}; + template struct DequantizeFunctor; template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 35dcc69279d0119e75c4c5072e7817c839b9e819..02f9dc827d68cbb58447ed1557ff4bf310b2c017 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -44,8 +44,66 @@ struct DequantizeFunctor { } }; +template +__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range, + int num, int channel, T* out) { + int tid = threadIdx.x; + int channel_size = num / channel; + const T* in_c = in + blockIdx.x * channel_size; + T* out_c = out + blockIdx.x * channel_size; + for (int i 
= tid; i < channel_size; i += blockDim.x) { + out_c[i] = in_c[i] * scale[blockIdx.x] / max_range; + } +} + +template +__global__ void DequantizeTwoScale(const T* in, const T* scale_one, + const T* scale_two, T max_range, int num, + int batch_size, int channel, T* out) { + int tid = threadIdx.x; + int channel_size = num / (batch_size * channel); + int scale_index = blockIdx.x % channel; + const T* in_c = in + blockIdx.x * channel_size; + T* out_c = out + blockIdx.x * channel_size; + for (int i = tid; i < channel_size; i += blockDim.x) { + out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range; + } +} + +template +struct ChannelDequantizeFunctor { + void operator()(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor** scales, + const int scale_num, T max_range, framework::Tensor* out) { + const T* in_data = in->data(); + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + if (scale_num == 1) { + int num = in->numel(); + int channel = in->dims()[0]; + const T* scale_factor = scales[0]->data(); + int block = 1024; + int grid = channel; + DequantizeOneScale<<>>( + in_data, scale_factor, max_range, num, channel, out_data); + } else if (scale_num == 2) { + int num = in->numel(); + int batch_size = in->dims()[0]; + int channel = in->dims()[1]; + const T* scale_one = scales[0]->data(); + const T* scale_two = scales[1]->data(); + int block = 1024; + int grid = batch_size * channel; + DequantizeTwoScale<<>>( + in_data, scale_one, scale_two, max_range, num, batch_size, channel, + out_data); + } + } +}; + template struct DequantizeFunctor; template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index d05f2038531bbe9c35da54c94d2ef4d659acca70..ed9a0a4d65fab5ce1ef48835c332fade978d2bae 100644 --- 
a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -28,6 +29,13 @@ struct DequantizeFunctor { framework::Tensor* out); }; +template +struct ChannelDequantizeFunctor { + void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in, + const framework::Tensor** scales, const int scale_num, + T max_range, framework::Tensor* out); +}; + template class FakeDequantizeMaxAbsKernel : public framework::OpKernel { public: @@ -54,32 +62,33 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { auto scales = ctx.MultiInput("Scales"); auto* out = ctx.Output("Out"); - PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0], - "The number of first scale values must be the same with " - "first dimension value of Input(X)."); - auto quant_bits = ctx.Attr>("quant_bits"); - int max_range = std::pow(2, quant_bits[0] - 1) - 1; + int max_range = 1; auto& dev_ctx = ctx.template device_context(); out->mutable_data(dev_ctx.GetPlace()); - - auto dequant = DequantizeFunctor(); - for (int64_t i = 0; i < in->dims()[0]; i++) { - framework::Tensor one_channel_in = in->Slice(i, i + 1); - framework::Tensor one_channel_out = out->Slice(i, i + 1); - framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1); - dequant(dev_ctx, &one_channel_in, &one_channel_scale, - static_cast(max_range), &one_channel_out); - } - - if (scales.size() == 2) { + int scale_num = scales.size(); + if (scale_num == 1) { + PADDLE_ENFORCE_EQ( + scales[0]->numel(), in->dims()[0], + "The number of first scale values must be the same with " + "first dimension value of Input(X) when the `Scales` has only one " + "element."); + max_range *= (std::pow(2, quant_bits[0] - 1) - 1); + } else if (scale_num == 2) { + PADDLE_ENFORCE_EQ( + 
scales[0]->numel(), in->dims()[1], + "The number of first scale values must be the same with " + "second dimension value of Input(X) when the `Scales` has two " + "elements."); PADDLE_ENFORCE_EQ( scales[1]->numel(), 1, "The second scale tensor should only have one value at now."); - max_range = std::pow(2, quant_bits[1] - 1) - 1; - dequant(dev_ctx, out, scales[1], static_cast(max_range), out); + max_range *= (std::pow(2, quant_bits[0] - 1) - 1) * + (std::pow(2, quant_bits[1] - 1) - 1); } + ChannelDequantizeFunctor()( + dev_ctx, in, scales.data(), scale_num, static_cast(max_range), out); } }; diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index d51d51b4953073e9a350806f041bb3112fad239c..054ef4658cc0c4448d49870849017d3191d57db9 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -37,6 +37,21 @@ struct FindAbsMaxFunctor { template struct FindAbsMaxFunctor; +template +struct FindChannelAbsMaxFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const T* in, + const int num, const int channel, T* out) { + const int channel_size = num / channel; + for (int i = 0; i < channel; i++) { + auto* start = in + i * channel_size; + auto* end = in + (i + 1) * channel_size; + out[i] = std::abs(*(std::max_element(start, end, Compare()))); + } + } +}; + +template struct FindChannelAbsMaxFunctor; + template struct ClipAndFakeQuantFunctor { void operator()(const platform::CPUDeviceContext& ctx, @@ -53,6 +68,36 @@ struct ClipAndFakeQuantFunctor { template struct ClipAndFakeQuantFunctor; +template +struct ChannelClipAndFakeQuantFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, const int channel, + framework::Tensor* out) { + auto* scale_data = scale.data(); + auto* in_data = in.data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); + const int channel_size = 
in.numel() / channel; + platform::Transform trans; + for (int i = 0; i < channel; i++) { + T s = scale_data[i]; + auto* start = in_data + i * channel_size; + auto* end = in_data + (i + 1) * channel_size; + trans(ctx, start, end, out_data + i * channel_size, + ClipFunctor(-s, s)); + } + for (int i = 0; i < channel; i++) { + T s = scale_data[i]; + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round(); + } + } +}; + +template struct ChannelClipAndFakeQuantFunctor; + template struct FindRangeAbsMaxFunctor { void operator()(const platform::CPUDeviceContext& ctx, @@ -169,10 +214,10 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel { ctx->HasOutput("Out"), "Output(Out) of FakeChannelWiseQuantizeOp should not be null."); PADDLE_ENFORCE( - ctx->HasOutput("OutScales"), - "Output(Scales) of FakeChannelWiseQuantizeOp should not be null."); + ctx->HasOutput("OutScale"), + "Output(Scale) of FakeChannelWiseQuantizeOp should not be null."); ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]}); + ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]}); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -192,7 +237,7 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker AddOutput("Out", "(Tensor) Output of quantized low level tensor, " "but also saved as float data type."); - AddOutput("OutScales", "(Tensor) Current channel wise scale"); + AddOutput("OutScale", "(Tensor) Current channel wise scale"); AddAttr("bit_length", "(int, default 8)") .SetDefault(8) .AddCustomChecker([](const int& bit_length) { diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 3707f6772eac0d568c170d60c17d431e254d0b6b..33bd275e5cc507ec700b3694cd8b1df9672ec512 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ 
b/paddle/fluid/operators/fake_quantize_op.cu @@ -74,6 +74,45 @@ struct FindAbsMaxFunctor { template struct FindAbsMaxFunctor; +template +__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c, + T* out) { + int tid = threadIdx.x; + int channel_size = n / c; + const T* in_c = in + blockIdx.x * channel_size; + extern __shared__ T shared_max_data[]; + shared_max_data[tid] = T(0); + for (int i = tid; i < channel_size; i += blockDim.x) { + T tmp = fabs(in_c[i]); + if (tmp > shared_max_data[tid]) { + shared_max_data[tid] = tmp; + } + } + __syncthreads(); + for (int i = blockDim.x / 2; i > 0; i >>= 1) { + if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { + shared_max_data[tid] = shared_max_data[tid + i]; + } + __syncthreads(); + } + if (tid == 0) { + out[blockIdx.x] = shared_max_data[0]; + } +} + +template +struct FindChannelAbsMaxFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const T* in, + const int num, const int channel, T* out) { + int block = 1024; + int grid = channel; + FindChannelAbsMaxKernel<<>>( + in, num, channel, out); + } +}; + +template struct FindChannelAbsMaxFunctor; + template __global__ void ClipAndQuantKernel(const T* in, const T* scale, const int bin_cnt, const int n, T* out) { @@ -82,14 +121,76 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale, T s = scale[0]; for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[bid]; + T x = in[i]; T v = x > s ? s : x; v = v < -s ? 
-s : v; v = bin_cnt / s * v; - out[bid] = round(v); + out[i] = round(v); } } +template +struct ClipAndFakeQuantFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, framework::Tensor* out) { + int num = in.numel(); + int block = 1024; + int grid = (block - 1 + num) / block; + + const T* in_data = in.data(); + const T* scale_data = scale.data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + + ClipAndQuantKernel<<>>( + in_data, scale_data, bin_cnt, num, out_data); + } +}; + +template struct ClipAndFakeQuantFunctor; + +template +__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale, + const int bin_cnt, const int n, + const int c, T* out) { + int tid = threadIdx.x; + + int channel_size = n / c; + const T* in_c = in + blockIdx.x * channel_size; + T* out_c = out + blockIdx.x * channel_size; + + T s = scale[blockIdx.x]; + for (int i = tid; i < channel_size; i += blockDim.x) { + T x = in_c[i]; + T v = x > s ? s : x; + v = v < -s ? 
-s : v; + v = bin_cnt / s * v; + out_c[i] = round(v); + } +} + +template +struct ChannelClipAndFakeQuantFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, const int channel, + framework::Tensor* out) { + int num = in.numel(); + int block = 1024; + int grid = channel; + + const T* in_data = in.data(); + const T* scale_data = scale.data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + + ChannelClipAndQuantKernel<<>>( + in_data, scale_data, bin_cnt, num, channel, out_data); + } +}; + +template struct ChannelClipAndFakeQuantFunctor; + template __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale, const T* last_scale, @@ -182,26 +283,6 @@ struct FindMovingAverageAbsMaxFunctor { template struct FindMovingAverageAbsMaxFunctor; -template -struct ClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, framework::Tensor* out) { - int num = in.numel(); - int block = 1024; - int grid = (block - 1 + num) / block; - - const T* in_data = in.data(); - const T* scale_data = scale.data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - - ClipAndQuantKernel<<>>( - in_data, scale_data, bin_cnt, num, out_data); - } -}; - -template struct ClipAndFakeQuantFunctor; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index ec667e89e7699d87db9423f17014a2761ce62763..5ab38b086df7f9df33996ec83b5ec07047c204ba 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -42,6 +42,19 @@ struct FindRangeAbsMaxFunctor { framework::Tensor* scales_arr, framework::Tensor* out_scale); }; +template +struct FindChannelAbsMaxFunctor { + void operator()(const DeviceContext& ctx, const T* in, const int num, + const int channel, T* out); 
+}; + +template +struct ChannelClipAndFakeQuantFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& in, + const framework::Tensor& scale, const int bin_cnt, + const int channel, framework::Tensor* out); +}; + template struct FindMovingAverageAbsMaxFunctor { void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum, @@ -78,29 +91,18 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* out = context.Output("Out"); - auto* out_scales = context.Output("OutScales"); - T* out_scales_data = out_scales->mutable_data(context.GetPlace()); + auto* out_scale = context.Output("OutScale"); + T* out_scale_data = out_scale->mutable_data(context.GetPlace()); out->mutable_data(context.GetPlace()); int bit_length = context.Attr("bit_length"); int bin_cnt = std::pow(2, bit_length - 1) - 1; auto& dev_ctx = context.template device_context(); - auto find_abs_max = FindAbsMaxFunctor(); - for (int64_t i = 0; i < in->dims()[0]; i++) { - framework::Tensor one_channel = in->Slice(i, i + 1); - const T* one_channel_data = one_channel.data(); - find_abs_max(dev_ctx, one_channel_data, one_channel.numel(), - &out_scales_data[i]); - } - auto clip_quant = ClipAndFakeQuantFunctor(); - for (int64_t i = 0; i < in->dims()[0]; i++) { - framework::Tensor one_channel_in = in->Slice(i, i + 1); - framework::Tensor one_channel_out = out->Slice(i, i + 1); - framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1); - clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt, - &one_channel_out); - } + FindChannelAbsMaxFunctor()( + dev_ctx, in->data(), in->numel(), in->dims()[0], out_scale_data); + ChannelClipAndFakeQuantFunctor()( + dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out); } }; diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index b9db6daf0825b573bfc7f684266212f998c91627..9b1a854a312551732424e0d127a43328b8db6085 100644 --- 
a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/layer_norm_op.h" +#include namespace paddle { namespace operators { @@ -133,7 +134,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } if (ctx->HasOutput(framework::GradVarName("Bias"))) { ctx->SetOutputDim(framework::GradVarName("Bias"), - ctx->GetInputDim("Bias")); + ctx->GetInputDim("Scale")); } } @@ -157,12 +158,39 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } }; +class LayerNormGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("layer_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Mean", Output("Mean")); + op->SetInput("Variance", Output("Variance")); + if (ForwardOp().Inputs().count("Scale") > 0) { + op->SetInput("Scale", Input("Scale")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + } + + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::LayerNormGradOpDescMaker); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp); REGISTER_OP_CPU_KERNEL( layer_norm, ops::LayerNormKernel, diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index 
8627c83b43cc0ff0f56417c0f7f67effa494cd37..db794ed42116144f310b9d7dc529cff49ba2c405 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -245,11 +245,9 @@ class LayerNormGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { const float epsilon = ctx.Attr("epsilon"); auto x = *ctx.Input("X"); - auto* y = ctx.Input("Y"); auto* mean = ctx.Input("Mean"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); auto d_y = *ctx.Input(framework::GradVarName("Y")); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); @@ -275,18 +273,13 @@ class LayerNormGradKernel : public framework::OpKernel { x.Resize(matrix_shape); temp.mutable_data(matrix_shape, ctx.GetPlace()); - if (!(bias && scale)) { - temp_norm.ShareDataWith(*y); - temp_norm.Resize(matrix_shape); - } else { - temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); - } + temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); + // get x_norm + ElementwiseComputeEx, DeviceContext, T>( + ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &temp_norm, var, /*axis*/ 0, + DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); } if (d_bias) { diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index f5c802986e0573e81b3ab6187b57657b52b37215..2948cf71a911b296f8cee7ff9a2fb75f644dbe71 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -11,89 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" + +#include +#include + +#include "paddle/fluid/operators/load_combine_op.h" namespace paddle { namespace operators { -class LoadCombineOp : public framework::OperatorBase { +class LoadCombineOp : public framework::OperatorWithKernel { public: - LoadCombineOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto filename = Attr("file_path"); - auto load_as_fp16 = Attr("load_as_fp16"); - auto model_from_memory = Attr("model_from_memory"); - auto out_var_names = Outputs("Out"); - PADDLE_ENFORCE_GT( - static_cast(out_var_names.size()), 0, - "The number of output variables should be greater than 0."); - if (!model_from_memory) { - std::ifstream fin(filename, std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), - "Cannot open file %s for load_combine op", filename); - LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); - } else { - PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); - std::stringstream fin(filename, std::ios::in | std::ios::binary); - LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); - } - } - void LoadParamsFromBuffer( - const framework::Scope &scope, const platform::Place &place, - std::istream *buffer, bool load_as_fp16, - const std::vector &out_var_names) const { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - for (size_t i = 0; i < out_var_names.size(); i++) { - auto *out_var = 
scope.FindVar(out_var_names[i]); - - PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", - out_var_names[i]); - - auto *tensor = out_var->GetMutable(); - - // Error checking - PADDLE_ENFORCE(static_cast(*buffer), "Cannot read more"); - - // Get data from fin to tensor - DeserializeFromStream(*buffer, tensor, dev_ctx); - - auto in_dtype = tensor->type(); - auto out_dtype = - load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, - &fp16_tensor); - - // reset output tensor - out_var->Clear(); - tensor = out_var->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); - } - } - buffer->peek(); - PADDLE_ENFORCE(buffer->eof(), - "You are not allowed to load partial data via " - "load_combine_op, use load_op instead."); + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::proto::VarType::FP32, ctx.GetPlace()); + return kt; } }; @@ -124,21 +62,30 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( LoadCombine Operator. -LoadCombine operator loads LoDTensor variables from a file, which could be -loaded in memory already. The file should contain one or more LoDTensors +LoadCombine operator loads LoDTensor variables from a file, which could be +loaded in memory already. 
The file should contain one or more LoDTensors serialized using the SaveCombine operator. The -LoadCombine operator applies a deserialization strategy to appropriately load -the LodTensors, and this strategy complements the serialization strategy used +LoadCombine operator applies a deserialization strategy to appropriately load +the LodTensors, and this strategy complements the serialization strategy used in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled -with the SaveCombine operator, and can only deserialize one or more LoDTensors +with the SaveCombine operator, and can only deserialize one or more LoDTensors that were saved using the SaveCombine operator. )DOC"); } }; + } // namespace operators } // namespace paddle + namespace ops = paddle::operators; REGISTER_OPERATOR(load_combine, ops::LoadCombineOp, ops::LoadCombineOpProtoMaker); + +REGISTER_OP_CPU_KERNEL( + load_combine, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_combine_op.cu b/paddle/fluid/operators/load_combine_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2a42c0daa7fc58165e85d851c602a65ec287c905 --- /dev/null +++ b/paddle/fluid/operators/load_combine_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/load_combine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + load_combine, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8f620ba7d2f1c2797ad4fd76a16af9aeee9c2806 --- /dev/null +++ b/paddle/fluid/operators/load_combine_op.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +template +class LoadCombineOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + auto filename = ctx.Attr("file_path"); + auto load_as_fp16 = ctx.Attr("load_as_fp16"); + auto model_from_memory = ctx.Attr("model_from_memory"); + auto &out_var_names = ctx.Outputs("Out"); + + PADDLE_ENFORCE_GT( + static_cast(out_var_names.size()), 0, + "The number of output variables should be greater than 0."); + if (!model_from_memory) { + std::ifstream fin(filename, std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), + "Cannot open file %s for load_combine op", filename); + LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names); + } else { + PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); + std::stringstream fin(filename, std::ios::in | std::ios::binary); + LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names); + } + } + + void LoadParamsFromBuffer( + const framework::ExecutionContext &context, const platform::Place &place, + std::istream *buffer, bool load_as_fp16, + const std::vector &out_var_names) const { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + auto out_vars = context.MultiOutputVar("Out"); + + for (size_t i = 0; i < out_var_names.size(); i++) { + PADDLE_ENFORCE(out_vars[i] != nullptr, + "Output variable %s cannot be found", out_var_names[i]); + + auto *tensor = out_vars[i]->GetMutable(); + + // Error checking + PADDLE_ENFORCE(static_cast(*buffer), "Cannot read more"); + + // Get data from fin to tensor + DeserializeFromStream(*buffer, tensor, dev_ctx); + + 
auto in_dtype = tensor->type(); + auto out_dtype = + load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + // convert to float16 tensor + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor fp16_tensor; + // copy LoD info to the new tensor + fp16_tensor.set_lod(tensor->lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, + &fp16_tensor); + + // reset output tensor + out_vars[i]->Clear(); + tensor = out_vars[i]->GetMutable(); + tensor->set_lod(fp16_tensor.lod()); + tensor->ShareDataWith(fp16_tensor); + } + } + buffer->peek(); + PADDLE_ENFORCE(buffer->eof(), + "You are not allowed to load partial data via " + "load_combine_op, use load_op instead."); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 4bce4eba22e4a8900f8d12454fd233e17c9ad617..2d8e6ca854b55e01dacd1e0e7898ba59ea6078dc 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -11,89 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include + +#include "paddle/fluid/operators/load_op.h" namespace paddle { namespace operators { -class LoadOp : public framework::OperatorBase { +class LoadOp : public framework::OperatorWithKernel { public: - LoadOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. - auto filename = Attr("file_path"); - std::ifstream fin(filename, std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", - filename); + using framework::OperatorWithKernel::OperatorWithKernel; - auto out_var_name = Output("Out"); - auto *out_var = scope.FindVar(out_var_name); - PADDLE_ENFORCE(out_var != nullptr, - "Output variable %s cannot be found in scope %p", - out_var_name, &scope); + void InferShape(framework::InferShapeContext *ctx) const override {} - if (out_var->IsType()) { - LoadLodTensor(fin, place, out_var); - } else if (out_var->IsType()) { - LoadSelectedRows(fin, place, out_var); - } else { - PADDLE_ENFORCE( - false, - "Load only support LoDTensor and SelectedRows, %s has wrong type", - out_var_name); - } - } - - void LoadLodTensor(std::istream &fin, const platform::Place &place, - framework::Variable *var) const { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - auto *tensor = var->GetMutable(); - DeserializeFromStream(fin, tensor, dev_ctx); - - auto load_as_fp16 = 
Attr("load_as_fp16"); - auto in_dtype = tensor->type(); - auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, - &fp16_tensor); - - // reset output tensor - var->Clear(); - tensor = var->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); - } - } - - void LoadSelectedRows(std::istream &fin, const platform::Place &place, - framework::Variable *var) const { - auto *selectedRows = var->GetMutable(); - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::proto::VarType::FP32, platform::CPUPlace()); + return kt; } }; @@ -116,8 +53,15 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { "file."); } }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker); + +REGISTER_OP_CPU_KERNEL( + load, ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/load_op.cu b/paddle/fluid/operators/load_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..90f78110f8f349ebc834570c4fb9f15af24b144d --- /dev/null +++ b/paddle/fluid/operators/load_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle 
Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/load_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + load, ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3bf3c6bed2f0ddf352a2bad65b0d710097016b28 --- /dev/null +++ b/paddle/fluid/operators/load_op.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +template +class LoadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + auto filename = ctx.Attr("file_path"); + std::ifstream fin(filename, std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", + filename); + + auto out_var_name = ctx.Outputs("Out").data(); + auto *out_var = ctx.OutputVar("Out"); + + PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found ", + out_var_name); + + PADDLE_ENFORCE(out_var != nullptr, "Output variable cannot be found "); + + if (out_var->IsType()) { + LoadLodTensor(fin, place, out_var, ctx); + } else if (out_var->IsType()) { + LoadSelectedRows(fin, place, out_var); + } else { + PADDLE_ENFORCE( + false, + "Load only support LoDTensor and SelectedRows, %s has wrong type", + out_var_name); + } + } + + void LoadLodTensor(std::istream &fin, const platform::Place &place, + framework::Variable *var, + const framework::ExecutionContext &ctx) const { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + auto *tensor = var->GetMutable(); + DeserializeFromStream(fin, tensor, dev_ctx); + + auto load_as_fp16 = ctx.Attr("load_as_fp16"); + auto in_dtype = tensor->type(); + auto out_dtype = load_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + // convert to float16 tensor + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor fp16_tensor; + // copy LoD info to the new tensor + fp16_tensor.set_lod(tensor->lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, + &fp16_tensor); + + // reset output tensor + var->Clear(); + tensor = var->GetMutable(); + tensor->set_lod(fp16_tensor.lod()); + tensor->ShareDataWith(fp16_tensor); + } + } + + void LoadSelectedRows(std::istream &fin, const platform::Place &place, + framework::Variable *var) const { + auto *selectedRows = var->GetMutable(); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::DeserializeFromStream(fin, selectedRows, dev_ctx); + selectedRows->SyncIndex(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 7c8fe5fbd7629b2d82552135bc1b052dfbabeba0..a814c365d70ae91490e7fb50a0baf8fec05d97ef 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -32,7 +32,10 @@ class LoDResetOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(level0.size(), 1, "If Input(Y) not provided, the target lod should be " "specified by attribute `target_lod`."); + } else { + ctx->ShareLoD("Y", "Out"); } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); } diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 035e10dcbe4e2083723e47d7dda75ce267a9f141..1b433067900af71bb8a6833cef019d41f9c76858 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -78,12 +78,6 @@ class PaddingLoDTensorFunctor { 
"The numel of 'pad_value' can only be 1 or be equal to the " "'step_width'."); - if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { - TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor); - pad_tensor->Resize(pad_tensor_dims); - return; - } - const int kBlockSize = 512; /* At least use 32 threads to copy sequence_width elements, @@ -129,12 +123,13 @@ class UnpaddingLoDTensorFunctor { CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, step_width, layout); - + /* if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor); seq_tensor->Resize(seq_tensor_dims); return; } + */ const int kBlockSize = 512; diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 242a1b9ae92ade0caf1b0f1fcb5458b8b7070d84..f18282745200cc8ef9460e60728d777112f2b798 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -290,8 +290,10 @@ class MatMulOp : public framework::OperatorWithKernel { context->Attrs().Get("transpose_Y")); PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_); - PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ || - mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0); + if (context->IsRuntime()) { + PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ || + mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0); + } std::vector dim_out; if (mat_dim_x.batch_size_ != 0) { dim_out = framework::vectorize(dim_x); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 54c6a71111a2cc2f9e5004922ae5d3541a9d0a70..97387af92ffbd123ae6e795f17ef2273dadeab9d 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -38,15 +39,20 @@ static void EnforceLayouts(const std::vector inputs) { } static memory::primitive_desc CreateMemPrimDesc(const Tensor& input, - const mkldnn::engine& engine) { - constexpr auto data_type = mkldnn::memory::f32; + const mkldnn::engine& engine, + const memory::data_type& dt) { const auto dims = paddle::framework::vectorize2int(input.dims()); const auto format = input.format(); - auto description = memory::desc(dims, data_type, format); + auto description = memory::desc(dims, dt, format); auto mem_prim_desc = memory::primitive_desc(description, engine); return mem_prim_desc; } +static mkldnn::memory::format GetDstMemFormat( + const concat::primitive_desc& concat_pd) { + return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; +} + static platform::CPUPlace GetCpuPlace( const paddle::framework::ExecutionContext& ctx) { auto place = ctx.GetPlace(); @@ -61,14 +67,30 @@ static const mkldnn::engine& GetMKLDNNEngine( return dev_ctx.GetEngine(); } +std::string CreateKey(const paddle::framework::ExecutionContext& ctx, + const std::vector multi_input, + const int64_t& concat_axis, const memory::data_type& dt) { + std::string key; + key.reserve(platform::MKLDNNHandler::MaxKeyLength); + for (size_t i = 0; i < multi_input.size(); i++) { + platform::MKLDNNHandler::AppendKeyDims( + &key, paddle::framework::vectorize2int(multi_input[i]->dims())); + } + platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis)); + platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out")); + platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); + return key; +} + template class ConcatPrimitiveFactory { public: concat::primitive_desc CreateConcatPrimDescriptor( const std::vector multi_input, Tensor* output, - int concat_axis, const mkldnn::engine& 
mkldnn_engine) { - CreateSourcesDescriptors(multi_input, mkldnn_engine); - auto dst_desc = CreateDstMemDescriptor(output); + int concat_axis, const mkldnn::engine& mkldnn_engine, + const memory::data_type& dt = memory::data_type::f32) { + CreateSourcesDescriptors(multi_input, mkldnn_engine, dt); + auto dst_desc = CreateDstMemDescriptor(output, dt); return concat::primitive_desc(dst_desc, concat_axis, srcs_pd); } @@ -79,23 +101,39 @@ class ConcatPrimitiveFactory { return concat(concat_pd, inputs, dst_mem.get()); } + void SetSrcDataHandleByIndex(const std::vector& srcs, const size_t& i, + void* handler) { + srcs[i].set_data_handle(handler); + } + + void SetDstDataHandle(const memory& dst_mem, void* handler) { + dst_mem.set_data_handle(handler); + } + + std::vector GetSrcs() { return srcs; } + + memory GetDst() { return dst_mem.get(); } + private: - memory::desc CreateDstMemDescriptor(Tensor* output) { + memory::desc CreateDstMemDescriptor(Tensor* output, + const memory::data_type& dt) { auto dst_dims = paddle::framework::vectorize2int(output->dims()); - return memory::desc(dst_dims, platform::MKLDNNGetDataType(), - memory::format::any); + return memory::desc(dst_dims, dt, memory::format::any); } mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, - Tensor* output, platform::CPUPlace place) { + Tensor* output, + const platform::CPUPlace& place) { return memory(concat_pd.dst_primitive_desc(), output->mutable_data(place)); } void CreateSourcesDescriptors(const std::vector multi_input, - const mkldnn::engine& mkldnn_engine) { + const mkldnn::engine& mkldnn_engine, + const memory::data_type& dt) { for (size_t i = 0; i < multi_input.size(); i++) { - auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine); + auto mem_prim_desc = + CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt); srcs_pd.push_back(mem_prim_desc); srcs.push_back( memory(mem_prim_desc, to_void_cast(multi_input[i]->data()))); @@ -120,21 +158,59 @@ template class 
ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto place = GetCpuPlace(ctx); - const auto& mkldnn_engine = GetMKLDNNEngine(ctx); - auto multi_input = ctx.MultiInput("X"); EnforceLayouts(multi_input); Tensor* output = ctx.Output("Out"); int64_t concat_axis = static_cast(ctx.Attr("axis")); + auto& dev_ctx = + ctx.template device_context(); + auto place = GetCpuPlace(ctx); + + memory::data_type dt = + paddle::framework::ToMKLDNNDataType(multi_input[0]->type()); ConcatPrimitiveFactory prim_creator; - auto concat_pd = prim_creator.CreateConcatPrimDescriptor( - multi_input, output, static_cast(concat_axis), mkldnn_engine); - auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place); - stream(stream::kind::eager).submit({concat}).wait(); + std::string key = CreateKey(ctx, multi_input, concat_axis, dt); + const std::string key_prim = key + "@concat_p"; + const std::string key_concat_pd = key + "@concat_pd"; + const std::string key_srcs = key + "@concat_srcs"; + const std::string key_dst = key + "@concat_dst"; + + std::shared_ptr concat_pd; + std::shared_ptr> srcs; + std::shared_ptr dst_mem; + auto concat_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); + + if (concat_p == nullptr) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + concat_pd = std::make_shared( + prim_creator.CreateConcatPrimDescriptor(multi_input, output, + static_cast(concat_axis), + mkldnn_engine, dt)); + concat_p = std::make_shared( + prim_creator.CreateConcatPrimitive(*concat_pd, output, place)); + srcs = std::make_shared>(prim_creator.GetSrcs()); + dst_mem = std::make_shared(prim_creator.GetDst()); + dev_ctx.SetBlob(key_prim, concat_p); + dev_ctx.SetBlob(key_concat_pd, concat_pd); + dev_ctx.SetBlob(key_srcs, srcs); + dev_ctx.SetBlob(key_dst, dst_mem); + } else { + srcs = std::static_pointer_cast>( + dev_ctx.GetBlob(key_srcs)); + dst_mem = 
std::static_pointer_cast(dev_ctx.GetBlob(key_dst)); + concat_pd = std::static_pointer_cast( + dev_ctx.GetBlob(key_concat_pd)); + for (size_t i = 0; i < multi_input.size(); i++) { + prim_creator.SetSrcDataHandleByIndex( + *srcs, i, to_void_cast(multi_input[i]->data())); + } + prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data(place)); + } + + stream(stream::kind::eager).submit({*concat_p}).wait(); - output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc()); + output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc()); } }; } // namespace operators @@ -143,4 +219,6 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConcatMKLDNNOpKernel) + ops::ConcatMKLDNNOpKernel, + ops::ConcatMKLDNNOpKernel, + ops::ConcatMKLDNNOpKernel); diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc index cd32200e925193b393f4531b87ed6b1e4291109d..9f73bbc1fdc72766a0b57bc72c62d208277c2f20 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -92,12 +92,10 @@ static std::vector> NgraphOpIntervals( int size = ops->size(); int left = 0; - while (left < size && ops->at(left)->Type() != framework::kFeedOpType) { + while (left < size && ops->at(left)->Type() != framework::kFeedOpType && + ops->at(left)->Type() != framework::kFetchOpType) { ++left; } - if (left == size) { - return intervals; - } while (left < size && ops->at(left)->Type() == framework::kFeedOpType) { for (auto& var_name_item : ops->at(left)->Outputs()) { @@ -112,10 +110,6 @@ static std::vector> NgraphOpIntervals( while (right < size && ops->at(right)->Type() != framework::kFetchOpType) { ++right; } - if (right == size) { - return intervals; - } - if (left >= right) return intervals; int index = right; while (index < size && ops->at(index)->Type() == framework::kFetchOpType) { @@ 
-127,6 +121,10 @@ static std::vector> NgraphOpIntervals( ++index; } + if (left == size || ops->at(left)->Type() == framework::kFetchOpType) { + left = 0; + } + // (left, right - 1) represents indices between feed and fetch int pivot = left; while (pivot < right) { @@ -234,6 +232,7 @@ NgraphEngine::NgraphEngine(const framework::Scope& scope, } void NgraphEngine::Prepare(const std::vector& interval) { + bool has_fetch = false, is_full = false; for (auto& var : p_bdesc->AllVars()) { if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || var->GetType() == framework::proto::VarType::LOD_TENSOR || @@ -264,6 +263,9 @@ void NgraphEngine::Prepare(const std::vector& interval) { std::vector ops_desc; for (auto op_desc : p_bdesc->AllOps()) { ops_desc.emplace_back(op_desc); + if (op_desc->Type() == framework::kFetchOpType) { + has_fetch = true; + } } for (auto op_desc : ops_desc) { @@ -276,11 +278,11 @@ void NgraphEngine::Prepare(const std::vector& interval) { if (interval[0] > 0 && ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType && interval[1] < static_cast(ops_desc.size()) && - ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { - this->op_state_ = OpState::FULL; + ops_desc.at(interval[1])->Type() == framework::kFetchOpType) { + is_full = true; } - if (this->op_state_ == OpState::FULL) { + if (is_full) { this->op_state_ = this->is_test_ ? 
OpState::FULL_TEST : OpState::FULL_TRAIN; } else { this->op_state_ = @@ -293,7 +295,8 @@ void NgraphEngine::Prepare(const std::vector& interval) { framework::OpRegistry::CreateOp(*(ops_desc[idx]))); ++idx; } - while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { + while (idx < static_cast(ops_desc.size()) && + ops_desc.at(idx)->Type() != framework::kFetchOpType) { auto op_desc = ops_desc.at(idx); for (auto& var_name_item : op_desc->Inputs()) { for (auto& var_name : var_name_item.second) { @@ -303,6 +306,10 @@ void NgraphEngine::Prepare(const std::vector& interval) { ++idx; } + if (!has_fetch) { + op_state_ = OpState::UNKNOWN; + } + BuildNgIO(ops_desc, interval); } @@ -318,7 +325,8 @@ void NgraphEngine::BuildNgIO(const std::vector& ops_desc, const bool is_output = outputs.find(var_name) != outputs.end(); if (!is_output && std::find(var_in_.begin(), var_in_.end(), var_name) == - var_in_.end()) { + var_in_.end() && + scope_.FindVar(var_name)) { // fill var_in here to keep lhs and rhs order this->var_in_.emplace_back(var_name); } @@ -378,6 +386,7 @@ void NgraphEngine::BuildNgIO(const std::vector& ops_desc, } } } + for (size_t i = 0; i < var_in_.size(); ++i) { auto var_name = var_in_[i]; if (persistables_.find(var_name) == persistables_.end()) { diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h index fef51464b5702e61d052f28050f6aefaecf0f615..b6532519e947bc59f0605c4f2008270f5e51b0e0 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_ -#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_ +#pragma once + #include #include #include @@ -35,7 +35,6 @@ enum class OpState { /* nGraph support state on ops */ PARTIAL_TRAIN, /* Support partial ops for train */ FULL_TEST, /* Support full list of ops for test */ PARTIAL_TEST, /* Support partial list of ops for test */ - FULL, /* All ops supported from feed to fetch */ UNKNOWN /* Output all for debug purpose */ }; @@ -119,4 +118,3 @@ class NgraphEngine { } // namespace operators } // namespace paddle -#endif // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_ diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h index be36b9d21ef6ebe5c11d783462e7dc564afe2aba..c92ebb7e96fa22f8fd463c5837134cd74542766c 100644 --- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h +++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h @@ -27,13 +27,9 @@ namespace paddle { namespace operators { namespace ngraphs { -void BuildCrossEntropyNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); - auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); +std::shared_ptr GetCrossEntropy( + std::shared_ptr x, std::shared_ptr label, + const bool is_soft_label, int ignore_index) { auto label_shape = label->get_shape(); auto x_shape = x->get_shape(); auto label_rank = label_shape.size(); @@ -46,18 +42,16 @@ void BuildCrossEntropyNode( label_2d = paddle::platform::NgReshaper(label, label_2d_shape); } if (x_rank > 2) { - x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_rank - 1); - x_2d = paddle::platform::NgReshaper(x, x_2d_shape); + x_2d_shape = platform::FlattenTo2d(x_shape, x_rank - 1); + x_2d = platform::NgReshaper(x, x_2d_shape); } auto batch_size = x_2d_shape.at(0); - auto op_attrs = paddle::framework::AttrReader(op->Attrs()); - 
const bool is_soft_label = op_attrs.Get("soft_label"); std::shared_ptr node_1_hot = label_2d; if (!is_soft_label) { - auto label_1d = paddle::platform::NgReshaper( - label_2d, ngraph::Shape{label_2d_shape.at(0)}); + auto label_1d = + platform::NgReshaper(label_2d, ngraph::Shape{label_2d_shape.at(0)}); node_1_hot = std::make_shared(label_1d, x_2d_shape, 1); } if (x->get_element_type() != node_1_hot->get_element_type()) { @@ -76,11 +70,9 @@ void BuildCrossEntropyNode( auto node_sum = std::make_shared(node_mul, ngraph::AxisSet{1}); auto node_neg = std::make_shared(node_sum); - auto xe = - paddle::platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1}); + auto xe = platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1}); if (!is_soft_label) { - auto ignore_index = op_attrs.Get("ignore_index"); auto ignore_node = ngraph::op::Constant::create( label->get_element_type(), label_2d_shape, {ignore_index}); auto not_equal_node = @@ -89,21 +81,13 @@ void BuildCrossEntropyNode( xe->get_element_type()); xe = xe * mask; } - - paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map); + return xe; } -void BuildCrossEntropyGradNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto op_attrs = paddle::framework::AttrReader(op->Attrs()); - const bool is_soft_label = op_attrs.Get("soft_label"); - - auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); - auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); - auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map); +std::shared_ptr GetCrossEntropyGrad( + std::shared_ptr x, std::shared_ptr label, + std::shared_ptr dy, const bool is_soft_label, + int ignore_index) { auto x_shape = x->get_shape(); auto rank = x_shape.size(); @@ -111,9 +95,8 @@ void BuildCrossEntropyGradNode( if (!is_soft_label) { auto label_shape = label->get_shape(); label_shape.pop_back(); - label = paddle::platform::NgReshaper(label, label_shape); + label = 
platform::NgReshaper(label, label_shape); - auto ignore_index = op_attrs.Get("ignore_index"); auto ignore_node = ngraph::op::Constant::create( label->get_element_type(), label_shape, {ignore_index}); auto not_equal_node = @@ -128,7 +111,7 @@ void BuildCrossEntropyGradNode( auto dy_shape = dy->get_shape(); dy_shape.pop_back(); - auto dy_reshape = paddle::platform::NgReshaper(dy, dy_shape); + auto dy_reshape = platform::NgReshaper(dy, dy_shape); auto dy_bcast = std::make_shared( dy_reshape, x_shape, ngraph::AxisSet{rank - 1}); if (x->get_element_type() != label->get_element_type()) { @@ -140,7 +123,35 @@ void BuildCrossEntropyGradNode( if (!is_soft_label) { xe_grad = xe_grad * mask; } + return xe_grad; +} +void BuildCrossEntropyNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + int ignore_index = op_attrs.Get("ignore_index"); + auto xe = GetCrossEntropy(x, label, is_soft_label, ignore_index); + paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map); +} + +void BuildCrossEntropyGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + int ignore_index = op_attrs.Get("ignore_index"); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto dy = paddle::platform::GetInputNode(op, "Y@GRAD", ngb_node_map); + auto xe_grad = GetCrossEntropyGrad(x, label, dy, is_soft_label, ignore_index); paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map); } } // namespace ngraphs diff --git 
a/paddle/fluid/operators/ngraph/ops/softmax_op.h b/paddle/fluid/operators/ngraph/ops/softmax_op.h index 7d5720c460c4194ce06670a715b8d7ff4435bb2a..174b7a91a8dd0e3edb06f224c3914e24c6c4a96d 100644 --- a/paddle/fluid/operators/ngraph/ops/softmax_op.h +++ b/paddle/fluid/operators/ngraph/ops/softmax_op.h @@ -27,12 +27,7 @@ namespace paddle { namespace operators { namespace ngraphs { -void BuildSoftmaxNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); +std::shared_ptr GetSoftmax(std::shared_ptr x) { auto x_shape = x->get_shape(); int rank = x_shape.size(); auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, rank - 1); @@ -47,16 +42,11 @@ void BuildSoftmaxNode( -64., x_shifted); auto softmax = std::make_shared(x_clipped, ngraph::AxisSet{1}); - paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map); + return softmax; } -void BuildSoftmaxGradNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map); - auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); +std::shared_ptr GetSoftmaxGrad( + std::shared_ptr out, std::shared_ptr dout) { auto out_shape = out->get_shape(); int rank = out_shape.size(); auto out_2d_shape = paddle::platform::FlattenTo2d(out_shape, rank - 1); @@ -70,6 +60,27 @@ void BuildSoftmaxGradNode( auto node_bcast = std::make_shared( node_sum, out_2d_shape, ngraph::AxisSet{1}); auto dx = (dout - node_bcast) * out; + return dx; +} + +void BuildSoftmaxNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto softmax = GetSoftmax(x); + paddle::platform::SetOutputNode(op, "Out", softmax, ngb_node_map); +} + +void BuildSoftmaxGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + 
ngb_node_map) { + auto out = paddle::platform::GetInputNode(op, "Out", ngb_node_map); + auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto dx = GetSoftmaxGrad(out, dout); paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map); } } // namespace ngraphs diff --git a/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a6bdf4de9522e08caf4a9ae606db8277f98cdab3 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h @@ -0,0 +1,90 @@ +/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/cross_entropy_op.h" +#include "paddle/fluid/operators/ngraph/ops/softmax_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildSoftmaxWithCrossEntropyNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto logits = paddle::platform::GetInputNode(op, "Logits", ngb_node_map); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto softmax = paddle::operators::ngraphs::GetSoftmax(logits); + + auto op_attrs = framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + int ignore_index = op_attrs.Get("ignore_index"); + auto xe = paddle::operators::ngraphs::GetCrossEntropy( + softmax, label, is_soft_label, ignore_index); + + paddle::platform::SetOutputNode(op, "Softmax", softmax, ngb_node_map); + paddle::platform::SetOutputNode(op, "Loss", xe, ngb_node_map); +} + +void BuildSoftmaxWithCrossEntropyGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = framework::AttrReader(op->Attrs()); + const bool is_soft_label = op_attrs.Get("soft_label"); + auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); + auto softmax = paddle::platform::GetInputNode(op, "Softmax", ngb_node_map); + auto loss_grad = + paddle::platform::GetInputNode(op, "Loss@GRAD", ngb_node_map); + auto softmax_shape = softmax->get_shape(); + auto rank = softmax_shape.size(); + if (!is_soft_label) { + auto label_shape = label->get_shape(); + label_shape.pop_back(); + label = platform::NgReshaper(label, label_shape); + + label = + std::make_shared(label, softmax_shape, rank - 1); + } + + auto loss_grad_shape = loss_grad->get_shape(); + loss_grad_shape.pop_back(); + auto loss_grad_reshape = 
platform::NgReshaper(loss_grad, loss_grad_shape); + auto loss_grad_bcast = std::make_shared( + loss_grad_reshape, softmax_shape, ngraph::AxisSet{rank - 1}); + if (softmax->get_element_type() != label->get_element_type()) { + label = std::make_shared(label, + softmax->get_element_type()); + } + + auto logits_grad = loss_grad_bcast * (softmax - label); + paddle::platform::SetOutputNode(op, "Logits@GRAD", logits_grad, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle + +REGISTER_NG_OP(softmax_with_cross_entropy, BuildSoftmaxWithCrossEntropyNode); +REGISTER_NG_OP(softmax_with_cross_entropy_grad, + BuildSoftmaxWithCrossEntropyGradNode); diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee8c68fd008c8c9764e9ef74dc37fa08cf31be19 --- /dev/null +++ b/paddle/fluid/operators/range_op.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/range_op.h" + +namespace paddle { +namespace operators { + +class RangeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->HasInput("Start")) { + auto s_dims = ctx->GetInputDim("Start"); + PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1), + "The shape of Input(Start) should be [1]."); + } + if (ctx->HasInput("End")) { + auto e_dims = ctx->GetInputDim("End"); + PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1), + "The shape of Input(End) should be [1]."); + } + if (ctx->HasInput("Step")) { + auto step_dims = ctx->GetInputDim("Step"); + PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1), + "The shape of Input(Step) should be [1]."); + } + ctx->SetOutputDim("Out", {-1}); + } +}; + +class RangeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Start", + "Start of interval. The interval includes this value. It is a " + "tensor with shape=[1]."); + AddInput("End", + "End of interval. The interval does not include this value, " + "except in some cases where step is not an integer and floating " + "point round-off affects the length of out. It is a tensor with " + "shape=[1]."); + AddInput("Step", "Spacing between values. It is a tensor with shape=[1]."); + AddOutput("Out", "A sequence of numbers."); + AddComment(R"DOC( + Return evenly spaced values within a given interval. Values are generated within the half-open interval [start, stop) (in other words, the interval including start but excluding stop). Like arange function of numpy. 
+)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker); +REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel, + ops::CPURangeKernel, ops::CPURangeKernel, + ops::CPURangeKernel); diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e2c03716d55ee41ce3a9053b48b5c6d4c70e391f --- /dev/null +++ b/paddle/fluid/operators/range_op.cu @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/range_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void RangeKernel(T start, T step, int64_t size, T* out) { + CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; } +} + +template +class CUDARangeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* start_t = context.Input("Start"); + auto* end_t = context.Input("End"); + auto* step_t = context.Input("Step"); + auto* out = context.Output("Out"); + + framework::Tensor n; + framework::TensorCopy(*start_t, platform::CPUPlace(), &n); + T start = n.data()[0]; + framework::TensorCopy(*end_t, platform::CPUPlace(), &n); + T end = n.data()[0]; + framework::TensorCopy(*step_t, platform::CPUPlace(), &n); + T step = n.data()[0]; + + int64_t size = 0; + GetSize(start, end, step, &size); + out->Resize(framework::make_ddim({size})); + T* out_data = out->mutable_data(context.GetPlace()); + + auto stream = context.cuda_device_context().stream(); + int block = 512; + int grid = (size + block - 1) / block; + RangeKernel<<>>(start, step, size, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel, + ops::CUDARangeKernel, + ops::CUDARangeKernel, + ops::CUDARangeKernel); diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fce58b45c96ad76dfdd4ed7f54becde327070002 --- /dev/null +++ b/paddle/fluid/operators/range_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +void GetSize(T start, T end, T step, int64_t* size) { + PADDLE_ENFORCE(!std::equal_to()(step, 0), + "The step of range op should not be 0."); + PADDLE_ENFORCE(((start < end) && (step > 0)) || ((start > end) && (step < 0)), + "The step should be greater than 0 while start < end. And the " + "step should be less than 0 while start > end."); + *size = std::is_integral::value + ? 
((std::abs(end - start) + std::abs(step) - 1) / std::abs(step)) + : std::ceil(std::abs((end - start) / step)); +} + +template +class CPURangeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + T start = context.Input("Start")->data()[0]; + T end = context.Input("End")->data()[0]; + T step = context.Input("Step")->data()[0]; + auto* out = context.Output("Out"); + int64_t size = 0; + GetSize(start, end, step, &size); + out->Resize(framework::make_ddim({size})); + T* out_data = out->mutable_data(context.GetPlace()); + T value = start; + for (int64_t i = 0; i < size; ++i) { + out_data[i] = value; + value += step; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index d0edcc170f0afbccdcdf83eed9a167b7602e34ab..62b1e09737a4af4d0fe08eafcb3b2999d97032c1 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -12,87 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/port.h" +#include + +#include "paddle/fluid/operators/save_combine_op.h" namespace paddle { namespace operators { -class SaveCombineOp : public framework::OperatorBase { +class SaveCombineOp : public framework::OperatorWithKernel { public: - SaveCombineOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto filename = Attr("file_path"); - auto overwrite = Attr("overwrite"); - auto save_as_fp16 = Attr("save_as_fp16"); - - bool is_present = FileExists(filename); - if (is_present && !overwrite) { - PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", - filename, overwrite); - } - - MkDirRecursively(DirName(filename).c_str()); - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - - auto inp_var_names = Inputs("X"); - PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, - "The number of input variables should be greater than 0"); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); + using framework::OperatorWithKernel::OperatorWithKernel; - for (size_t i = 0; i < inp_var_names.size(); i++) { - auto *var = scope.FindVar(inp_var_names[i]); - - PADDLE_ENFORCE(var != nullptr, - "Cannot find variable %s for save_combine_op", - inp_var_names[i]); - 
PADDLE_ENFORCE(var->IsType(), - "SaveCombineOp only supports LoDTensor, %s has wrong type", - inp_var_names[i]); - - auto &tensor = var->Get(); - // Serialize tensors one by one - - // Check types to see if a fp16 transformation is required - auto in_dtype = tensor.type(); - auto out_dtype = - save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor out; - // copy LoD info to the new tensor - out.set_lod(tensor.lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); - framework::SerializeToStream(fout, out, dev_ctx); - } else { - framework::SerializeToStream(fout, tensor, dev_ctx); - } - } - fout.close(); - } + void InferShape(framework::InferShapeContext *ctx) const override {} }; class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { @@ -105,7 +36,7 @@ class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( SaveCombine operator -This operator will serialize and write a list of input LoDTensor variables +This operator will serialize and write a list of input LoDTensor variables to a file on disk. )DOC"); AddAttr("overwrite", @@ -134,3 +65,10 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(save_combine, ops::SaveCombineOp, ops::SaveCombineOpProtoMaker); + +REGISTER_OP_CPU_KERNEL( + save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_combine_op.cu b/paddle/fluid/operators/save_combine_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bc4478b51b111518439fe250a70b8dee0df53ad9 --- /dev/null +++ b/paddle/fluid/operators/save_combine_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_combine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4ee82e17dd5e8173ce7dfb5c248890912d2cc7ef --- /dev/null +++ b/paddle/fluid/operators/save_combine_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace operators { +template +class SaveCombineOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + auto filename = ctx.Attr("file_path"); + auto overwrite = ctx.Attr("overwrite"); + auto save_as_fp16 = ctx.Attr("save_as_fp16"); + + bool is_present = FileExists(filename); + if (is_present && !overwrite) { + PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename, std::ios::binary); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto &inp_var_names = ctx.Inputs("X"); + auto &inp_vars = ctx.MultiInputVar("X"); + PADDLE_ENFORCE_GT(static_cast(inp_var_names.size()), 0, + "The number of input variables should be greater than 0"); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + for (size_t i = 0; i < inp_var_names.size(); i++) { + PADDLE_ENFORCE(inp_vars[i] != nullptr, + "Cannot find variable %s for save_combine_op", + inp_var_names[i]); + PADDLE_ENFORCE(inp_vars[i]->IsType(), + "SaveCombineOp only supports LoDTensor, %s has wrong type", + inp_var_names[i]); + + auto &tensor = inp_vars[i]->Get(); + // Serialize tensors one by one + + // Check types to see if a fp16 transformation is required + auto in_dtype = tensor.type(); + auto out_dtype = + save_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); + framework::SerializeToStream(fout, out, dev_ctx); + } else { + framework::SerializeToStream(fout, tensor, dev_ctx); + } + } + fout.close(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc index 4743e0d9499b111d8baa921dbb245431713fd7a8..5594de16b6789e99d5c4cc6828889eb0e311624a 100644 --- a/paddle/fluid/operators/save_load_combine_op_test.cc +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" -USE_NO_KERNEL_OP(save_combine); -USE_NO_KERNEL_OP(load_combine); +USE_CPU_ONLY_OP(save_combine); +USE_CPU_ONLY_OP(load_combine); template T* CreateForSaveCombineOp(int x, int y, const std::vector& lod_info, diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc index ccaea0eef2906953d922e097348b6c0a86dad6f1..d277198a2f92c426586e774873c6770b93660e85 100644 --- a/paddle/fluid/operators/save_load_op_test.cc +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" -USE_NO_KERNEL_OP(save); -USE_NO_KERNEL_OP(load); +USE_CPU_ONLY_OP(save); +USE_CPU_ONLY_OP(load); TEST(SaveLoadOp, CPU) { paddle::framework::Scope scope; diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index b02c098099625ca544fd889d5bb1c13ef2374450..338e2fbb5d868f146c9ff420b2d5d4cf6088316e 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -15,118 +15,24 @@ limitations under the License. */ #include #include #include +#include +#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/operators/save_op.h" namespace paddle { namespace operators { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -class SaveOp : public framework::OperatorBase { +class SaveOp : public framework::OperatorWithKernel { public: - SaveOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto iname = Input("X"); - auto *var = scope.FindVar(iname); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", - iname); - - if (var->IsType()) { - SaveLodTensor(place, var); - } else if (var->IsType()) { - SaveSelectedRows(scope, place, var); - } else { - PADDLE_ENFORCE( - false, - "SaveOp only support LoDTensor and SelectedRows, %s has wrong type", - iname); - } - } + using framework::OperatorWithKernel::OperatorWithKernel; - void SaveLodTensor(const platform::Place &place, - framework::Variable *var) const { - auto filename = Attr("file_path"); - auto overwrite = Attr("overwrite"); - - if (FileExists(filename) && !overwrite) { - PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", - filename, overwrite); - } - - MkDirRecursively(DirName(filename).c_str()); - - auto &tensor = var->Get(); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - - auto save_as_fp16 = Attr("save_as_fp16"); - auto in_dtype = tensor.type(); - auto out_dtype = save_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor out; - framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); - // copy LoD info to the new tensor - out.set_lod(tensor.lod()); - framework::SerializeToStream(fout, out, dev_ctx); - } else { - framework::SerializeToStream(fout, tensor, dev_ctx); - } - fout.close(); - } + void InferShape(framework::InferShapeContext *ctx) const override {} - void SaveSelectedRows(const framework::Scope &scope, - const platform::Place &place, - framework::Variable *var) const { - auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable(); - PADDLE_ENFORCE( - lt_var != nullptr, - "Can not find variable kLookupTablePath for SaveSelectedRows"); - std::string filename = lt_var->data(); - VLOG(4) << "SaveSelectedRows get File name: " << filename; - - MkDirRecursively(DirName(filename).c_str()); - - auto &selectedRows = var->Get(); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. 
- std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -154,14 +60,20 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file "The \"file_path\" where the variable will be saved.") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); + AddOutput(LOOKUP_TABLE_PATH, + "(string)" + "for pserver: The \"kLookupTablePath\" where checkpoint notify " + "to save lookup table variables" + " to directory specified.") + .AsDispensable(); } }; class SaveOpVarTypeInference : public framework::VarTypeInference { public: void operator()(framework::InferVarTypeContext *ctx) const override { - auto out_var_name = ctx->Output(LOOKUP_TABLE_PATH).front(); - ctx->SetType(out_var_name, framework::proto::VarType::RAW); + auto var_type = framework::proto::VarType::RAW; + ctx->SetType(LOOKUP_TABLE_PATH, var_type); } }; @@ -169,11 +81,18 @@ class SaveOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override {} }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker, - ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference, - ops::SaveOpShapeInference); +REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker, + ops::SaveOpVarTypeInference, ops::SaveOpShapeInference); + +REGISTER_OP_CPU_KERNEL( + save, ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu new file mode 100644 index 
0000000000000000000000000000000000000000..0a778a694e52f146b6cceddb969b8af08f40ef9e --- /dev/null +++ b/paddle/fluid/operators/save_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + save, ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h new file mode 100644 index 0000000000000000000000000000000000000000..642235aad58bef2ec7f741ee5fb5a65a2081f4ce --- /dev/null +++ b/paddle/fluid/operators/save_op.h @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace operators { +// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables +// to directory specified. +constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; +template +class SaveOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + + auto *input_var = ctx.InputVar("X"); + auto iname = ctx.Inputs("X").data(); + PADDLE_ENFORCE(input_var != nullptr, "Cannot find variable %s for save_op", + iname); + + if (input_var->IsType()) { + SaveLodTensor(ctx, place, input_var); + } else if (input_var->IsType()) { + SaveSelectedRows(ctx, place, input_var); + } else { + PADDLE_ENFORCE( + false, + "SaveOp only support LoDTensor and SelectedRows, %s has wrong type", + iname); + } + } + + void SaveLodTensor(const framework::ExecutionContext &ctx, + const platform::Place &place, + const framework::Variable *var) const { + auto filename = ctx.Attr("file_path"); + auto overwrite = ctx.Attr("overwrite"); + + if (FileExists(filename) && !overwrite) { + PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", + filename, overwrite); + } + + MkDirRecursively(DirName(filename).c_str()); + + auto &tensor = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. 
+ std::ofstream fout(filename, std::ios::binary); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto save_as_fp16 = ctx.Attr("save_as_fp16"); + auto in_dtype = tensor.type(); + auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); + framework::SerializeToStream(fout, out, dev_ctx); + } else { + framework::SerializeToStream(fout, tensor, dev_ctx); + } + fout.close(); + } + + void SaveSelectedRows(const framework::ExecutionContext &ctx, + const platform::Place &place, + const framework::Variable *var) const { + framework::Variable *out_put_var = ctx.OutputVar(LOOKUP_TABLE_PATH); + PADDLE_ENFORCE( + out_put_var != nullptr, + "Can not find variable kLookupTablePath for SaveSelectedRows"); + auto *lt_var = out_put_var->GetMutable(); + + std::string filename = lt_var->data(); + VLOG(4) << "SaveSelectedRows get File name: " << filename; + + MkDirRecursively(DirName(filename).c_str()); + + auto &selectedRows = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. 
+ std::ofstream fout(filename, std::ios::binary); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + framework::SerializeToStream(fout, selectedRows, dev_ctx); + fout.close(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 7754d2bfebdbc81e25432641b2eb4315386f75ff..fda971b20e27b68cab6110c323469f0d1c77cb59 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include namespace paddle { namespace operators { @@ -187,7 +188,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetInput("Label", Input("Label")); grad_op->SetInput("Softmax", Output("Softmax")); - grad_op->SetInput("Loss", Output("Loss")); grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax")); grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index ecfb4e89566f3d72b3c262946c370bf34ce7515a..dc15df2c3c1b8a2964312d983be8ce362d3ab95d 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -40,7 +40,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase { "tensor's rank."); } - auto out_dims = GetOutputShape(axes, x_dims); + auto out_dims = GetOutputShape(axes, x_dims, false); ctx->SetOutputDim("Out", out_dims); if (x_dims[0] == out_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) @@ -50,7 +50,8 @@ class SqueezeOpInferShape 
: public framework::InferShapeBase { } static framework::DDim GetOutputShape(const std::vector squeeze_dims, - const framework::DDim &in_dims) { + const framework::DDim &in_dims, + bool is_runtime) { size_t num_squeeze_dims = squeeze_dims.size(); int cnt_squeezed_dims = 0; bool should_squeeze[9] = {false}; @@ -71,9 +72,12 @@ class SqueezeOpInferShape : public framework::InferShapeBase { // Check current index, the upper limit has beed checked in line 36. PADDLE_ENFORCE(current >= 0, "Invalid axis, the negative axis is out of range."); - PADDLE_ENFORCE(in_dims[current] == 1, - "Invalid axis index, the axis that will be squeezed " - "should be equal to 1."); + + if (is_runtime) { + PADDLE_ENFORCE(in_dims[current] == 1, + "Invalid axis index, the axis that will be squeezed " + "should be equal to 1."); + } if (!(should_squeeze[current])) { ++cnt_squeezed_dims; @@ -104,7 +108,7 @@ class SqueezeOp : public framework::OperatorBase { const platform::Place &place) const override { auto &axes = Attr>("axes"); auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims); + auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims, true); framework::AttributeMap attrs; attrs["shape"] = framework::vectorize2int(out_dims); @@ -224,7 +228,7 @@ class Squeeze2Op : public framework::OperatorBase { const platform::Place &place) const override { auto &axes = Attr>("axes"); auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims); + auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims, true); framework::AttributeMap attrs; attrs["shape"] = framework::vectorize2int(out_dims); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 9e77f7252de1545e04bd2feaff27374c189dfc48..db763a051d1e08b962a40913d290c69e7c61ec32 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ 
-34,8 +34,11 @@ class TopkOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape"); - PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, - "input must have >= k columns"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k, + "input must have >= k columns"); + } framework::DDim dims = input_dims; dims[dims.size() - 1] = k; diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 1eb8d9691a1e591117e49c2cbe1ab691cbab4a5b..9dbc72f561b04b3005e2ef029e0c4ea6c2c312b1 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -23,6 +23,9 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" +#if !defined(__APPLE__) && !defined(_WIN32) +#include "paddle/fluid/platform/dynload/nccl.h" +#endif #include "paddle/fluid/platform/gpu_info.h" #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 400a6d7bfa5912774c4bbb2a5868dd9a471afd00..47cca879b4b71f58778cf3d1f24cab463ac73418 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" - #include #include #include @@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; constexpr static float fraction_of_gpu_memory_to_use = 0.5f; #endif +constexpr static float fraction_reserve_gpu_memory = 0.05f; + DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "Allocate a trunk of gpu memory that is this fraction of the " "total gpu memory size. 
Future memory usage will be allocated " @@ -38,6 +39,24 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_uint64( + initial_gpu_memory_in_mb, 0ul, + "Allocate a trunk of gpu memory whose byte size is specified by " + "the flag. Future memory usage will be allocated from the " + "truck. If the trunk doesn't have enough gpu memory, additional " + "trunks of the gpu memory will be requested from gpu with size " + "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has " + "no memory left for the additional trunk. Note: if you set this " + "flag, the memory size set by " + "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this " + "flag. If you don't set this flag, PaddlePaddle will use " + "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory"); + +DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul, + "If this flag is set, Paddle will reallocate the gpu memory with " + "size specified by this flag. Else Paddle will reallocate by " + "FLAGS_fraction_of_gpu_memory_to_use"); + DEFINE_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " @@ -180,13 +199,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) { } size_t GpuMaxAllocSize() { + return std::max(GpuInitAllocSize(), GpuReallocSize()); +} + +size_t GpuInitAllocSize() { + if (FLAGS_initial_gpu_memory_in_mb > 0ul) { + // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb + return static_cast(FLAGS_initial_gpu_memory_in_mb << 20); + } + + // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by + // fraction size_t total = 0; size_t available = 0; GpuMemoryUsage(&available, &total); + size_t reserving = static_cast(fraction_reserve_gpu_memory * total); - // Reserve the rest for page tables, etc. 
- return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); + return static_cast((total - reserving) * + FLAGS_fraction_of_gpu_memory_to_use); +} + +size_t GpuReallocSize() { + if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) { + // Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb + return static_cast(FLAGS_reallocate_gpu_memory_in_mb << 20); + } + + // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be allocated + // by fraction + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(&available, &total); + size_t reserving = static_cast(fraction_reserve_gpu_memory * total); + + return static_cast((total - reserving) * + FLAGS_fraction_of_gpu_memory_to_use); } size_t GpuMinChunkSize() { @@ -201,16 +250,13 @@ size_t GpuMaxChunkSize() { GpuMemoryUsage(&available, &total); VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" << total / 1024 / 1024 << "M"; - size_t reserving = static_cast(0.05 * total); + size_t reserving = static_cast(fraction_reserve_gpu_memory * total); // If available less than minimum chunk size, no usable memory exists. available = std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), total - reserving); - // Reserving the rest memory for page tables, etc. - - size_t allocating = static_cast(FLAGS_fraction_of_gpu_memory_to_use * - (total - reserving)); + size_t allocating = GpuMaxAllocSize(); PADDLE_ENFORCE_LE(allocating, available, "Insufficient GPU memory to allocation."); diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index 1e1ab2503f53fe20bbe62c48f65d8535947f1aa8..d4be7ac97b2df6fe578582ae296e1dfc5548260c 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total); //! Get the maximum allocation size of current GPU device. size_t GpuMaxAllocSize(); +//! Get the initial allocation size of current GPU device. +size_t GpuInitAllocSize(); + +//! 
Get the re-allocation size of current GPU device. +size_t GpuReallocSize(); + //! Get the minimum chunk size for GPU buddy allocator. size_t GpuMinChunkSize(); diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 0428c40f985d78f0262eb0a73984bc59ab43aac2..b8b14b3d15efb47cbf53a393476f25158ebb5dff 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -22,6 +22,7 @@ #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" @@ -79,7 +80,6 @@ struct NCCLContext { : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} cudaStream_t stream() const { return ctx_->stream(); } - ncclComm_t comm() const { return comm_; } int device_id() const { @@ -105,9 +105,6 @@ struct NCCLContextMap { order_.size(), contexts_.size(), "NCCL Context Map does not support contain two or more same device"); - if (places.size() <= 1 && num_trainers == 1) { - return; - } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. 
if (num_trainers == 1 && nccl_id == nullptr) { @@ -127,8 +124,8 @@ struct NCCLContextMap { } else { rank = trainer_id; } - VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks - << "gpu id: " << gpu_id; + VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks + << " gpu id: " << gpu_id; PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 11e9725aeabf4472324d76aeb78c01f6be2e8c98..236afc77f708c344665821edd4f7c7841c300465 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -242,10 +242,6 @@ void BindAnalysisConfig(py::module *m) { .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) .def("model_from_memory", &AnalysisConfig::model_from_memory) - .def("runtime_context_cache_enabled", - &AnalysisConfig::runtime_context_cache_enabled) - .def("switch_runtime_context_cache", - &AnalysisConfig::SwitchRuntimeContextCache, py::arg("x") = true) .def("pass_builder", &AnalysisConfig::pass_builder, py::return_value_policy::reference); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 609f9c76bfe7474d7f3b5c4f72035878d3d22232..2359d249a8a0c9b73d990210b8c4ae9148bebbb7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1265,6 +1265,10 @@ All parameter, weight, gradient are variables in Paddle. 
"enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) + .def_property( + "fuse_all_reduce_ops", + [](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; }, + [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index dc0a6dcdedaf39c0c489bb6f6e3eb28e6b58a21d..025528e85c4bf4da63b588dd91681d7bf7bb78fe 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -453,6 +453,7 @@ function assert_api_spec_approvals() { echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. + # approval_user_list: velconia 1979255,panyx0718 2887803,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. 
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695` @@ -462,14 +463,14 @@ function assert_api_spec_approvals() { fi else APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ - python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803` + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` fi echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}" else - echo "You must have panyx0718 approval for the api change! ${API_FILE}" + echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! 
${API_FILE}" fi exit 1 fi @@ -479,10 +480,10 @@ function assert_api_spec_approvals() { HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ - python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803` + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then - echo "You must have panyx0718 approval for the const_cast" + echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}" exit 1 fi fi diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index e91fa9292438532a5f696082a179aea7ff3e093f..614a3586156b0a858e2c5d2decec6dc6844c8886 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -41,6 +41,8 @@ int main(int argc, char** argv) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) envs.push_back("fraction_of_gpu_memory_to_use"); + envs.push_back("initial_gpu_memory_in_mb"); + envs.push_back("reallocate_gpu_memory_in_mb"); envs.push_back("allocator_strategy"); #elif __clang__ envs.push_back("use_mkldnn"); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index cb9c75a14f5e69376e706a7da8de808a20e16f5c..63b7b28948a783bc5910d53f6e65a8c09d77bdb1 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -132,7 +132,8 @@ def __bootstrap__(): 'allocator_strategy', 'reader_queue_speed_test_mode', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism', 'enable_parallel_graph', - 
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize', + 'fuse_parameter_groups_size', 'multiple_of_cupti_buffer_size', + 'enable_subgraph_optimize', 'fuse_parameter_memory_size', 'tracer_profile_fname' ] if 'Darwin' not in sysstr: @@ -162,7 +163,8 @@ def __bootstrap__(): if core.is_compiled_with_cuda(): read_env_flags += [ - 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', + 'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb', + 'reallocate_gpu_memory_in_mb', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'sync_nccl_allreduce', 'limit_of_tmp_allocation', diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py index 22dbf7c8b6bb2da7c310a20bdcbaffca248575b0..4a71fab6d0fc73aa3bbe9c9fe56278e473f354e1 100644 --- a/python/paddle/fluid/contrib/slim/__init__.py +++ b/python/paddle/fluid/contrib/slim/__init__.py @@ -13,13 +13,4 @@ # limitations under the License. from .core import * -from .graph import * -from .prune import * -__all__ = [ - 'build_compressor', - 'CompressPass', - 'ImitationGraph', - 'SensitivePruneStrategy', - 'MagnitudePruner', - 'RatioPruner', -] +__all__ = ['Compressor', ] diff --git a/python/paddle/fluid/contrib/slim/core/__init__.py b/python/paddle/fluid/contrib/slim/core/__init__.py index 7826d5830a6f7f6d42cb1275c2289695c080e52f..831bd70ecc62f8d576b304c52b0abea994fd2ceb 100644 --- a/python/paddle/fluid/contrib/slim/core/__init__.py +++ b/python/paddle/fluid/contrib/slim/core/__init__.py @@ -14,11 +14,9 @@ from . import config from .config import * -from . import compress_pass -from .compress_pass import * +from . import compressor +from .compressor import * from . import strategy from .strategy import * -from . 
import pass_builder -from .pass_builder import * -__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__ +__all__ = config.__all__ + compressor.__all__ + strategy.__all__ diff --git a/python/paddle/fluid/contrib/slim/core/compress_pass.py b/python/paddle/fluid/contrib/slim/core/compress_pass.py deleted file mode 100644 index c4c348b878a1df43d7fb909f506c8cf65366866f..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/core/compress_pass.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ....core import CPUPlace -from ..graph import get_executor - -__all__ = ['Context', 'CompressPass'] - - -class Context(object): - """ - The context in the process of compression. - Args: - exe: The executor used to execute graph. - graph: The graph to be compressed. - scope: The scope used to execute graph. - program_exe: The program_exe is used to execute the program - created for modifying the variables in scope. - """ - - def __init__(self, exe, graph, scope, program_exe=None): - # The total number of epoches to be trained. - self.epoch = 0 - # Current epoch - self.epoch_id = 0 - # Current batch - self.batch_id = 0 - self.exe = exe - self.graph = graph - self.scope = scope - self.program_exe = program_exe - - -class CompressPass(object): - """ - The pass used to compress model. 
- Args: - place: The device used in compression. - data_reader: The data_reader used to run graph. - data_feeder: The data_feeder used to run graph. - scope: The scope used to run graph. - metrics: The metrics for evaluating model. - epoch: The total epoches of trainning in compression. - program_exe: The program_exe is used to execute the program - created for modifying the variables in scope. - """ - - def __init__(self, - place=None, - data_reader=None, - data_feeder=None, - scope=None, - metrics=None, - epoch=None, - program_exe=None): - self.strategies = [] - self.place = CPUPlace() if place is None else place - self.data_reader = data_reader - self.data_feeder = data_feeder - self.scope = scope - self.metrics = metrics - self.epoch = epoch - self.program_exe = program_exe - - def add_strategy(self, strategy): - """ - Add a strategy to current compress pass. - Args: - strategy: The strategy to be added into current compress pass. - """ - self.strategies.append(strategy) - self.epoch = max(strategy.end_epoch, self.epoch) - - def apply(self, graph): - """ - Compress a model. - Args: - graph: The target graph to be compressed. 
- """ - self.executor = get_executor(graph, self.place) - context = Context( - self.executor, graph, self.scope, program_exe=self.program_exe) - - for strategy in self.strategies: - strategy.on_compress_begin(context) - - for epoch in range(self.epoch): - - for strategy in self.strategies: - strategy.on_epoch_begin(context) - - for data in self.data_reader(): - - for strategy in self.strategies: - strategy.on_batch_begin(context) - fetches = None - if self.metrics: - fetches = self.metrics.values() - feed = None - if self.data_feeder: - feed = self.data_feeder.feed(data) - results = self.executor.run(graph, - fetches=fetches, - scope=self.scope, - feed=feed) - if results: - print("results: {}".format( - zip(self.metrics.keys(), results))) - for strategy in self.strategies: - strategy.on_batch_end(context) - context.batch_id += 1 - - for strategy in self.strategies: - strategy.on_epoch_end(context) - context.epoch_id += 1 - - for strategy in self.strategies: - strategy.on_compress_end(context) diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py new file mode 100644 index 0000000000000000000000000000000000000000..832ade497c67ee16b6068cad4f0edace94128989 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/core/compressor.py @@ -0,0 +1,481 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ....core import CPUPlace +from .... 
import compiler +from .... import io +from .... import profiler +from .... import scope_guard +from ....data_feeder import DataFeeder +from ..graph import * +from .config import ConfigFactory +import numpy as np +from collections import Iterable +import time +import os +import logging +import sys +import pickle +import functools + +__all__ = ['Context', 'Compressor'] + +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) + + +def cached_reader(reader, sampled_rate, cache_path, cached_id): + """ + Sample partial data from reader and cache them into local file system. + Args: + reader: Iterative data source. + sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None. + cache_path(str): The path to cache the sampled data. + cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0. + """ + np.random.seed(cached_id) + cache_path = os.path.join(cache_path, str(cached_id)) + _logger.debug('read data from: {}'.format(cache_path)) + + def s_reader(): + if os.path.isdir(cache_path): + for file_name in open(os.path.join(cache_path, "list")): + yield np.load(os.path.join(cache_path, file_name.strip())) + else: + os.makedirs(cache_path) + list_file = open(os.path.join(cache_path, "list"), 'w') + batch = 0 + dtype = None + for data in reader(): + if batch == 0 or (np.random.uniform() < sampled_rate): + np.save( + os.path.join(cache_path, 'batch' + str(batch)), data) + list_file.write('batch' + str(batch) + '.npy\n') + batch += 1 + yield data + + return s_reader + + +class Context(object): + """ + The context in the process of compression. 
+ """ + + def __init__(self, + place, + scope, + train_graph=None, + train_reader=None, + eval_graph=None, + eval_reader=None, + teacher_graphs=None, + train_optimizer=None, + distiller_optimizer=None): + """ + Args: + place: The device place where the compression job running. + scope: The scope used in compression job. + train_graph: The graph with loss as output node. + eval_graph: The graph used for evaluation. + eval_reader: The data reader used for evaluation. + teacher_graphs: The teacher graphs used in distillation strategies. + train_optimizer: The optimizer used to append backward ops and + optimization ops into train_graph. + distiller_optimizer: The optimizer used by distillation strategies. + """ + # The total number of epoches to be trained. + self.epoch = 0 + # Current epoch + self.epoch_id = 0 + # Current batch + self.batch_id = 0 + + self.k_v = {} + + self.place = place + self.scope = scope + self.train_graph = train_graph + self.train_reader = train_reader + self.eval_graph = eval_graph + self.eval_reader = eval_reader + self.executor = None + self.teacher_graphs = teacher_graphs + self.train_optimizer = train_optimizer + self.distiller_optimizer = distiller_optimizer + self.optimize_graph = None + self.cache_path = './eval_cache' + self.eval_results = {} + + def to_file(self, file_name): + """ + Save the context into file. + """ + data = {} + data['epoch_id'] = self.epoch_id + data['eval_results'] = self.eval_results + with open(file_name, 'wb') as context_file: + pickle.dump(data, context_file) + + def from_file(self, file_name): + """ + Load the context from file. + """ + with open(file_name) as context_file: + if sys.version_info < (3, 0): + data = pickle.load(context_file) + else: + data = pickle.load(context_file, encoding='bytes') + self.epoch_id = data['epoch_id'] + self.eval_results = data['eval_results'] + + def eval_converged(self, metric_name, delta=0.001): + """ + Check whether the training has been converged. 
+ Args: + metric_name(str): The metric used to check convergence. + delta(float): '(metric[k] - metric[k-1] / metric[k-1]) < delta' + means that the training has been converged. + Returns: + bool: True means the training has been converged. + """ + # TODO(wanghaoshuang@baidu.com): enhence this method. + if (metric_name not in self.eval_results + ) or len(self.eval_results[metric_name]) < 2: + return False + results = self.eval_results[metric_name][-2:] + _logger.info('Latest evaluations: {}'.format(results)) + return abs(results[1] - results[0]) / results[0] < delta + + def run_eval_graph(self, sampled_rate=None, cached_id=0): + """ + Evaluate the current mode in context. + Args: + sampled_rate(float): The sampled rate used to sample partial data + for evaluation. None means using all data in eval_reader. default: None. + cached_id(int): The id of dataset sampled. Evaluations with same + cached_id use the same sampled dataset. default: 0. + """ + _logger.info('Running evaluation') + assert self.eval_graph is not None + assert self.eval_reader is not None + eval_graph = self.eval_graph.clone(for_test=True) + + executor = SlimGraphExecutor(self.place) + results = [] + batch_id = 0 + s_time = time.time() + reader = self.eval_reader + if sampled_rate: + reader = cached_reader(reader, sampled_rate, self.cache_path, + cached_id) + for data in reader(): + result = executor.run(eval_graph, self.scope, data=data) + result = [np.mean(r) for r in result] + results.append(result) + if batch_id % 20 == 0: + _logger.info("batch-{}; {}={}".format( + batch_id, eval_graph.out_nodes.keys(), result)) + batch_id += 1 + result = np.mean(np.array(results), axis=0) + _logger.info("Final eval result: {}={}".format( + eval_graph.out_nodes.keys(), result)) + if not isinstance(result, Iterable): + result = [result] + _logger.info('Finish evaluation') + return result, eval_graph.out_nodes.keys() + + def put(self, key, value): + self.k_v[key] = value + + def get(self, key): + return 
self.k_v.get(key) + + +class Compressor(object): + """ + The pass used to compress model. + """ + + def __init__(self, + place, + scope, + train_program, + train_reader=None, + train_feed_list=None, + train_fetch_list=None, + eval_program=None, + eval_reader=None, + eval_feed_list=None, + eval_fetch_list=None, + teacher_programs=[], + checkpoint_path='./checkpoints', + train_optimizer=None, + distiller_optimizer=None): + """ + Args: + place(fluid.Place): The device place where the compression job running. + scope(fluid.core.Scope): The scope used to run graph. + train_program(Program): The main program to be compressed. It must have loss op. + train_reader: The data reader used for training. + train_feed_list(dict): A dict to indicate the input variable of the training program. + The key is user-defined and human-readable name. + The value is the name of Variable. + train_fetch_list(dict): A dict to indicate the output variable of the training program. + The key is user-defined and human-readable name. + The value is the name of Variable. + eval_program(Program): The program used for evaluation. + eval_reader: The data reader used for evaluation. + eval_feed_list(dict): A dict to indicate the input variable of the evaluation program. + The key is user-defined and human-readable name. + The value is the name of Variable. + eval_fetch_list(dict): A dict to indicate the output variable of the evaluation program. + The key is user-defined and human-readable name. + The value is the name of Variable. + teacher_programs: The teacher graphs used in distillation strategies. + train_optimizer: The optimizer used to append backward ops and + optimization ops into train_graph. + distiller_optimizer: The optimizer used by distillation strategies. In distillation strategy, + this optimizer is used to minimize the combined loss of student-net and + teacher-net while train_optimizer is used to minimize loss of + student-net in fine-tune stage. 
+ + """ + assert isinstance( + train_feed_list, list + ), "train_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]" + assert isinstance( + eval_feed_list, list + ), "eval_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]" + self.strategies = [] + self.epoch = 0 + self.place = CPUPlace() if place is None else place + self.scope = scope + self.train_graph = GraphWrapper( + train_program, in_nodes=train_feed_list, out_nodes=train_fetch_list) + self.eval_graph = GraphWrapper( + eval_program, in_nodes=eval_feed_list, out_nodes=eval_fetch_list) + self.train_reader = train_reader + self.eval_reader = eval_reader + self.teacher_graphs = [] + for teacher in teacher_programs: + self.teacher_graphs.append(ImitationGraph(teacher, scope=scope)) + + self.checkpoint = None + self.checkpoint_path = checkpoint_path + self.eval_epoch = 1 + + self.train_optimizer = train_optimizer + self.distiller_optimizer = distiller_optimizer + self.init_model = None + + def _add_strategy(self, strategy): + """ + Add a strategy to current compress pass. + Args: + strategy: The strategy to be added into current compress pass. + """ + self.strategies.append(strategy) + self.epoch = max(strategy.end_epoch, self.epoch) + + def config(self, config_file): + """ + Configure the compress pass from file with yaml format. + Args: + config_file(str): The config file in local file system. + """ + factory = ConfigFactory(config_file) + self.epoch = factory.compressor['epoch'] + for strategy in factory.compressor['strategies']: + self._add_strategy(strategy) + if 'checkpoint_path' in factory.compressor: + self.checkpoint_path = factory.compressor['checkpoint_path'] + + if 'init_model' in factory.compressor: + self.init_model = factory.compressor['init_model'] + + def _init_model(self, context): + """ + Load model that has been compressed. 
+ """ + if self.init_model and os.path.exists(self.init_model): + exe = SlimGraphExecutor(context.place) + with scope_guard(context.scope): + context.train_graph.load_persistables(self.init_model, exe) + flops = context.eval_graph.flops() + conv_flops = context.eval_graph.flops(only_conv=True) + context.eval_graph.update_param_shape(context.scope) + context.eval_graph.update_groups_of_conv() + _logger.info("conv flops: -{}".format(1 - float( + context.eval_graph.flops(only_conv=True)) / conv_flops)) + _logger.info("total flops: -{}".format(1 - float( + context.eval_graph.flops()) / flops)) + context.train_graph.update_param_shape(context.scope) + context.train_graph.update_groups_of_conv() + context.train_graph.infer_shape() + _logger.info("Init model from: {}".format(self.init_model)) + + def _load_checkpoint(self, context): + """ + Load checkpoints from file. + """ + _logger.debug('_load_checkpoint') + strategies = self.strategies + if self.checkpoint_path: + if not os.path.exists(self.checkpoint_path): + _logger.warning("Checkpints path doesn't exist: [{}]".format( + self.checkpoint_path)) + return context, strategies + checkpoints = [ + dir for dir in os.listdir(self.checkpoint_path) + if os.path.isdir(os.path.join(self.checkpoint_path, dir)) + ] + _logger.debug('self.checkpoint_path: {}'.format( + self.checkpoint_path)) + _logger.info('checkpoints: {}'.format(checkpoints)) + if len(checkpoints) > 0: + latest = max([int(ck) for ck in checkpoints]) + latest_ck_path = os.path.join(self.checkpoint_path, str(latest)) + + model_path = os.path.join(latest_ck_path, 'model') + context_path = os.path.join(latest_ck_path, 'context') + strategy_path = os.path.join(latest_ck_path, 'strategies') + if os.path.exists(context_path): + context.from_file(context_path) + context.epoch_id += 1 + if os.path.exists(strategy_path): + with open(strategy_path, 'rb') as strategy_file: + if sys.version_info < (3, 0): + strategies = pickle.load(strategy_file) + else: + strategies = 
pickle.load( + strategy_file, encoding='bytes') + + if os.path.exists(model_path): + exe = SlimGraphExecutor(context.place) + with scope_guard(context.scope): + context.optimize_graph.load_persistables(model_path, + exe) + context.optimize_graph.update_param_shape(context.scope) + context.optimize_graph.update_groups_of_conv() + context.eval_graph.update_param_shape(context.scope) + context.eval_graph.update_groups_of_conv() + _logger.info("Loaded params from: {}".format(model_path)) + return context, strategies + + def _save_checkpoint(self, context): + """ + Save checkpoints to file. + """ + if context.epoch_id % 1 == 0 and self.checkpoint_path: + checkpoint_path = os.path.join(self.checkpoint_path, + str(context.epoch_id)) + model_path = os.path.join(checkpoint_path, 'model') + context_path = os.path.join(checkpoint_path, 'context') + strategy_path = os.path.join(checkpoint_path, 'strategies') + if not os.path.isdir(model_path): + os.makedirs(model_path) + exe = SlimGraphExecutor(context.place) + with scope_guard(context.scope): + context.optimize_graph.save_persistables(model_path, exe) + context.to_file(context_path) + with open(strategy_path, 'wb') as strategy_file: + pickle.dump(self.strategies, strategy_file) + _logger.info('Saved checkpoint to: {}'.format(checkpoint_path)) + + def _train_one_epoch(self, context): + """ + Train one epoch. 
+ """ + + executor = SlimGraphExecutor(self.place) + + if context.optimize_graph.compiled_graph is None: + context.optimize_graph.compiled_graph = compiler.CompiledProgram( + context.optimize_graph.program).with_data_parallel( + loss_name=context.optimize_graph.out_nodes['loss']) + + for data in context.train_reader(): + for strategy in self.strategies: + strategy.on_batch_begin(context) + results = executor.run(context.optimize_graph, + context.scope, + data=data) + results = [float(np.mean(result)) for result in results] + if context.batch_id % 20 == 0: + _logger.info("epoch:{}; batch_id:{}; {} = {}".format( + context.epoch_id, context.batch_id, + context.optimize_graph.out_nodes.keys( + ), [round(r, 3) for r in results])) + for strategy in self.strategies: + strategy.on_batch_end(context) + context.batch_id += 1 + context.batch_id = 0 + + def _eval(self, context): + """ + Runing evaluation. + """ + results, names = context.run_eval_graph() + for name, result in zip(names, results): + if name not in context.eval_results: + context.eval_results[name] = [] + context.eval_results[name].append(result) + + def run(self): + """ + Execute compressiong pass. 
+ """ + context = Context( + place=self.place, + scope=self.scope, + train_graph=self.train_graph, + train_reader=self.train_reader, + eval_graph=self.eval_graph, + eval_reader=self.eval_reader, + teacher_graphs=self.teacher_graphs, + train_optimizer=self.train_optimizer, + distiller_optimizer=self.distiller_optimizer) + self.context = context + if self.teacher_graphs: + context.put('teachers', self.teacher_graphs) + self._init_model(context) + if not context.optimize_graph: + if context.train_optimizer: + context.train_optimizer._name = 'train_opt' + context.optimize_graph = context.train_graph.get_optimize_graph( + context.train_optimizer, context.place, context.scope) + else: + context.optimize_graph = context.train_graph + + context, self.strategies = self._load_checkpoint(context) + + for strategy in self.strategies: + strategy.on_compression_begin(context) + start = context.epoch_id + self._eval(context) + for epoch in range(start, self.epoch): + context.epoch_id = epoch + for strategy in self.strategies: + strategy.on_epoch_begin(context) + self._train_one_epoch(context) + for strategy in self.strategies: + strategy.on_epoch_end(context) + if self.eval_epoch and epoch % self.eval_epoch == 0: + self._eval(context) + self._save_checkpoint(context) + for strategy in self.strategies: + strategy.on_compression_end(context) + return context.eval_graph diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py index 811c45700376aff9883fe197007b582f63817f03..12df9fcd1b0042c26aabac88d6ecba5fb827cba0 100644 --- a/python/paddle/fluid/contrib/slim/core/config.py +++ b/python/paddle/fluid/contrib/slim/core/config.py @@ -17,7 +17,7 @@ import funcsigs import yaml from collections import OrderedDict from ..prune import * -from .compress_pass import * +from ..quantization import * from .strategy import * __all__ = ['ConfigFactory'] @@ -29,15 +29,10 @@ class ConfigFactory(object): def __init__(self, config): """Init a 
factory from configure file.""" self.instances = {} + self.compressor = {} self.version = None self._parse_config(config) - def get_compress_pass(self): - """ - Get compress pass from factory. - """ - return self.instance('compress_pass') - def instance(self, name): """ Get instance from factory. @@ -59,8 +54,16 @@ class ConfigFactory(object): args = {} for key in keys: value = attrs[key] + if isinstance(value, str) and value.lower() == 'none': + value = None if isinstance(value, str) and value in self.instances: value = self.instances[value] + if isinstance(value, list): + for i in range(len(value)): + if isinstance(value[i], + str) and value[i] in self.instances: + value[i] = self.instances[value[i]] + args[key] = value self.instances[name] = class_(**args) return self.instances.get(name) @@ -76,16 +79,23 @@ class ConfigFactory(object): assert self.version == int(key_values['version']) # parse pruners - if key == 'pruners' or key == 'strategies': + if key == 'distillers' or key == 'pruners' or key == 'quantizers' or key == 'strategies': instances = key_values[key] for name in instances: self._new_instance(name, instances[name]) - if key == 'compress_pass': - compress_pass = self._new_instance(key, key_values[key]) - for name in key_values[key]['strategies']: - strategy = self.instance(name) - compress_pass.add_strategy(strategy) + if key == 'compressor': + self.compressor['strategies'] = [] + self.compressor['epoch'] = key_values[key]['epoch'] + if 'init_model' in key_values[key]: + self.compressor['init_model'] = key_values[key][ + 'init_model'] + self.compressor['checkpoint_path'] = key_values[key][ + 'checkpoint_path'] + if 'strategies' in key_values[key]: + for name in key_values[key]['strategies']: + strategy = self.instance(name) + self.compressor['strategies'].append(strategy) if key == 'include': for config_file in key_values[key]: diff --git a/python/paddle/fluid/contrib/slim/core/pass_builder.py b/python/paddle/fluid/contrib/slim/core/pass_builder.py 
deleted file mode 100644 index fc1ddc94e04f1d606292071ba7e5cc74fedd5d36..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/core/pass_builder.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .compress_pass import CompressPass -from .config import ConfigFactory - -__all__ = ['build_compressor'] - - -def build_compressor(place=None, - data_reader=None, - data_feeder=None, - scope=None, - metrics=None, - epoch=None, - config=None): - if config is not None: - factory = ConfigFactory(config) - comp_pass = factory.get_compress_pass() - else: - comp_pass = CompressPass() - comp_pass.place = place - comp_pass.data_reader = data_reader - comp_pass.data_feeder = data_feeder - comp_pass.scope = scope - comp_pass.metrics = metrics - comp_pass.epoch = epoch - return comp_pass diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py index 74d98e98b0c390599acfaefeb0636a599b46d391..28bf24f4e341dd528d2cd25f6fb24543886150d6 100644 --- a/python/paddle/fluid/contrib/slim/core/strategy.py +++ b/python/paddle/fluid/contrib/slim/core/strategy.py @@ -20,7 +20,7 @@ class Strategy(object): Base class for all strategies. """ - def __init__(self, start_epoch=0, end_epoch=10): + def __init__(self, start_epoch=0, end_epoch=0): """ Args: start_epoch: The first epoch to apply the strategy. 
@@ -29,7 +29,7 @@ class Strategy(object): self.start_epoch = start_epoch self.end_epoch = end_epoch - def on_compress_begin(self, context): + def on_compression_begin(self, context): pass def on_epoch_begin(self, context): @@ -44,5 +44,5 @@ class Strategy(object): def on_batch_end(self, context): pass - def on_compress_end(self, context): + def on_compression_end(self, context): pass diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml deleted file mode 100644 index ea888fa2c74a23b4769f75dce6a776afcca41a51..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml +++ /dev/null @@ -1,28 +0,0 @@ -version: 1.0 -pruners: - pruner_1: - class: 'RatioPruner' - ratios: - 'conv1_1.w': 0.3 - 'conv1_2.w': 0.4 - '*': 0.9 - group_dims: - '*': [1, 2, 3] - criterions: - '*': 'l1-norm' -strategies: - strategy_1: - class: 'SensitivePruneStrategy' - pruner: 'pruner_1' - start_epoch: 0 - end_epoch: 10 - delta_rate: 0.20 - acc_loss_threshold: 0.2 - sensitivities: - 'conv1_1.w': 0.4 - -compress_pass: - class: 'CompressPass' - epoch: 100 - strategies: - - strategy_1 diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py deleted file mode 100644 index 21c59c0c9d2d9b76932ab6eeff73754940a3bfa0..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid as fluid -import paddle -import os -import sys -from paddle.fluid.contrib.slim import CompressPass -from paddle.fluid.contrib.slim import build_compressor -from paddle.fluid.contrib.slim import ImitationGraph - - -class LinearModel(object): - def __init__(slef): - pass - - def train(self): - train_program = fluid.Program() - startup_program = fluid.Program() - startup_program.random_seed = 10 - with fluid.program_guard(train_program, startup_program): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=predict, label=y) - avg_cost = fluid.layers.mean(cost) - eval_program = train_program.clone() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) - - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - eval_reader = paddle.batch( - paddle.dataset.uci_housing.test(), batch_size=1) - place = fluid.CPUPlace() - train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(startup_program) - train_metrics = {"loss": avg_cost.name} - eval_metrics = {"loss": avg_cost.name} - - graph = ImitationGraph(train_program) - config = './config.yaml' - comp_pass = build_compressor( - place, - data_reader=train_reader, - data_feeder=train_feeder, - scope=fluid.global_scope(), - metrics=train_metrics, - 
epoch=1, - config=config) - comp_pass.apply(graph) - - -if __name__ == "__main__": - model = LinearModel() - model.train() diff --git a/python/paddle/fluid/contrib/slim/graph/__init__.py b/python/paddle/fluid/contrib/slim/graph/__init__.py index d65472d193b639f0766e278ec14b5dc36c5d62bc..c5d1c4dbdfb208ea66bb3dc315e502309799492e 100644 --- a/python/paddle/fluid/contrib/slim/graph/__init__.py +++ b/python/paddle/fluid/contrib/slim/graph/__init__.py @@ -14,10 +14,7 @@ from . import executor from .executor import * -from . import graph -from .graph import * -from . import graph_pass -from .graph_pass import * +from . import graph_wrapper +from .graph_wrapper import * __all__ = executor.__all__ -__all__ += graph.__all__ -__all__ += graph_pass.__all__ +__all__ += graph_wrapper.__all__ diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py index c02c3af82013287bf19e1869cb60dc65239b720a..70438a90eb790e7ca5d00be0bc09efc6c00cafe4 100644 --- a/python/paddle/fluid/contrib/slim/graph/executor.py +++ b/python/paddle/fluid/contrib/slim/graph/executor.py @@ -12,51 +12,46 @@ # See the License for the specific language governing permissions and # limitations under the License. -import abc -from abc import abstractmethod +from ....compiler import CompiledProgram +from ....data_feeder import DataFeeder from .... import executor -from .graph import IRGraph, ImitationGraph +from .graph_wrapper import GraphWrapper -__all__ = ['get_executor'] +__all__ = ['SlimGraphExecutor'] -class GraphExecutor(object): - __metaclass__ = abc.ABCMeta +class SlimGraphExecutor(object): + """ + Wrapper of executor used to run GraphWrapper. 
+ """ def __init__(self, place): - self.place = place - - @abstractmethod - def run(self, graph, feches=None, feed=None): - pass - - -class IRGraphExecutor(GraphExecutor): - def run(self, grah, fetches, feed=None): - pass - - -class ImitationGraphExecutor(GraphExecutor): - def __init__(self, place): - super(ImitationGraphExecutor, self).__init__(place) self.exe = executor.Executor(place) + self.place = place - def run(self, graph, scope=None, fetches=None, feed=None): - assert isinstance(graph, ImitationGraph) - fetch_list = None - if fetches: - fetch_list = [ - graph.program.global_block().var(name) for name in fetches - ] - results = self.exe.run(graph.program, + def run(self, graph, scope, data=None): + """ + Runing a graph with a batch of data. + Args: + graph(GraphWrapper): The graph to be executed. + scope(fluid.core.Scope): The scope to be used. + data(list): A batch of data. Each tuple in this list is a sample. + It will feed the items of tuple to the in_nodes of graph. + Returns: + results(list): A list of result with the same order indicated by graph.out_nodes. 
+ """ + assert isinstance(graph, GraphWrapper) + if data is not None: + feeder = DataFeeder( + feed_list=graph.in_nodes.values(), + place=self.place, + program=graph.program) + feed = feeder.feed(data) + + fetch_list = graph.out_nodes.values() + program = graph.compiled_graph if graph.compiled_graph else graph.program + results = self.exe.run(program, scope=scope, fetch_list=fetch_list, feed=feed) return results - - -def get_executor(graph, place): - if isinstance(graph, ImitationGraph): - return ImitationGraphExecutor(place) - if isinstance(graph, IRGraph): - return IRGraphExecutor(place) diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py deleted file mode 100644 index f38d9783413a01cd1005a014c0aba5ecf5cc79c2..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/graph/graph.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import print_function -import os -import subprocess -from ....framework import Program -from ....framework import Block -from .... import core - -__all__ = ['Graph', 'ImitationGraph', 'IRGraph'] - - -class Graph(object): - """ - Base class for all graph. - """ - - def __init__(self): - pass - - def all_parameters(self): - """ - Return all the parameters in current graph. 
- """ - pass - - -class ImitationGraph(Graph): - def __init__(self, program=None): - super(ImitationGraph, self).__init__() - self.program = Program() if program is None else program - - def all_parameters(self): - return self.program.global_block().all_parameters() - - -class IRGraph(Graph): - pass diff --git a/python/paddle/fluid/contrib/slim/graph/graph_pass.py b/python/paddle/fluid/contrib/slim/graph/graph_pass.py deleted file mode 100644 index 1db6c4f110daa44be7fcbcc36f47224797b6dc88..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/graph/graph_pass.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = ['GraphPass', 'PruneParameterPass'] - - -class GraphPass(object): - """ - Base class for all graph pass. - """ - - def __init__(self): - pass - - def apply(self, graph): - pass - - -class PruneParameterPass(GraphPass): - """ - Generate a graph for pruning parameters from target graph. 
- """ - - def __init__(self, pruned_params, thresholds): - super(PruneParameterPass, self).__init__() - self.pruned_params = pruned_params - self.thresholds = thresholds - self.default_threshold = thresholds['*'] - - def apply(self, graph): - pass diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..8694be782708a6d47b3e1450305975d34fd3bd7f --- /dev/null +++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py @@ -0,0 +1,500 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +from .... import io +from .... 
import compiler +from ....framework import Program +from ....framework import program_guard +from ....framework import Parameter +from ....framework import Variable +from ....executor import Executor +import copy +from collections import Iterable +from ....io import save_inference_model, load_inference_model, save_persistables +import numpy as np +import pickle +import os + +__all__ = ['GraphWrapper', 'VarWrapper', 'OpWrapper'] + +OPTIMIZER_OPS = [ + 'momentum', + 'lars_momentum', + 'adagrad', + 'adam', + 'adamax', + 'decayed_adagrad', + 'adadelta', + 'rmsprop', +] + + +class VarWrapper(object): + def __init__(self, var, graph): + assert isinstance(var, Variable) + assert isinstance(graph, GraphWrapper) + self._var = var + self._graph = graph + + def __eq__(self, v): + """ + Overwrite this function for ...in... syntax in python. + """ + return self._var.name == v._var.name + + def name(self): + """ + Get the name of the variable. + """ + return self._var.name + + def shape(self): + """ + Get the shape of the variable. + """ + return self._var.shape + + def set_shape(self, shape): + """ + Set the shape of the variable. + """ + self._var.desc.set_shape(shape) + + def inputs(self): + """ + Get all the operators that use this variable as output. + Returns: + list: A list of operators. + """ + ops = [] + for op in self._graph.ops(): + if self in op.all_inputs(): + ops.append(op) + return ops + + def outputs(self): + """ + Get all the operators that use this variable as input. + Returns: + list: A list of operators. + """ + ops = [] + for op in self._graph.ops(): + if self in op.all_outputs(): + ops.append(op) + return ops + + +class OpWrapper(object): + def __init__(self, op, graph): + assert isinstance(graph, GraphWrapper) + self._op = op + self._graph = graph + + def __eq__(self, op): + """ + Overwrite this function for ...in... syntax in python. + """ + return self.idx() == op.idx() + + def all_inputs(self): + """ + Get all the input variables of this operator.
+ """ + return [ + self._graph.var(var_name) for var_name in self._op.input_arg_names + ] + + def all_outputs(self): + """ + Get all the output variables of this operator. + """ + return [ + self._graph.var(var_name) for var_name in self._op.output_arg_names + ] + + def idx(self): + """ + Get the id of this operator. + """ + return self._op.idx + + def type(self): + """ + Get the type of this operator. + """ + return self._op.type + + def is_bwd_op(self): + """ + Whether this operator is backward op. + """ + return self.type().endswith('_grad') + + def is_opt_op(self): + """ + Whether this operator is optimizer op. + """ + return self.type() in OPTIMIZER_OPS + + def inputs(self, name): + """ + Get all the varibales by the input name. + """ + return [self._graph.var(var_name) for var_name in self._op.input(name)] + + def outputs(self, name): + """ + Get all the varibales by the output name. + """ + return [self._graph.var(var_name) for var_name in self._op.output(name)] + + def set_attr(self, key, value): + """ + Set the value of attribute by attribute's name. + + Args: + key(str): the attribute name. + value(bool|int|str|float|list): the value of the attribute. + """ + self._op._set_attr(key, value) + + def attr(self, name): + """ + Get the attribute by name. + + Args: + name(str): the attribute name. + + Returns: + bool|int|str|float|list: The attribute value. The return value + can be any valid attribute type. + """ + return self._op.attr(name) + + +class GraphWrapper(object): + """ + It is a wrapper of paddle.fluid.framework.IrGraph with some special functions + for paddle slim framework. + """ + + def __init__(self, program=None, in_nodes=[], out_nodes=[]): + """ + Args: + program(framework.Program): A program with + in_nodes(dict): A dict to indicate the input nodes of the graph. + The key is user-defined and human-readable name. + The value is the name of Variable. + out_nodes(dict): A dict to indicate the input nodes of the graph. 
+ The key is user-defined and human-readable name. + The value is the name of Variable. + """ + super(GraphWrapper, self).__init__() + self.program = Program() if program is None else program + self.compiled_graph = None + self.in_nodes = OrderedDict(in_nodes) + self.out_nodes = OrderedDict(out_nodes) + self._attrs = OrderedDict() + + def all_parameters(self): + """ + Get all the parameters in this graph. + Returns: + list: A list of VarWrapper instances. + """ + params = [] + for block in self.program.blocks: + for param in block.all_parameters(): + params.append(VarWrapper(param, self)) + return params + + def is_parameter(self, var): + """ + Whether the given variable is parameter. + Args: + var(VarWrapper): The given variable. + """ + return isinstance(var._var, Parameter) + + def is_persistable(self, var): + """ + Whether the given variable is persistable. + Args: + var(VarWrapper): The given variable. + """ + return var._var.persistable + + def compile(self, for_parallel=True, for_test=False): + """ + Compile the program in this wrapper to framework.CompiledProgram for next running. + This function must be called if the program is modified. + Args: + for_parallel(bool): Whether the program to run in data parallel way. default: True. + for_test(bool): Whether the compiled program is used for test. + """ + target = self.program + if for_test: + loss = None + else: + loss = self.out_nodes['loss'] + if for_parallel: + # disable memory optimize for stable training + build_strategy = compiler.BuildStrategy() + build_strategy.enable_inplace = False + build_strategy.memory_optimize = False + self.compiled_graph = compiler.CompiledProgram( + target).with_data_parallel( + loss_name=loss, build_strategy=build_strategy) + else: + self.compiled_graph = compiler.CompiledProgram(target) + + def ops(self): + """ + Return all operator nodes included in the graph as a list.
+ """ + ops = [] + for block in self.program.blocks: + for op in block.ops: + ops.append(OpWrapper(op, self)) + return ops + + def vars(self): + """ + Get all the variables. + """ + return [VarWrapper(var, self) for var in self.program.list_vars()] + + def var(self, name): + """ + Get the variable by variable name. + """ + return VarWrapper(self.program.global_block().var(name), self) + + def clone(self, for_test=False): + """ + Clone a new graph from current graph. + Returns: + (GraphWrapper): The wrapper of a new graph. + """ + return GraphWrapper( + self.program.clone(for_test), + copy.deepcopy(self.in_nodes), copy.deepcopy(self.out_nodes)) + + def merge(self, graph): + """ + Merge a graph into current graph. + Args: + graph(GraphWrapper): The graph to be merged by current graph. + """ + for var in graph.program.list_vars(): + self.program.global_block()._clone_variable(var) + # TODO: parameters should be cloned + for op in graph.ops(): + op = op._op + inputs = {} + outputs = {} + attrs = {} + for input_name in op.input_names: + inputs[input_name] = [ + self.var(in_var_name) + for in_var_name in op.inputs(input_name) + ] + for output_name in op.output_names: + outputs[output_name] = [ + self.var(out_var_name) + for out_var_name in op.output(output_name) + ] + for attr_name in op.attr_names: + attrs[attr_name] = op.attr(attr_name) + self.program.global_block().append_op( + type=op.type, inputs=inputs, outputs=outputs, attrs=attrs) + + def program(self): + """ + Get the program in current wrapper. + """ + return self.program + + def pre_ops(self, op): + """ + Get all the previous operators of target operator. + Args: + op(OpWrapper): Target operator.. + Returns: + list: A list of operators. + """ + ops = [] + for p in self.ops(): + for in_var in op.all_inputs(): + if in_var in p.all_outputs(): + ops.append(p) + return ops + + def next_ops(self, op): + """ + Get all the next operators of target operator. + Args: + op(OpWrapper): Target operator.. 
+ Returns: + list: A list of operators. + """ + ops = [] + for p in self.ops(): + for out_var in op.all_outputs(): + if out_var in p.all_inputs(): + ops.append(p) + return ops + + def get_param_by_op(self, op): + """ + Get the parameters used by target operator. + """ + assert isinstance(op, OpWrapper) + params = [] + for var in op.all_inputs(): + if isinstance(var._var, Parameter): + params.append(var) + assert len(params) > 0 + return params + + def numel_params(self): + """ + Get the number of elements in all parameters. + """ + ret = 0 + for param in self.all_parameters(): + ret += np.product(param.shape()) + return ret + + def get_optimize_graph(self, optimizer, place, scope, no_grad_var_names=[]): + """ + Get a new graph for training by appending some backward operators and optimization operators. + Args: + optimizer: The optimizer used to generate training graph. + place: The place to run the graph. + scope: The scope used to run the graph. Some new variables will be added into this scope. + no_grad_var_names(list): Names of variables that should be ignored while computing gradients. default: []. + Returns: + (GraphWrapper): The wrapper of new graph with backward ops and optimization ops. + """ + graph = self.clone() + startup_program = Program() + with program_guard( + main_program=graph.program, startup_program=startup_program): + target_name = None + if 'loss' in graph.out_nodes: + target_name = graph.out_nodes['loss'] + elif 'cost' in graph.out_nodes: + target_name = graph.out_nodes['cost'] + target = graph.var(target_name)._var + optimizer.minimize(target, no_grad_set=no_grad_var_names) + + exe = Executor(place) + exe.run(program=startup_program, scope=scope) + return graph + + def flops(self, only_conv=False): + """ + Get the flops of current graph. + Args: + only_conv: Only calculating the conv layers. default: False. + Returns: + int: The flops of current graph.
+ """ + flops = 0 + for op in self.ops(): + if op.type() in ['conv2d', 'depthwise_conv2d']: + filter_shape = op.inputs("Filter")[0].shape() + input_shape = op.inputs("Input")[0].shape() + output_shape = op.outputs("Output")[0].shape() + c_out, c_in, k_h, k_w = filter_shape + _, _, h_out, w_out = output_shape + groups = op.attr("groups") + kernel_ops = k_h * k_w * (c_in / groups) + if len(op.inputs("Bias")) > 0: + with_bias = 1 + else: + with_bias = 0 + flops += 2 * h_out * w_out * c_out * (kernel_ops + with_bias) + elif op.type() == 'pool2d' and not only_conv: + input_shape = op.inputs("X")[0].shape() + output_shape = op.outputs("Out")[0].shape() + _, c_out, h_out, w_out = output_shape + k_size = op.attr("ksize") + flops += h_out * w_out * c_out * (k_size[0]**2) + + elif op.type() == 'mul' and not only_conv: + x_shape = list(op.inputs("X")[0].shape()) + y_shape = op.inputs("Y")[0].shape() + if x_shape[0] == -1: + x_shape[0] = 1 + flops += 2 * x_shape[0] * x_shape[1] * y_shape[1] + + elif op.type() in ['relu', 'sigmoid', 'batch_norm' + ] and not only_conv: + input_shape = list(op.inputs("X")[0].shape()) + if input_shape[0] == -1: + input_shape[0] = 1 + flops += np.product(input_shape) + + return flops + + def save_persistables(self, path, exe): + """ + Save all the persistable variables into file. + Args: + path(str): The path to save the persistables. + exe(framework.Executor): The executor used to save the persistables. + """ + io.save_persistables(exe.exe, path, main_program=self.program) + + def load_persistables(self, path, exe): + """ + Load the persistable variables from file. + Args: + path(str): The path to load the persistables. + exe(framework.Executor): The executor used to load the persistables. 
+ """ + + def if_exist(var): + return os.path.exists(os.path.join(path, var.name)) + + io.load_vars( + exe.exe, path, main_program=self.program, predicate=if_exist) + + def update_param_shape(self, scope): + """ + Update the shape of parameters in the graph according to tensors in scope. + It is used after loading pruned parameters from file. + """ + for param in self.all_parameters(): + tensor_shape = np.array(scope.find_var(param.name()).get_tensor( + )).shape + param.set_shape(tensor_shape) + + def infer_shape(self): + """ + Update the groups of convolution layer according to current filters. + It is used after loading pruned parameters from file. + """ + for op in self.ops(): + if op.type() != 'conditional_block': + op._op.desc.infer_shape(op._op.block.desc) + + def update_groups_of_conv(self): + for op in self.ops(): + if op.type() == 'depthwise_conv2d': + op.set_attr('groups', op.inputs('Filter')[0].shape()[0]) diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py index 34c5107daa3cde10e7995902be37e34e19664da8..7a25c3a61e0815a20fa9b0477a6c69a4f8d2a066 100644 --- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py +++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py @@ -13,54 +13,919 @@ # limitations under the License. from ..core.strategy import Strategy -from ....framework import Program, program_guard +from ..graph import VarWrapper, OpWrapper, GraphWrapper +from ....framework import Program, program_guard, Parameter from .... 
import layers +import prettytable as pt import numpy as np +from scipy.optimize import leastsq +import copy +import re +import os +import pickle +import logging +import sys -__all__ = ['SensitivePruneStrategy', 'PruneStrategy'] +__all__ = ['SensitivePruneStrategy', 'UniformPruneStrategy'] +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) + + +class PruneStrategy(Strategy): + """ + The base class of all pruning strategies. + """ -class SensitivePruneStrategy(Strategy): def __init__(self, pruner=None, start_epoch=0, - end_epoch=10, - delta_rate=0.20, - acc_loss_threshold=0.2, - sensitivities=None): - super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch) + end_epoch=0, + target_ratio=0.5, + metric_name=None, + pruned_params='conv.*_weights'): + """ + Args: + pruner(slim.Pruner): The pruner used to prune the parameters. + start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0 + end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0 + target_ratio(float): The flops ratio to be pruned from current model. + metric_name(str): The metric used to evaluate the model. + It should be one of keys in out_nodes of graph wrapper. + pruned_params(str): The pattern str to match the parameter names to be pruned. + """ + super(PruneStrategy, self).__init__(start_epoch, end_epoch) self.pruner = pruner - self.delta_rate = delta_rate - self.acc_loss_threshold = acc_loss_threshold - self.sensitivities = sensitivities + self.target_ratio = target_ratio + self.metric_name = metric_name + self.pruned_params = pruned_params + self.pruned_list = [] + self.backup = {} + self.param_shape_backup = {} + def _eval_graph(self, context, sampled_rate=None, cached_id=0): + """ + Evaluate the current model in context. + Args: + context(slim.core.Context): The context storing all information used to evaluate the current model.
+ sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None. + cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0. + """ + results, names = context.run_eval_graph(sampled_rate, cached_id) + metric = np.mean(results[list(names).index(self.metric_name)]) + return metric -class PruneStrategy(Strategy): + def _prune_filters_by_ratio(self, + scope, + params, + ratio, + place, + lazy=False, + only_graph=False): + """ + Pruning filters by given ratio. + Args: + scope(fluid.core.Scope): The scope used to pruning filters. + params(list): A list of filter parameters. + ratio(float): The ratio to be pruned. + place(fluid.Place): The device place of filter parameters. + lazy(bool): True means setting the pruned elements to zero. + False means cutting down the pruned elements. + only_graph(bool): True means only modifying the graph. + False means modifying graph and variables in scope. 
+ """ + if params[0].name() in self.pruned_list[0]: + return + param_t = scope.find_var(params[0].name()).get_tensor() + pruned_idx = self.pruner.cal_pruned_idx( + params[0].name(), np.array(param_t), ratio, axis=0) + for param in params: + assert isinstance(param, VarWrapper) + param_t = scope.find_var(param.name()).get_tensor() + if lazy: + self.backup[param.name()] = copy.deepcopy(np.array(param_t)) + pruned_param = self.pruner.prune_tensor( + np.array(param_t), pruned_idx, pruned_axis=0, lazy=lazy) + if not only_graph: + param_t.set(pruned_param, place) + ori_shape = param.shape() + if param.name() not in self.param_shape_backup: + self.param_shape_backup[param.name()] = copy.deepcopy( + param.shape()) + new_shape = list(param.shape()) + new_shape[0] = pruned_param.shape[0] + param.set_shape(new_shape) + _logger.debug( + '|----------------------------------------+----+------------------------------+------------------------------|' + ) + _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format( + str(param.name()), str(0), str(ori_shape), str(param.shape()))) + self.pruned_list[0].append(param.name()) + return pruned_idx + + def _prune_parameter_by_idx(self, + scope, + params, + pruned_idx, + pruned_axis, + place, + lazy=False, + only_graph=False): + """ + Pruning parameters in given axis. + Args: + scope(fluid.core.Scope): The scope storing paramaters to be pruned. + params(VarWrapper): The parameter to be pruned. + pruned_idx(list): The index of elements to be pruned. + pruned_axis(int): The pruning axis. + place(fluid.Place): The device place of filter parameters. + lazy(bool): True means setting the pruned elements to zero. + False means cutting down the pruned elements. + only_graph(bool): True means only modifying the graph. + False means modifying graph and variables in scope. 
+ """ + if params[0].name() in self.pruned_list[pruned_axis]: + return + for param in params: + assert isinstance(param, VarWrapper) + param_t = scope.find_var(param.name()).get_tensor() + if lazy: + self.backup[param.name()] = copy.deepcopy(np.array(param_t)) + pruned_param = self.pruner.prune_tensor( + np.array(param_t), pruned_idx, pruned_axis, lazy=lazy) + if not only_graph: + param_t.set(pruned_param, place) + ori_shape = param.shape() + if param.name() not in self.param_shape_backup: + self.param_shape_backup[param.name()] = copy.deepcopy( + param.shape()) + new_shape = list(param.shape()) + new_shape[pruned_axis] = pruned_param.shape[pruned_axis] + param.set_shape(new_shape) + _logger.debug( + '|----------------------------------------+----+------------------------------+------------------------------|' + ) + _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format( + str(param.name()), + str(pruned_axis), str(ori_shape), str(param.shape()))) + self.pruned_list[pruned_axis].append(param.name()) + + def _forward_search_related_op(self, graph, param): + """ + Forward search operators that will be affected by pruning of param. + Args: + graph(GraphWrapper): The graph to be searched. + param(VarWrapper): The current pruned parameter. + Returns: + list: A list of operators. 
+ """ + assert isinstance(param, VarWrapper) + visited = {} + for op in graph.ops(): + visited[op.idx()] = False + stack = [] + for op in graph.ops(): + if (not op.is_bwd_op()) and (param in op.all_inputs()): + stack.append(op) + visit_path = [] + while len(stack) > 0: + top_op = stack[len(stack) - 1] + if visited[top_op.idx()] == False: + visit_path.append(top_op) + visited[top_op.idx()] = True + next_ops = None + if top_op.type() == "conv2d" and param not in top_op.all_inputs(): + next_ops = None + elif top_op.type() == "mul": + next_ops = None + else: + next_ops = self._get_next_unvisited_op(graph, visited, top_op) + if next_ops == None: + stack.pop() + else: + stack += next_ops + return visit_path + + def _get_next_unvisited_op(self, graph, visited, top_op): + """ + Get next unvisited adjacent operators of given operators. + Args: + graph(GraphWrapper): The graph used to search. + visited(list): The ids of operators that has been visited. + top_op: The given operator. + Returns: + list: A list of operators. + """ + assert isinstance(top_op, OpWrapper) + next_ops = [] + for op in graph.next_ops(top_op): + if (visited[op.idx()] == False) and (not op.is_bwd_op()): + next_ops.append(op) + return next_ops if len(next_ops) > 0 else None + + def _get_accumulator(self, graph, param): + """ + Get accumulators of given parameter. The accumulator was created by optimizer. + Args: + graph(GraphWrapper): The graph used to search. + param(VarWrapper): The given parameter. + Returns: + list: A list of accumulators which are variables. 
+ """ + assert isinstance(param, VarWrapper) + params = [] + for op in param.outputs(): + if op.is_opt_op(): + for out_var in op.all_outputs(): + if graph.is_persistable(out_var) and out_var.name( + ) != param.name(): + params.append(out_var) + return params + + def _forward_pruning_ralated_params(self, + graph, + scope, + param, + place, + ratio=None, + pruned_idxs=None, + lazy=False, + only_graph=False): + """ + Pruning all the parameters affected by the pruning of given parameter. + Args: + graph(GraphWrapper): The graph to be searched. + scope(fluid.core.Scope): The scope storing paramaters to be pruned. + param(VarWrapper): The given parameter. + place(fluid.Place): The device place of filter parameters. + ratio(float): The target ratio to be pruned. + pruned_idx(list): The index of elements to be pruned. + lazy(bool): True means setting the pruned elements to zero. + False means cutting down the pruned elements. + only_graph(bool): True means only modifying the graph. + False means modifying graph and variables in scope. 
+ """ + assert isinstance( + graph, + GraphWrapper), "graph must be instance of slim.core.GraphWrapper" + assert isinstance( + param, VarWrapper), "param must be instance of slim.core.VarWrapper" + + if param.name() in self.pruned_list[0]: + return + related_ops = self._forward_search_related_op(graph, param) + + if ratio is None: + assert pruned_idxs is not None + self._prune_parameter_by_idx( + scope, [param] + self._get_accumulator(graph, param), + pruned_idxs, + pruned_axis=0, + place=place, + lazy=lazy, + only_graph=only_graph) + + else: + pruned_idxs = self._prune_filters_by_ratio( + scope, [param] + self._get_accumulator(graph, param), + ratio, + place, + lazy=lazy, + only_graph=only_graph) + corrected_idxs = pruned_idxs[:] + + for idx, op in enumerate(related_ops): + if op.type() == "conv2d" and (param not in op.all_inputs()): + for in_var in op.all_inputs(): + if graph.is_parameter(in_var): + conv_param = in_var + self._prune_parameter_by_idx( + scope, [conv_param] + self._get_accumulator( + graph, conv_param), + corrected_idxs, + pruned_axis=1, + place=place, + lazy=lazy, + only_graph=only_graph) + if op.type() == "depthwise_conv2d": + for in_var in op.all_inputs(): + if graph.is_parameter(in_var): + conv_param = in_var + self._prune_parameter_by_idx( + scope, [conv_param] + self._get_accumulator( + graph, conv_param), + corrected_idxs, + pruned_axis=0, + place=place, + lazy=lazy, + only_graph=only_graph) + elif op.type() == "elementwise_add": + # pruning bias + for in_var in op.all_inputs(): + if graph.is_parameter(in_var): + bias_param = in_var + self._prune_parameter_by_idx( + scope, [bias_param] + self._get_accumulator( + graph, bias_param), + pruned_idxs, + pruned_axis=0, + place=place, + lazy=lazy, + only_graph=only_graph) + elif op.type() == "mul": # pruning fc layer + fc_input = None + fc_param = None + for in_var in op.all_inputs(): + if graph.is_parameter(in_var): + fc_param = in_var + else: + fc_input = in_var + + idx = [] + feature_map_size = 
fc_input.shape()[2] * fc_input.shape()[3] + range_idx = np.array(range(feature_map_size)) + for i in corrected_idxs: + idx += list(range_idx + i * feature_map_size) + corrected_idxs = idx + self._prune_parameter_by_idx( + scope, [fc_param] + self._get_accumulator(graph, fc_param), + corrected_idxs, + pruned_axis=0, + place=place, + lazy=lazy, + only_graph=only_graph) + + elif op.type() == "concat": + concat_inputs = op.all_inputs() + last_op = related_ops[idx - 1] + for out_var in last_op.all_outputs(): + if out_var in concat_inputs: + concat_idx = concat_inputs.index(out_var) + offset = 0 + for ci in range(concat_idx): + offset += concat_inputs[ci].shape()[1] + corrected_idxs = [x + offset for x in pruned_idxs] + elif op.type() == "batch_norm": + bn_inputs = op.all_inputs() + mean = bn_inputs[2] + variance = bn_inputs[3] + alpha = bn_inputs[0] + beta = bn_inputs[1] + self._prune_parameter_by_idx( + scope, [mean] + self._get_accumulator(graph, mean), + corrected_idxs, + pruned_axis=0, + place=place, + lazy=lazy, + only_graph=only_graph) + self._prune_parameter_by_idx( + scope, [variance] + self._get_accumulator(graph, variance), + corrected_idxs, + pruned_axis=0, + place=place, + lazy=lazy, + only_graph=only_graph) + self._prune_parameter_by_idx( + scope, [alpha] + self._get_accumulator(graph, alpha), + corrected_idxs, + pruned_axis=0, + place=place, + lazy=lazy, + only_graph=only_graph) + self._prune_parameter_by_idx( + scope, [beta] + self._get_accumulator(graph, beta), + corrected_idxs, + pruned_axis=0, + place=place, + lazy=lazy, + only_graph=only_graph) + + def _prune_parameters(self, + graph, + scope, + params, + ratios, + place, + lazy=False, + only_graph=False): + """ + Pruning the given parameters. + Args: + graph(GraphWrapper): The graph to be searched. + scope(fluid.core.Scope): The scope storing paramaters to be pruned. + params(list): A list of parameter names to be pruned. + ratios(list): A list of ratios to be used to pruning parameters. 
+ place(fluid.Place): The device place of filter parameters. + pruned_idx(list): The index of elements to be pruned. + lazy(bool): True means setting the pruned elements to zero. + False means cutting down the pruned elements. + only_graph(bool): True means only modifying the graph. + False means modifying graph and variables in scope. + + """ + _logger.debug('\n################################') + _logger.debug('# pruning parameters #') + _logger.debug('################################\n') + _logger.debug( + '|----------------------------------------+----+------------------------------+------------------------------|' + ) + _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format('parameter', 'axis', + 'from', 'to')) + assert len(params) == len(ratios) + self.pruned_list = [[], []] + for param, ratio in zip(params, ratios): + assert isinstance(param, str) or isinstance(param, unicode) + param = graph.var(param) + self._forward_pruning_ralated_params( + graph, + scope, + param, + place, + ratio=ratio, + lazy=lazy, + only_graph=only_graph) + ops = param.outputs() + for op in ops: + if op.type() == 'conv2d': + brother_ops = self._search_brother_ops(graph, op) + for broher in brother_ops: + for p in graph.get_param_by_op(broher): + self._forward_pruning_ralated_params( + graph, + scope, + p, + place, + ratio=ratio, + lazy=lazy, + only_graph=only_graph) + _logger.debug( + '|----------------------------------------+----+------------------------------+------------------------------|' + ) + + def _search_brother_ops(self, graph, op_node): + """ + Search brother operators that was affected by pruning of given operator. + Args: + graph(GraphWrapper): The graph to be searched. + op_node(OpWrapper): The start node for searching. + Returns: + list: A list of operators. 
+ """ + visited = [op_node.idx()] + stack = [] + brothers = [] + for op in graph.next_ops(op_node): + if (op.type() != 'conv2d') and (op.type() != 'fc') and ( + not op._is_bwd_op()): + stack.append(op) + visited.append(op.idx()) + while len(stack) > 0: + top_op = stack.pop() + for parent in graph.pre_ops(top_op): + if parent.idx() not in visited and (not parent._is_bwd_op()): + if ((parent.type == 'conv2d') or (parent.type == 'fc')): + brothers.append(parent) + else: + stack.append(parent) + visited.append(parent.idx()) + + for child in graph.next_ops(top_op): + if (child.type != 'conv2d') and (child.type != 'fc') and ( + child.idx() not in visited) and ( + not child._is_bwd_op()): + stack.append(child) + visited.append(child.idx()) + return brothers + + def _prune_graph(self, graph, target_graph): + """ + Pruning parameters of graph according to target graph. + Args: + graph(GraphWrapper): The graph to be pruned. + target_graph(GraphWrapper): The reference graph. + Return: None + """ + count = 1 + _logger.debug( + '|----+----------------------------------------+------------------------------+------------------------------|' + ) + _logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format('id', 'parammeter', + 'from', 'to')) + for param in target_graph.all_parameters(): + var = graph.var(param.name()) + ori_shape = var.shape() + var.set_shape(param.shape()) + _logger.debug( + '|----+----------------------------------------+------------------------------+------------------------------|' + ) + _logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format( + str(count), + str(param.name()), str(ori_shape), str(param.shape()))) + count += 1 + _logger.debug( + '|----+----------------------------------------+------------------------------+------------------------------|' + ) + + +class UniformPruneStrategy(PruneStrategy): """ - The strategy that pruning weights by threshold or ratio iteratively. + The uniform pruning strategy. The parameters will be pruned by uniform ratio. 
""" def __init__(self, - pruner, - mini_batch_pruning_frequency=1, + pruner=None, start_epoch=0, - end_epoch=10): - super(PruneStrategy, self).__init__(start_epoch, end_epoch) - self.pruner = pruner - self.mini_batch_pruning_frequency = mini_batch_pruning_frequency - - def _triger(self, context): - return (context.batch_id % self.mini_batch_pruning_frequency == 0 and - self.start_epoch <= context.epoch_id < self.end_epoch) - - def on_batch_end(self, context): - if self._triger(context): - prune_program = Program() - with program_guard(prune_program): - for param in context.graph.all_parameters(): - prune_program.global_block().clone_variable(param) - p = prune_program.global_block().var(param.name) - zeros_mask = self.pruner.prune(p) - pruned_param = p * zeros_mask - layers.assign(input=pruned_param, output=param) - context.program_exe.run(prune_program, scope=context.scope) + end_epoch=0, + target_ratio=0.5, + metric_name=None, + pruned_params='conv.*_weights'): + """ + Args: + pruner(slim.Pruner): The pruner used to prune the parameters. + start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0 + end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0 + target_ratio(float): The flops ratio to be pruned from current model. + metric_name(str): The metric used to evaluate the model. + It should be one of keys in out_nodes of graph wrapper. + pruned_params(str): The pattern str to match the parameter names to be pruned. + """ + super(UniformPruneStrategy, self).__init__(pruner, start_epoch, + end_epoch, target_ratio, + metric_name, pruned_params) + + def _get_best_ratios(self, context): + """ + Search a group of ratios for pruning target flops. + """ + _logger.info('_get_best_ratios') + pruned_params = [] + for param in context.eval_graph.all_parameters(): + if re.match(self.pruned_params, param.name()): + pruned_params.append(param.name()) + + min_ratio = 0. + max_ratio = 1. 
+ + flops = context.eval_graph.flops() + model_size = context.eval_graph.numel_params() + + while min_ratio < max_ratio: + ratio = (max_ratio + min_ratio) / 2 + _logger.debug( + '-----------Try pruning ratio: {:.2f}-----------'.format(ratio)) + ratios = [ratio] * len(pruned_params) + self._prune_parameters( + context.eval_graph, + context.scope, + pruned_params, + ratios, + context.place, + only_graph=True) + + pruned_flops = 1 - (float(context.eval_graph.flops()) / flops) + pruned_size = 1 - (float(context.eval_graph.numel_params()) / + model_size) + _logger.debug('Pruned flops: {:.2f}'.format(pruned_flops)) + _logger.debug('Pruned model size: {:.2f}'.format(pruned_size)) + for param in self.param_shape_backup.keys(): + context.eval_graph.var(param).set_shape(self.param_shape_backup[ + param]) + self.param_shape_backup = {} + + if abs(pruned_flops - self.target_ratio) < 1e-2: + break + if pruned_flops > self.target_ratio: + max_ratio = ratio + else: + min_ratio = ratio + _logger.info('Get ratios: {}'.format([round(r, 2) for r in ratios])) + return pruned_params, ratios + + def on_epoch_begin(self, context): + if context.epoch_id == self.start_epoch: + params, ratios = self._get_best_ratios(context) + + self._prune_parameters(context.optimize_graph, context.scope, + params, ratios, context.place) + + model_size = context.eval_graph.numel_params() + flops = context.eval_graph.flops() + _logger.debug('\n################################') + _logger.debug('# pruning eval graph #') + _logger.debug('################################\n') + self._prune_graph(context.eval_graph, context.optimize_graph) + context.optimize_graph.update_groups_of_conv() + context.eval_graph.update_groups_of_conv() + + _logger.info( + '------------------finish pruning--------------------------------' + ) + _logger.info('Pruned size: {:.2f}'.format(1 - (float( + context.eval_graph.numel_params()) / model_size))) + _logger.info('Pruned flops: {:.2f}'.format(1 - (float( + 
context.eval_graph.flops()) / flops))) + # metric = self._eval_graph(context) + # _logger.info('Metric after pruning: {:.2f}'.format(metric)) + _logger.info( + '------------------UniformPruneStrategy.on_compression_begin finish--------------------------------' + ) + + +class SensitivePruneStrategy(PruneStrategy): + """ + Sensitive pruning strategy. Different pruned ratio was applied on each layer. + """ + + def __init__(self, + pruner=None, + start_epoch=0, + end_epoch=0, + delta_rate=0.20, + target_ratio=0.5, + metric_name='top1_acc', + pruned_params='conv.*_weights', + sensitivities_file='./sensitivities.data', + sensitivities={}, + num_steps=1, + eval_rate=None): + """ + Args: + pruner(slim.Pruner): The pruner used to prune the parameters. + start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0. + end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 10. + delta_rate(float): The delta used to generate ratios when calculating sensitivities. default: 0.2 + target_ratio(float): The flops ratio to be pruned from current model. default: 0.5 + metric_name(str): The metric used to evaluate the model. + It should be one of keys in out_nodes of graph wrapper. default: 'top1_acc' + pruned_params(str): The pattern str to match the parameter names to be pruned. default: 'conv.*_weights'. + sensitivities_file(str): The sensitivities file. default: './sensitivities.data' + sensitivities(dict): The user-defined sensitivities. default: {}. + num_steps(int): The number of pruning steps. default: 1. + eval_rate(float): The rate of sampled data used to calculate sensitivities. + None means using all the data. default: None. 
+ """ + super(SensitivePruneStrategy, self).__init__(pruner, start_epoch, + end_epoch, target_ratio, + metric_name, pruned_params) + self.delta_rate = delta_rate + self.pruned_list = [] + self.sensitivities = sensitivities + self.sensitivities_file = sensitivities_file + self.backup = {} + self.param_shape_backup = {} + self.num_steps = num_steps + self.eval_rate = eval_rate + self.pruning_step = 1 - pow((1 - target_ratio), 1.0 / self.num_steps) + + def _save_sensitivities(self, sensitivities, sensitivities_file): + """ + Save sensitivities into file. + """ + with open(sensitivities_file, 'wb') as f: + pickle.dump(sensitivities, f) + + def _load_sensitivities(self, sensitivities_file): + """ + Load sensitivities from file. + """ + sensitivities = {} + if sensitivities_file and os.path.exists(sensitivities_file): + with open(sensitivities_file, 'rb') as f: + if sys.version_info < (3, 0): + sensitivities = pickle.load(f) + else: + sensitivities = pickle.load(f, encoding='bytes') + + for param in sensitivities: + sensitivities[param]['pruned_percent'] = [ + round(p, 2) for p in sensitivities[param]['pruned_percent'] + ] + self._format_sensitivities(sensitivities) + return sensitivities + + def _format_sensitivities(self, sensitivities): + """ + Print formated sensitivities in debug log level. + """ + tb = pt.PrettyTable() + tb.field_names = ["parameter", "size"] + [ + str(round(i, 2)) + for i in np.arange(self.delta_rate, 1, self.delta_rate) + ] + for param in sensitivities: + if len(sensitivities[param]['loss']) == (len(tb.field_names) - 2): + tb.add_row([param, sensitivities[param]['size']] + [ + round(loss, 2) for loss in sensitivities[param]['loss'] + ]) + _logger.debug('\n################################') + _logger.debug('# sensitivities table #') + _logger.debug('################################\n') + _logger.debug(tb) + + def _compute_sensitivities(self, context): + """ + Computing the sensitivities of all parameters. 
+ """ + _logger.info("calling _compute_sensitivities.") + self.param_shape_backup = {} + self.backup = {} + cached_id = np.random.randint(1000) + if self.start_epoch == context.epoch_id: + sensitivities_file = self.sensitivities_file + else: + sensitivities_file = self.sensitivities_file + ".epoch" + str( + context.epoch_id) + sensitivities = self._load_sensitivities(sensitivities_file) + + for param in context.eval_graph.all_parameters(): + if not re.match(self.pruned_params, param.name()): + continue + if param.name() not in sensitivities: + sensitivities[param.name()] = { + 'pruned_percent': [], + 'loss': [], + 'size': param.shape()[0] + } + + metric = None + + for param in sensitivities.keys(): + ratio = self.delta_rate + while ratio < 1: + ratio = round(ratio, 2) + if ratio in sensitivities[param]['pruned_percent']: + _logger.debug('{}, {} has computed.'.format(param, ratio)) + ratio += self.delta_rate + continue + if metric is None: + metric = self._eval_graph(context, self.eval_rate, + cached_id) + # prune parameter by ratio + self._prune_parameters( + context.eval_graph, + context.scope, [param], [ratio], + context.place, + lazy=True) + self.pruned_list[0] + # get accuracy after pruning and update self.sensitivities + pruned_metric = self._eval_graph(context, self.eval_rate, + cached_id) + loss = metric - pruned_metric + _logger.info("pruned param: {}; {}; loss={}".format( + param, ratio, loss)) + for brother in self.pruned_list[0]: + if re.match(self.pruned_params, brother): + if brother not in sensitivities: + sensitivities[brother] = { + 'pruned_percent': [], + 'loss': [] + } + sensitivities[brother]['pruned_percent'].append(ratio) + sensitivities[brother]['loss'].append(loss) + + self._save_sensitivities(sensitivities, sensitivities_file) + + # restore pruned parameters + for param_name in self.backup.keys(): + param_t = context.scope.find_var(param_name).get_tensor() + param_t.set(self.backup[param_name], context.place) + +# pruned_metric = 
self._eval_graph(context) + self.backup = {} + + ratio += self.delta_rate + return sensitivities + + def _get_best_ratios(self, context, sensitivities, target_ratio): + """ + Search a group of ratios for pruning target flops. + """ + _logger.info('_get_best_ratios for pruning ratie: {}'.format( + target_ratio)) + self.param_shape_backup = {} + self.backup = {} + + def func(params, x): + a, b, c, d = params + return a * x * x * x + b * x * x + c * x + d + + def error(params, x, y): + return func(params, x) - y + + def slove_coefficient(x, y): + init_coefficient = [10, 10, 10, 10] + coefficient, loss = leastsq(error, init_coefficient, args=(x, y)) + return coefficient + + min_loss = 0. + max_loss = 0. + + # step 1: fit curve by sensitivities + coefficients = {} + for param in sensitivities: + losses = np.array([0] * 5 + sensitivities[param]['loss']) + precents = np.array([0] * 5 + sensitivities[param][ + 'pruned_percent']) + coefficients[param] = slove_coefficient(precents, losses) + loss = np.max(losses) + max_loss = np.max([max_loss, loss]) + + # step 2: Find a group of ratios by binary searching. + flops = context.eval_graph.flops() + model_size = context.eval_graph.numel_params() + ratios = [] + while min_loss < max_loss: + loss = (max_loss + min_loss) / 2 + _logger.info( + '-----------Try pruned ratios while acc loss={:.4f}-----------'. 
+ format(loss)) + ratios = [] + # step 2.1: Get ratios according to current loss + for param in sensitivities: + coefficient = copy.deepcopy(coefficients[param]) + coefficient[-1] = coefficient[-1] - loss + roots = np.roots(coefficient) + for root in roots: + min_root = 1 + if np.isreal(root) and root > 0 and root < 1: + selected_root = min(root.real, min_root) + ratios.append(selected_root) + _logger.info('Pruned ratios={}'.format( + [round(ratio, 3) for ratio in ratios])) + # step 2.2: Pruning by current ratios + self._prune_parameters( + context.eval_graph, + context.scope, + sensitivities.keys(), + ratios, + context.place, + only_graph=True) + + pruned_flops = 1 - (float(context.eval_graph.flops()) / flops) + pruned_size = 1 - (float(context.eval_graph.numel_params()) / + model_size) + _logger.info('Pruned flops: {:.4f}'.format(pruned_flops)) + _logger.info('Pruned model size: {:.4f}'.format(pruned_size)) + for param in self.param_shape_backup.keys(): + context.eval_graph.var(param).set_shape(self.param_shape_backup[ + param]) + self.param_shape_backup = {} + + # step 2.3: Check whether current ratios is enough + if abs(pruned_flops - target_ratio) < 0.015: + break + if pruned_flops > target_ratio: + max_loss = loss + else: + min_loss = loss + return sensitivities.keys(), ratios + + def _current_pruning_target(self, context): + ''' + Get the target pruning rate in current epoch. 
+ ''' + _logger.info('Left number of pruning steps: {}'.format(self.num_steps)) + if self.num_steps <= 0: + return None + if (self.start_epoch == context.epoch_id) or context.eval_converged( + self.metric_name, 0.005): + self.num_steps -= 1 + return self.pruning_step + + def on_epoch_begin(self, context): + current_ratio = self._current_pruning_target(context) + if current_ratio is not None: + sensitivities = self._compute_sensitivities(context) + params, ratios = self._get_best_ratios(context, sensitivities, + current_ratio) + self._prune_parameters(context.optimize_graph, context.scope, + params, ratios, context.place) + + self.param_shape_backup = {} + self.backup = {} + + model_size = context.eval_graph.numel_params() + flops = context.eval_graph.flops() + _logger.debug('################################') + _logger.debug('# pruning eval graph #') + _logger.debug('################################') + self._prune_graph(context.eval_graph, context.optimize_graph) + context.optimize_graph.update_groups_of_conv() + context.eval_graph.update_groups_of_conv() + context.optimize_graph.compile() # to update the compiled program + context.eval_graph.compile( + for_parallel=False, + for_test=True) # to update the compiled program + _logger.info( + '------------------finish pruning--------------------------------' + ) + _logger.info('Pruned size: {:.3f}'.format(1 - (float( + context.eval_graph.numel_params()) / model_size))) + _logger.info('Pruned flops: {:.3f}'.format(1 - (float( + context.eval_graph.flops()) / flops))) + metric = self._eval_graph(context) + _logger.info('Metric after pruning: {:.2f}'.format(metric)) + _logger.info( + '------------------SensitivePruneStrategy.on_epoch_begin finish--------------------------------' + ) diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py index ca72bcb6f6004c18f3ec794850e0aeaecb92d7ac..506b8fbe1de2e0f8a036f591bd2baacd5759c9c8 100644 --- 
a/python/paddle/fluid/contrib/slim/prune/pruner.py +++ b/python/paddle/fluid/contrib/slim/prune/pruner.py @@ -13,9 +13,10 @@ # limitations under the License. import numpy as np +import collections from .... import layers -__all__ = ['Pruner', 'MagnitudePruner', 'RatioPruner'] +__all__ = ['Pruner', 'StructurePruner'] class Pruner(object): @@ -30,54 +31,77 @@ class Pruner(object): pass -class MagnitudePruner(Pruner): +class StructurePruner(Pruner): """ - Pruner used to pruning a parameter by threshold. + Pruner used to pruning parameters by groups. """ - def __init__(self, threshold): - self.threshold = threshold - - def prune(self, param, threshold=None): - if threshold is None: - thres = layers.fill_constant( - shape=[1], dtype='float32', value=self.threshold) - else: - thres = threshold - zeros_mask = layers.less_than(x=param, y=thres) - return zeros_mask - - -class RatioPruner(Pruner): - """ - Pruner used to pruning a parameter by ratio. - """ + def __init__(self, pruning_axis, criterions): + """ + Args: + pruning_axis(dict): The key is the name of parameter to be pruned, + '*' means all the parameters. + The value is the axis to be used. Given a parameter + with shape [3, 4], the result of pruning 50% on aixs 1 + is a parameter with shape [3, 2]. + criterions(dict): The key is the name of parameter to be pruned, + '*' means all the parameters. + The value is the criterion used to sort groups for pruning. + It only supports 'l1_norm' currently. + """ + self.pruning_axis = pruning_axis + self.criterions = criterions - def __init__(self, ratios=None): + def cal_pruned_idx(self, name, param, ratio, axis=None): """ + Calculate the index to be pruned on axis by given pruning ratio. Args: - ratios: dict with pair (paramer_name, pruned_ratio). + name(str): The name of parameter to be pruned. + param(np.array): The data of parameter to be pruned. + ratio(float): The ratio to be pruned. + axis(int): The axis to be used for pruning given parameter. 
+ If it is None, the value in self.pruning_axis will be used. + default: None. + Returns: + list: The indexes to be pruned on axis. """ - self.ratios = ratios + criterion = self.criterions[ + name] if name in self.criterions else self.criterions['*'] + if axis is None: + assert self.pruning_axis is not None, "pruning_axis should set if axis is None." + axis = self.pruning_axis[ + name] if name in self.pruning_axis else self.pruning_axis['*'] + prune_num = int(round(param.shape[axis] * ratio)) + reduce_dims = [i for i in range(len(param.shape)) if i != axis] + if criterion == 'l1_norm': + criterions = np.sum(np.abs(param), axis=tuple(reduce_dims)) + pruned_idx = criterions.argsort()[:prune_num] + return pruned_idx - def prune(self, param, ratio=None): + def prune_tensor(self, tensor, pruned_idx, pruned_axis, lazy=False): """ + Pruning a array by indexes on given axis. Args: - ratio: `ratio=40%` means pruning (1 - 40%) weights to zero. + tensor(numpy.array): The target array to be pruned. + pruned_idx(list): The indexes to be pruned. + pruned_axis(int): The axis of given array to be pruned on. + lazy(bool): True means setting the pruned elements to zero. + False means remove the pruned elements from memory. + default: False. + Returns: + numpy.array: The pruned array. 
""" - if ratio is None: - rat = self.ratios[ - param.name] if param.name in self.ratios else self.ratios['*'] - else: - rat = ratio - if rat < 1.0: - k = max(int(rat * np.prod(param.shape)), 1) - param_vec = layers.reshape(x=param, shape=[1, -1]) - param_topk, _ = layers.topk(param_vec, k=k) - threshold = layers.slice( - param_topk, axes=[1], starts=[-1], ends=[k]) - threshold = layers.reshape(x=threshold, shape=[1]) - zeros_mask = layers.less_than(x=param, y=threshold) + mask = np.zeros(tensor.shape[pruned_axis], dtype=bool) + mask[pruned_idx] = True + + def func(data): + return data[~mask] + + def lazy_func(data): + data[mask] = 0 + return data + + if lazy: + return np.apply_along_axis(lazy_func, pruned_axis, tensor) else: - zeros_mask = layers.ones(param.shape) - return zeros_mask + return np.apply_along_axis(func, pruned_axis, tensor) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 919db4c78e52edc9a8be44744f4b7704e3f62de4..5dcef506711b78c2aef30d16719f8766359ae8f3 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -22,6 +22,7 @@ from ....framework import IrGraph from ....framework import IrNode from ....framework import Program from ....initializer import Constant +from ....initializer import NumpyArrayInitializer from .... import unique_name __all__ = [ @@ -54,14 +55,15 @@ class QuantizationTransformPass(object): the bias is not quantized. activation_bits (int): quantization bit number for activation. activation_quantize_type (str): quantization type for activation, - now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode, - the quantization scale will be calculated dynamically each step - in both training and testing period. If use 'range_abs_max', - a static quantization scale will be calculated during training - and used in inference. 
+ now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'. + If use 'abs_max' mode, the quantization scale will be calculated + dynamically each step in both training and testing period. If use + 'range_abs_max', a static quantization scale will be calculated + during training and used in inference. weight_quantize_type (str): quantization type for weights, - support 'abs_max'. The 'range_abs_max' usually is not used for - weight, since weights are fixed once the model is well trained. + support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max' + usually is not used for weight, since weights are fixed once the + model is well trained. window_size (int): the window size for 'range_abs_max' quantization. Examples: @@ -84,7 +86,11 @@ class QuantizationTransformPass(object): self._weight_bits = weight_bits self._activation_bits = activation_bits - quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max'] + quant_type = [ + 'abs_max', 'channel_wise_abs_max', 'range_abs_max', + 'moving_average_abs_max' + ] + assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'." if activation_quantize_type not in quant_type: raise ValueError( "Unknown activation_quantize_type : '%s'. It can only be ", @@ -93,7 +99,7 @@ class QuantizationTransformPass(object): if weight_quantize_type not in quant_type: raise ValueError( "Unknown weight_quantize_type: '%s'. 
It can only be ", - "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", + "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", str(weight_quantize_type)) self._activation_quantize_type = activation_quantize_type @@ -103,6 +109,7 @@ class QuantizationTransformPass(object): self._need_initialized = collections.OrderedDict() self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] + self._conv_ops = ['conv2d', 'depthwise_conv2d'] self._quantizable_grad_ops = [ '%s_grad' % (op) for op in self._quantizable_ops ] @@ -135,10 +142,26 @@ class QuantizationTransformPass(object): else self._activation_bits quant_type = self._weight_quantize_type if var_node.name() \ in persistable_vars else self._activation_quantize_type - quant_var_node, scale_var_node = self._insert_quant_op( - graph, var_node, quant_bits, quant_type) - dequant_var_node = self._insert_dequant_op( - graph, quant_var_node, scale_var_node, quant_bits) + if quant_type == 'channel_wise_abs_max': + assert var_node.name( + ) in persistable_vars, "'channel_wise_abs_max' can only be applied on weights." 
+ if op.name() in self._conv_ops: + quant_var_node, scale_var_node = self._insert_channel_quant_op( + graph, var_node, quant_bits) + dequant_var_node = self._insert_channel_dequant_op( + graph, quant_var_node, [scale_var_node], + [quant_bits]) + else: + quant_var_node, scale_var_node = self._insert_quant_op( + graph, var_node, quant_bits, 'abs_max') + dequant_var_node = self._insert_dequant_op( + graph, quant_var_node, scale_var_node, + quant_bits) + else: + quant_var_node, scale_var_node = self._insert_quant_op( + graph, var_node, quant_bits, quant_type) + dequant_var_node = self._insert_dequant_op( + graph, quant_var_node, scale_var_node, quant_bits) dequantized_vars[var_node.name()] = dequant_var_node graph.update_input_link(var_node, dequant_var_node, op) @@ -244,7 +267,7 @@ class QuantizationTransformPass(object): scale_var_node = graph.create_var_node( name=self._quantized_scale_name(var_node.name()), var_type=var_node.type(), - shape=var_node.shape(), + shape=[1], var_dtype=var_node.dtype()) quant_op_node = graph.create_op_node( op_type='fake_quantize_abs_max', @@ -384,6 +407,36 @@ class QuantizationTransformPass(object): return quant_var_node, scale_out_node + def _insert_channel_quant_op(self, graph, var_node, quant_bits): + """ + Insert fake_channel_wise_quantize_abs_max op in the graph. 
+ """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + quant_var_node = graph.create_var_node( + name=self._quantized_var_name(var_node.name()), + var_type=var_node.type(), + shape=var_node.shape(), + var_dtype=var_node.dtype()) + scale_var_node = graph.create_var_node( + name=self._quantized_scale_name(var_node.name()), + var_type=var_node.type(), + shape=[var_node.shape()[0]], + var_dtype=var_node.dtype()) + quant_op_node = graph.create_op_node( + op_type='fake_channel_wise_quantize_abs_max', + attrs={ + 'bit_length': quant_bits, + 'op_role': core.op_proto_and_checker_maker.OpRole.Forward + }, + inputs={'X': var_node}, + outputs={'Out': quant_var_node, + 'OutScale': scale_var_node}) + graph.link_to(var_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + graph.link_to(quant_op_node, scale_var_node) + return quant_var_node, scale_var_node + def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits): """ Insert fake_dequantize_op in the graph. @@ -410,6 +463,33 @@ class QuantizationTransformPass(object): graph.link_to(dequant_op_node, dequant_var_node) return dequant_var_node + def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes, + quant_bits): + """ + Insert fake_channel_wise_dequantize_max_abs in the graph. 
+ """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + dequant_var_node = graph.create_var_node( + name=self._dequantized_var_name(var_node.name()), + var_type=var_node.type(), + shape=var_node.shape(), + var_dtype=var_node.dtype()) + dequant_op_node = graph.create_op_node( + op_type='fake_channel_wise_dequantize_max_abs', + attrs={ + 'quant_bits': quant_bits, + 'op_role': core.op_proto_and_checker_maker.OpRole.Forward + }, + inputs={'X': var_node, + 'Scales': scale_var_nodes}, + outputs={'Out': dequant_var_node}) + graph.link_to(var_node, dequant_op_node) + for scale_n in scale_var_nodes: + graph.link_to(scale_n, dequant_op_node) + graph.link_to(dequant_op_node, dequant_var_node) + return dequant_var_node + def _quantized_var_name(self, var_name): """ Return quantized variable name for the input `var_name`. @@ -442,7 +522,7 @@ class QuantizationFreezePass(object): place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the weight tensors. weight_bits (int): quantization bit number for weights. activation_bits (int): quantization bit number for activation. - weight_quantize_type (str): quantization type for weights, support 'abs_max'. + weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained. 
""" @@ -463,11 +543,15 @@ class QuantizationFreezePass(object): self._activation_bits = activation_bits self._weight_quantize_type = weight_quantize_type self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] + self._conv_ops = ['conv2d', 'depthwise_conv2d'] self._fake_quant_op_names = [ 'fake_quantize_abs_max', 'fake_quantize_range_abs_max', - 'fake_quantize_moving_average_abs_max' + 'fake_quantize_moving_average_abs_max', + 'fake_channel_wise_quantize_abs_max' + ] + self._fake_dequant_op_names = [ + 'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs' ] - self._fake_dequant_op_names = ['fake_dequantize_max_abs'] self._op_input_rename_map = collections.OrderedDict() self._op_output_rename_map = collections.OrderedDict() self._var_scale_map = collections.OrderedDict() @@ -489,20 +573,27 @@ class QuantizationFreezePass(object): if self._weight_quantize_type == 'abs_max': param = self._load_var(input_arg_name) scale_v = np.max(np.abs(param)) + elif self._weight_quantize_type == 'channel_wise_abs_max': + param = self._load_var(input_arg_name) + if len(param.shape) == 4: # conv2d or depthwise_conv2d + scale_v = [] + for i in range(param.shape[0]): + scale_v.append(np.max(np.abs(param[i]))) + else: + scale_v = np.max(np.abs(param)) else: scale_v = self._load_var( op_node.output('OutScale')[0])[0] self._var_scale_map[input_arg_name] = scale_v - else: - scale_v = graph.var_node(op_node.output('OutScale')[0]) - self._var_scale_map[input_arg_name] = scale_v - if input_arg_name in persistable_vars: self._remove_fake_quant_and_dequant_op(graph, op_node) # quantize weight and restore param_v = self._load_var(input_arg_name) quantized_param_v = self._quant(param_v, scale_v, self._weight_bits) self._restore_var(input_arg_name, quantized_param_v) + else: + scale_v = graph.var_node(op_node.output('OutScale')[0]) + self._var_scale_map[input_arg_name] = scale_v ops = graph.all_op_nodes() for op_node in ops: @@ -514,7 +605,10 @@ class 
QuantizationFreezePass(object): for op_node in ops: op_name = op_node.name() if op_name in self._quantizable_ops: - self._insert_post_dequant_op(graph, op_node) + if self._weight_quantize_type == 'channel_wise_abs_max' and op_name in self._conv_ops: + self._insert_post_channel_dequant_op(graph, op_node) + else: + self._insert_post_dequant_op(graph, op_node) for op_node in ops: # insert dequant_op after fc/conv, need to rename inputs of the followed ops @@ -538,9 +632,73 @@ class QuantizationFreezePass(object): self._op_input_rename_map[k] = self._op_input_rename_map[v] graph.safe_remove_nodes(op_node) + def _insert_post_channel_dequant_op(self, graph, op_node): + persistable_vars = [p.name() for p in graph.all_persistable_nodes()] + for var_node in op_node.inputs: + name = var_node.name() + if name in self._op_input_rename_map: + old_in = graph.var_node(name) + new_in = graph.var_node(self._op_input_rename_map[name]) + new_in.clear_outputs() + graph.update_input_link(old_in, new_in, op_node) + original_var_name = self._original_var_name(name) + scale_v = self._var_scale_map[original_var_name] + if original_var_name in persistable_vars: + assert isinstance( + scale_v, + list), 'The scale of parameter %s is not a list.' % ( + original_var_name) + channel_scale = np.array(scale_v) + else: + assert isinstance(scale_v, IrNode) + scale_var_node = self._var_scale_map[original_var_name] + + if len(op_node.outputs) != 1: + raise ValueError("Only support one output, but op %s has" + " more than one output." 
% (op_node.name())) + + output_var_node = op_node.outputs[0] + weight_scale_node = graph.create_persistable_node( + name=unique_name.generate('channel_scale'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[channel_scale.shape[0]], + var_dtype=output_var_node.dtype()) + init_program = Program() + weight_scale_var = init_program.global_block().create_var( + name=weight_scale_node.name(), + shape=weight_scale_node.shape(), + dtype=weight_scale_node.dtype(), + type=weight_scale_node.type(), + lod_level=weight_scale_node.var().lod_level(), + persistable=weight_scale_node.persistable()) + initializer = NumpyArrayInitializer(value=channel_scale) + initializer(weight_scale_var, init_program.global_block()) + exe = Executor(self._place) + exe.run(program=init_program, scope=self._scope) + dequant_var_node = graph.create_var_node( + name=self._dequantized_var_name(output_var_node.name()), + var_type=output_var_node.type(), + shape=output_var_node.shape(), + var_dtype=output_var_node.dtype()) + dequant_op_node = graph.create_op_node( + op_type='fake_channel_wise_dequantize_max_abs', + attrs={ + 'quant_bits': [self._weight_bits, self._activation_bits], + 'op_role': core.op_proto_and_checker_maker.OpRole.Forward + }, + inputs={ + 'X': output_var_node, + 'Scales': [weight_scale_node, scale_var_node] + }, + outputs={'Out': dequant_var_node}) + graph.link_to(output_var_node, dequant_op_node) + graph.link_to(scale_var_node, dequant_op_node) + graph.link_to(weight_scale_node, dequant_op_node) + graph.link_to(dequant_op_node, dequant_var_node) + self._op_output_rename_map[output_var_node.name()] = dequant_var_node + return dequant_var_node + def _insert_post_dequant_op(self, graph, op_node): - max_range = None - scale_var_node = None persistable_vars = [p.name() for p in graph.all_persistable_nodes()] for var_node in op_node.inputs: name = var_node.name() @@ -637,7 +795,12 @@ class QuantizationFreezePass(object): or isinstance(v, np.float64) def _quant(self, x, scale, 
num_bits): - return np.round(x / scale * ((1 << (num_bits - 1)) - 1)) + if isinstance(scale, list): + for i, s in enumerate(scale): + x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1)) + return x + else: + return np.round(x / scale * ((1 << (num_bits - 1)) - 1)) class ConvertToInt8Pass(object): @@ -731,9 +894,13 @@ class TransformForMobilePass(object): def __init__(self): self._fake_quant_op_names = [ - 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max', + 'fake_quantize_moving_average_abs_max', + 'fake_channel_wise_quantize_abs_max' + ] + self._fake_dequant_op_names = [ + 'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs' ] - self._fake_dequant_op_names = ['fake_dequantize_max_abs'] def apply(self, graph): """ diff --git a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml deleted file mode 100644 index d9b49029d3e34d487ad65fe0f7e54e2cee1d5838..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml +++ /dev/null @@ -1,29 +0,0 @@ -version: 1.0 -include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"] -pruners: - pruner_1: - class: 'RatioPruner' - ratios: - 'conv1_1.w': 0.3 - 'conv1_2.w': 0.4 - '*': 0.9 - group_dims: - '*': [1, 2, 3] - criterions: - '*': 'l1-norm' -strategies: - strategy_1: - class: 'SensitivePruneStrategy' - pruner: 'pruner_2' - start_epoch: 0 - end_epoch: 10 - delta_rate: 0.20 - acc_loss_threshold: 0.2 - sensitivities: - 'conv1_1.w': 0.4 - -compress_pass: - class: 'CompressPass' - epoch: 100 - strategies: - - strategy_1 diff --git a/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..570c60026d55c242106f7e2dc5c3f47bfbdbe884 --- /dev/null +++ 
b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml @@ -0,0 +1,34 @@ +#start_epoch: The 'on_epoch_begin' function will be called in start_epoch. default: 0. +#end_epoch: The 'on_epoch_end' function will be called in end_epoch. default: 10. +#delta_rate: The delta used to generate ratios when calculating sensitivities. +#target_ratio: The flops ratio to be pruned from current model. +#metric_name: The metric used to evaluate the model. +#pruned_params: The pattern str to match the parameter names to be pruned. +#sensitivities_file: The sensitivities file. +#num_steps: The number of pruning steps. +#eval_rate: The rate of sampled data used to calculate sensitivities. +version: 1.0 +pruners: + pruner_1: + class: 'StructurePruner' + pruning_axis: + '*': 0 + criterions: + '*': 'l1_norm' +strategies: + sensitive_pruning_strategy: + class: 'SensitivePruneStrategy' + pruner: 'pruner_1' + start_epoch: 0 + delta_rate: 0.1 + target_ratio: 0.3 + num_steps: 1 + eval_rate: 0.5 + pruned_params: '.*_sep_weights' + sensitivities_file: 'mobilenet_acc_top1_sensitive.data' + metric_name: 'acc_top1' +compressor: + epoch: 120 + checkpoint_path: './checkpoints/' + strategies: + - sensitive_pruning_strategy diff --git a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml deleted file mode 100644 index 235092c595bf7c653221c7fe2b381fecf487fa49..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml +++ /dev/null @@ -1,12 +0,0 @@ -version: 1.0 -pruners: - pruner_2: - class: 'RatioPruner' - ratios: - 'conv1_1.w': 0.5 - 'conv1_2.w': 0.2 - '*': 0.7 - group_dims: - '*': [1, 2, 3] - criterions: - '*': 'l1-norm' diff --git a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml deleted file mode 100644 index cd2ef9eb56ddbc1367ce2e3b413372fbcd542bde..0000000000000000000000000000000000000000 
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml +++ /dev/null @@ -1,12 +0,0 @@ -version: 1.0 -pruners: - pruner_3: - class: 'RatioPruner' - ratios: - 'conv1_1.w': 0.5 - 'conv1_2.w': 0.2 - '*': 0.7 - group_dims: - '*': [1, 2, 3] - criterions: - '*': 'l1-norm' diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/__init__.py b/python/paddle/fluid/contrib/slim/tests/filter_pruning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml new file mode 100644 index 0000000000000000000000000000000000000000..232276feac5023c45d594015cf7084b000cd5b4a --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml @@ -0,0 +1,34 @@ +#start_epoch: The 'on_epoch_begin' function will be called in start_epoch. default: 0. +#end_epoch: The 'on_epoch_end' function will be called in end_epoch. default: 10. +#delta_rate: The delta used to generate ratios when calculating sensitivities. +#target_ratio: The flops ratio to be pruned from current model. +#metric_name: The metric used to evaluate the model. 
+#pruned_params: The pattern str to match the parameter names to be pruned. +#sensitivities_file: The sensitivities file. +#num_steps: The number of pruning steps. +#eval_rate: The rate of sampled data used to calculate sensitivities. +version: 1.0 +pruners: + pruner_1: + class: 'StructurePruner' + pruning_axis: + '*': 0 + criterions: + '*': 'l1_norm' +strategies: + sensitive_pruning_strategy: + class: 'SensitivePruneStrategy' + pruner: 'pruner_1' + start_epoch: 1 + delta_rate: 0.2 + target_ratio: 0.08 + num_steps: 1 + eval_rate: 0.5 + pruned_params: 'conv6_sep_weights' + sensitivities_file: 'mobilenet_acc_top1_sensitive.data' + metric_name: 'acc_top1' +compressor: + epoch: 2 + checkpoint_path: './checkpoints/' + strategies: + - sensitive_pruning_strategy diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/mobilenet.py b/python/paddle/fluid/contrib/slim/tests/filter_pruning/mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..0148325a642a2bcbebd3d7794056ff2778a3992d --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/mobilenet.py @@ -0,0 +1,210 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle.fluid as fluid +from paddle.fluid.initializer import MSRA +from paddle.fluid.param_attr import ParamAttr + +__all__ = ['MobileNet'] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class MobileNet(): + def __init__(self): + self.params = train_parameters + + def net(self, input, class_dim=1000, scale=1.0): + # conv1: 112x112 + input = self.conv_bn_layer( + input, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1, + name="conv1") + + # 56x56 + input = self.depthwise_separable( + input, + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale, + name="conv2_1") + + input = self.depthwise_separable( + input, + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=2, + scale=scale, + name="conv2_2") + + # 28x28 + input = self.depthwise_separable( + input, + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale, + name="conv3_1") + + input = self.depthwise_separable( + input, + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=2, + scale=scale, + name="conv3_2") + + # 14x14 + input = self.depthwise_separable( + input, + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale, + name="conv4_1") + + input = self.depthwise_separable( + input, + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=2, + scale=scale, + name="conv4_2") + + # 14x14 + for i in range(5): + input = self.depthwise_separable( + input, + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + scale=scale, + name="conv5" + "_" + str(i + 1)) + # 7x7 + input = self.depthwise_separable( 
+ input, + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=2, + scale=scale, + name="conv5_6") + + input = self.depthwise_separable( + input, + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=1, + scale=scale, + name="conv6") + + input = fluid.layers.pool2d( + input=input, + pool_size=0, + pool_stride=1, + pool_type='avg', + global_pooling=True) + + output = fluid.layers.fc(input=input, + size=class_dim, + act='softmax', + param_attr=ParamAttr( + initializer=MSRA(), name="fc7_weights"), + bias_attr=ParamAttr(name="fc7_offset")) + return output + + def conv_bn_layer(self, + input, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='relu', + use_cudnn=True, + name=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + act=None, + use_cudnn=use_cudnn, + param_attr=ParamAttr( + initializer=MSRA(), name=name + "_weights"), + bias_attr=False) + bn_name = name + "_bn" + return fluid.layers.batch_norm( + input=conv, + act=act, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(name=bn_name + "_offset"), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def depthwise_separable(self, + input, + num_filters1, + num_filters2, + num_groups, + stride, + scale, + name=None): + depthwise_conv = self.conv_bn_layer( + input=input, + filter_size=3, + num_filters=int(num_filters1 * scale), + stride=stride, + padding=1, + num_groups=int(num_groups * scale), + use_cudnn=False, + name=name + "_dw") + + pointwise_conv = self.conv_bn_layer( + input=depthwise_conv, + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0, + name=name + "_sep") + return pointwise_conv diff --git a/python/paddle/fluid/contrib/slim/tests/test_factory.py b/python/paddle/fluid/contrib/slim/tests/test_factory.py index 
2fc72b6475e6bdd977dafb57696046a1100d0087..90eb8bd4b3caa44880f6df21c7f9f6d460655a8c 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_factory.py +++ b/python/paddle/fluid/contrib/slim/tests/test_factory.py @@ -12,29 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid.contrib.slim import ConfigFactory +from paddle.fluid.contrib.slim.core import ConfigFactory import unittest class TestFactory(unittest.TestCase): - def test_parse(self): - factory = ConfigFactory('./configs/config.yaml') + def test_parse_pruning(self): + factory = ConfigFactory('./configs/filter_pruning.yaml') - pruner = factory.instance('pruner_1') - self.assertEquals(pruner.ratios['conv1_1.w'], 0.3) + pruner_1 = factory.instance('pruner_1') + self.assertEquals(pruner_1.pruning_axis['*'], 0) + self.assertEquals(pruner_1.criterions['*'], 'l1_norm') - pruner = factory.instance('pruner_2') - self.assertEquals(pruner.ratios['*'], 0.7) + strategy = factory.instance('sensitive_pruning_strategy') + pruner_1 = strategy.pruner + self.assertEquals(pruner_1.criterions['*'], 'l1_norm') - strategy = factory.instance('strategy_1') - pruner = strategy.pruner - self.assertEquals(pruner.ratios['*'], 0.7) - - compress_pass = factory.get_compress_pass() - self.assertEquals(compress_pass.epoch, 100) - - strategy = compress_pass.strategies[0] - self.assertEquals(strategy.delta_rate, 0.2) + self.assertEquals(strategy.start_epoch, 0) + self.assertEquals(strategy.sensitivities_file, + 'mobilenet_acc_top1_sensitive.data') if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py new file mode 100644 index 0000000000000000000000000000000000000000..d73ee27779a0d17a0f60df645a6d2946d665c01e --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py @@ -0,0 +1,89 @@ +# copyright (c) 2019 paddlepaddle authors. 
all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +import paddle +import unittest +import paddle.fluid as fluid +from filter_pruning.mobilenet import MobileNet +from paddle.fluid.contrib.slim.core import Compressor +from paddle.fluid.contrib.slim.graph import GraphWrapper + + +class TestFilterPruning(unittest.TestCase): + def test_compression(self): + """ + Model: mobilenet_v1 + data: mnist + step1: Training one epoch + step2: pruning flops + step3: fine-tune one epoch + step4: check top1_acc. 
+ """ + if not fluid.core.is_compiled_with_cuda(): + return + class_dim = 10 + image_shape = [1, 28, 28] + image = fluid.layers.data( + name='image', shape=image_shape, dtype='float32') + image.stop_gradient = False + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = MobileNet().net(input=image, class_dim=class_dim) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + val_program = fluid.default_main_program().clone(for_test=False) + + cost = fluid.layers.cross_entropy(input=out, label=label) + avg_cost = fluid.layers.mean(x=cost) + + optimizer = fluid.optimizer.Momentum( + momentum=0.9, + learning_rate=0.01, + regularization=fluid.regularizer.L2Decay(4e-5)) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128) + + val_feed_list = [('img', image.name), ('label', label.name)] + val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', + acc_top5.name)] + + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + train_feed_list = [('img', image.name), ('label', label.name)] + train_fetch_list = [('loss', avg_cost.name)] + + com_pass = Compressor( + place, + fluid.global_scope(), + fluid.default_main_program(), + train_reader=train_reader, + train_feed_list=train_feed_list, + train_fetch_list=train_fetch_list, + eval_program=val_program, + eval_reader=val_reader, + eval_feed_list=val_feed_list, + eval_fetch_list=val_fetch_list, + train_optimizer=optimizer) + com_pass.config('./filter_pruning/compress.yaml') + eval_graph = com_pass.run() + self.assertTrue( + abs((com_pass.context.eval_results['acc_top1'][-1] - 0.969) / 0.969) + < 0.02) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py new 
file mode 100644 index 0000000000000000000000000000000000000000..ad82aa941183d72353dae19527b21286d6473a63 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py @@ -0,0 +1,140 @@ +# copyright (c) 2019 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from __future__ import print_function +import unittest +import paddle.fluid as fluid +import six +import numpy as np +from paddle.fluid.contrib.slim.graph import GraphWrapper +from paddle.fluid import core + + +def residual_block(num): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + data = fluid.layers.data(name='image', shape=[1, 8, 8], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + data.stop_gradinet = False + hidden = data + for _ in six.moves.xrange(num): + conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) + short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) + hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + + loss = fluid.layers.cross_entropy(input=fc, label=label) + loss = fluid.layers.mean(loss) + return data, label, loss + + +class 
TestGraphWrapper(unittest.TestCase): + def build_program(self): + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + image, label, self.loss = residual_block(2) + eval_program = main.clone() + opt = fluid.optimizer.SGD(learning_rate=0.001) + opt.minimize(self.loss) + self.scope = core.Scope() + exe = fluid.Executor(place) + exe.run(startup, scope=self.scope) + self.eval_graph = GraphWrapper( + program=eval_program, + in_nodes={'image': image.name, + 'label': label.name}, + out_nodes={'loss': self.loss.name}) + self.train_graph = GraphWrapper( + program=main, + in_nodes={'image': image.name, + 'label': label.name}, + out_nodes={'loss': self.loss.name}) + + def test_all_parameters(self): + self.build_program() + self.assertEquals(len(self.train_graph.all_parameters()), 24) + + def test_all_vars(self): + self.build_program() + self.assertEquals(len(self.train_graph.vars()), 90) + + def test_numel_params(self): + self.build_program() + self.assertEquals(self.train_graph.numel_params(), 13258) + + def test_compile(self): + self.build_program() + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + self.train_graph.compile() + exe.run(self.train_graph.compiled_graph, + scope=self.scope, + feed={ + 'image': + np.random.randint(0, 40, [16, 1, 8, 8]).astype('float32'), + 'label': np.random.randint(0, 10, [16, 1]).astype('int64') + }) + + def test_pre_and_next_ops(self): + self.build_program() + for op in self.train_graph.ops(): + for next_op in self.train_graph.next_ops(op): + self.assertTrue(op in self.train_graph.pre_ops(next_op)) + + def test_get_optimize_graph(self): + self.build_program() + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + opt = fluid.optimizer.SGD(learning_rate=0.001) + train_graph = 
self.eval_graph.get_optimize_graph( + opt, place, self.scope, no_grad_var_names=['image']) + self.assertEquals(len(self.train_graph.ops()), len(train_graph.ops())) + exe = fluid.Executor(place) + train_graph.compile() + image = np.random.randint(0, 225, [16, 1, 8, 8]).astype('float32') + label = np.random.randint(0, 10, [16, 1]).astype('int64') + exe.run(train_graph.compiled_graph, + scope=self.scope, + feed={'image': image, + 'label': label}) + + def test_flops(self): + self.build_program() + self.assertEquals(self.train_graph.flops(), 354624) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index 0b4b2a285f5de2596b5d30c6b2a6213762a64e7a..c7feca0b82606cdba9a05fb6de821aa6d347d4e6 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -127,7 +127,7 @@ class TestQuantizationTransformPass(unittest.TestCase): arg_name.endswith('.quantized.dequantized')) self.assertTrue(arg_name in quantized_ops) - def linear_fc_quant(self, quant_type, for_ci=False): + def linear_fc_quant(self, activation_quant_type, for_ci=False): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -140,14 +140,15 @@ class TestQuantizationTransformPass(unittest.TestCase): transform_pass = QuantizationTransformPass( scope=fluid.global_scope(), place=place, - activation_quantize_type=quant_type) + activation_quantize_type=activation_quant_type) transform_pass.apply(graph) if not for_ci: marked_nodes = set() for op in graph.all_op_nodes(): if op.name().find('quantize') > -1: marked_nodes.add(op) - graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes) + graph.draw('.', 'quantize_fc_' + activation_quant_type, + marked_nodes) program = graph.to_program() self.check_program(transform_pass, program) val_graph = 
IrGraph(core.Graph(program.desc), for_test=False) @@ -156,7 +157,8 @@ class TestQuantizationTransformPass(unittest.TestCase): for op in val_graph.all_op_nodes(): if op.name().find('quantize') > -1: val_marked_nodes.add(op) - val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes) + val_graph.draw('.', 'val_fc_' + activation_quant_type, + val_marked_nodes) def test_linear_fc_quant_abs_max(self): self.linear_fc_quant('abs_max', for_ci=True) @@ -167,7 +169,7 @@ class TestQuantizationTransformPass(unittest.TestCase): def test_linear_fc_quant_moving_average_abs_max(self): self.linear_fc_quant('moving_average_abs_max', for_ci=True) - def residual_block_quant(self, quant_type, for_ci=False): + def residual_block_quant(self, activation_quant_type, for_ci=False): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -180,14 +182,15 @@ class TestQuantizationTransformPass(unittest.TestCase): transform_pass = QuantizationTransformPass( scope=fluid.global_scope(), place=place, - activation_quantize_type=quant_type) + activation_quantize_type=activation_quant_type) transform_pass.apply(graph) if not for_ci: marked_nodes = set() for op in graph.all_op_nodes(): if op.name().find('quantize') > -1: marked_nodes.add(op) - graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes) + graph.draw('.', 'quantize_residual_' + activation_quant_type, + marked_nodes) program = graph.to_program() self.check_program(transform_pass, program) val_graph = IrGraph(core.Graph(program.desc), for_test=False) @@ -196,7 +199,8 @@ class TestQuantizationTransformPass(unittest.TestCase): for op in val_graph.all_op_nodes(): if op.name().find('quantize') > -1: val_marked_nodes.add(op) - val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes) + val_graph.draw('.', 'val_residual_' + activation_quant_type, + val_marked_nodes) def test_residual_block_abs_max(self): self.residual_block_quant('abs_max', for_ci=True) @@ -209,7 +213,12 @@ class 
TestQuantizationTransformPass(unittest.TestCase): class TestQuantizationFreezePass(unittest.TestCase): - def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False): + def freeze_graph(self, + use_cuda, + seed, + activation_quant_type, + weight_quant_type='abs_max', + for_ci=False): def build_program(main, startup, is_test): main.random_seed = seed startup.random_seed = seed @@ -243,7 +252,12 @@ class TestQuantizationFreezePass(unittest.TestCase): with fluid.scope_guard(scope): exe.run(startup) transform_pass = QuantizationTransformPass( - scope=scope, place=place, activation_quantize_type=quant_type) + scope=scope, + place=place, + activation_quantize_type=activation_quant_type, + weight_quantize_type=weight_quant_type) + #transform_pass = QuantizationTransformPass( + # scope=scope, place=place, activation_quantize_type=activation_quant_type) transform_pass.apply(main_graph) transform_pass.apply(test_graph) dev_name = '_gpu_' if use_cuda else '_cpu_' @@ -252,12 +266,14 @@ class TestQuantizationFreezePass(unittest.TestCase): for op in main_graph.all_op_nodes(): if op.name().find('quantize') > -1: marked_nodes.add(op) - main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes) + main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_' + + weight_quant_type, marked_nodes) marked_nodes = set() for op in test_graph.all_op_nodes(): if op.name().find('quantize') > -1: marked_nodes.add(op) - test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes) + test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_' + + weight_quant_type, marked_nodes) build_strategy = fluid.BuildStrategy() build_strategy.memory_optimize = False @@ -282,8 +298,9 @@ class TestQuantizationFreezePass(unittest.TestCase): feed=feeder.feed(data), fetch_list=[loss]) if not for_ci: - print('{}: {}'.format('loss' + dev_name + quant_type, - loss_v)) + print('{}: {}'.format('loss' + dev_name + + activation_quant_type + '_' + + weight_quant_type, loss_v)) 
test_data = next(test_reader()) with fluid.program_guard(quantized_test_program): @@ -296,14 +313,17 @@ class TestQuantizationFreezePass(unittest.TestCase): fetch_list=[loss, w_var]) # Freeze graph for inference, but the weight of fc/conv is still float type. - freeze_pass = QuantizationFreezePass(scope=scope, place=place) + freeze_pass = QuantizationFreezePass( + scope=scope, place=place, weight_quantize_type=weight_quant_type) + #freeze_pass = QuantizationFreezePass(scope=scope, place=place) freeze_pass.apply(test_graph) if not for_ci: marked_nodes = set() for op in test_graph.all_op_nodes(): if op.name().find('quantize') > -1: marked_nodes.add(op) - test_graph.draw('.', 'test_freeze' + dev_name + quant_type, + test_graph.draw('.', 'test_freeze' + dev_name + + activation_quant_type + '_' + weight_quant_type, marked_nodes) server_program = test_graph.to_program() @@ -313,18 +333,20 @@ class TestQuantizationFreezePass(unittest.TestCase): fetch_list=[loss]) self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3) if not for_ci: - print('{}: {}'.format('test_loss1' + dev_name + quant_type, - test_loss1)) - print('{}: {}'.format('test_loss2' + dev_name + quant_type, - test_loss2)) + print( + '{}: {}'.format('test_loss1' + dev_name + activation_quant_type + + '_' + weight_quant_type, test_loss1)) + print( + '{}: {}'.format('test_loss2' + dev_name + activation_quant_type + + '_' + weight_quant_type, test_loss2)) w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor()) # Maybe failed, this is due to the calculation precision # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant)) if not for_ci: - print('{}: {}'.format('w_freeze' + dev_name + quant_type, - np.sum(w_freeze))) - print('{}: {}'.format('w_quant' + dev_name + quant_type, - np.sum(w_quant))) + print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type + + '_' + weight_quant_type, np.sum(w_freeze))) + print('{}: {}'.format('w_quant' + dev_name + activation_quant_type + + '_' + 
weight_quant_type, np.sum(w_quant))) # Convert parameter to 8-bit. convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place) @@ -334,26 +356,28 @@ class TestQuantizationFreezePass(unittest.TestCase): for op in test_graph.all_op_nodes(): if op.name().find('quantize') > -1: marked_nodes.add(op) - test_graph.draw('.', 'test_int8' + dev_name + quant_type, - marked_nodes) + test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type + + '_' + weight_quant_type, marked_nodes) server_program_int8 = test_graph.to_program() # Save the 8-bit parameter and model file. with fluid.scope_guard(scope): - fluid.io.save_inference_model('server_int8' + dev_name + quant_type, - ['image', 'label'], [loss], exe, - server_program_int8) + fluid.io.save_inference_model( + 'server_int8' + dev_name + activation_quant_type + '_' + + weight_quant_type, ['image', 'label'], [loss], exe, + server_program_int8) # Test whether the 8-bit parameter and model file can be loaded successfully. [infer, feed, fetch] = fluid.io.load_inference_model( - 'server_int8' + dev_name + quant_type, exe) + 'server_int8' + dev_name + activation_quant_type + '_' + + weight_quant_type, exe) # Check the loaded 8-bit weight. 
w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor()) self.assertEqual(w_8bit.dtype, np.int8) self.assertEqual(np.sum(w_8bit), np.sum(w_freeze)) if not for_ci: - print('{}: {}'.format('w_8bit' + dev_name + quant_type, - np.sum(w_8bit))) - print('{}: {}'.format('w_freeze' + dev_name + quant_type, - np.sum(w_freeze))) + print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type + + '_' + weight_quant_type, np.sum(w_8bit))) + print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type + + '_' + weight_quant_type, np.sum(w_freeze))) mobile_pass = TransformForMobilePass() mobile_pass.apply(test_graph) @@ -362,42 +386,103 @@ class TestQuantizationFreezePass(unittest.TestCase): for op in test_graph.all_op_nodes(): if op.name().find('quantize') > -1: marked_nodes.add(op) - test_graph.draw('.', 'test_mobile' + dev_name + quant_type, + test_graph.draw('.', 'test_mobile' + dev_name + + activation_quant_type + '_' + weight_quant_type, marked_nodes) mobile_program = test_graph.to_program() with fluid.scope_guard(scope): - fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type, - ['image', 'label'], [loss], exe, - mobile_program) + fluid.io.save_inference_model( + 'mobile_int8' + dev_name + activation_quant_type + '_' + + weight_quant_type, ['image', 'label'], [loss], exe, + mobile_program) def test_freeze_graph_cuda_dynamic(self): if fluid.core.is_compiled_with_cuda(): with fluid.unique_name.guard(): self.freeze_graph( - True, seed=1, quant_type='abs_max', for_ci=True) + True, + seed=1, + activation_quant_type='abs_max', + weight_quant_type='abs_max', + for_ci=True) + with fluid.unique_name.guard(): + self.freeze_graph( + True, + seed=1, + activation_quant_type='abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) def test_freeze_graph_cpu_dynamic(self): with fluid.unique_name.guard(): - self.freeze_graph(False, seed=2, quant_type='abs_max', for_ci=True) + self.freeze_graph( + False, + seed=2, + 
activation_quant_type='abs_max', + weight_quant_type='abs_max', + for_ci=True) + self.freeze_graph( + False, + seed=2, + activation_quant_type='abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) def test_freeze_graph_cuda_static(self): if fluid.core.is_compiled_with_cuda(): with fluid.unique_name.guard(): self.freeze_graph( - True, seed=1, quant_type='range_abs_max', for_ci=True) + True, + seed=1, + activation_quant_type='range_abs_max', + weight_quant_type='abs_max', + for_ci=True) + self.freeze_graph( + True, + seed=1, + activation_quant_type='moving_average_abs_max', + weight_quant_type='abs_max', + for_ci=True) self.freeze_graph( True, seed=1, - quant_type='moving_average_abs_max', + activation_quant_type='range_abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) + self.freeze_graph( + True, + seed=1, + activation_quant_type='moving_average_abs_max', + weight_quant_type='channel_wise_abs_max', for_ci=True) def test_freeze_graph_cpu_static(self): with fluid.unique_name.guard(): self.freeze_graph( - False, seed=2, quant_type='range_abs_max', for_ci=True) + False, + seed=2, + activation_quant_type='range_abs_max', + weight_quant_type='abs_max', + for_ci=True) + self.freeze_graph( + False, + seed=2, + activation_quant_type='moving_average_abs_max', + weight_quant_type='abs_max', + for_ci=True) + self.freeze_graph( + False, + seed=2, + activation_quant_type='range_abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) self.freeze_graph( - False, seed=2, quant_type='moving_average_abs_max', for_ci=True) + False, + seed=2, + activation_quant_type='moving_average_abs_max', + weight_quant_type='channel_wise_abs_max', + for_ci=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py index b9f938bebed71dc9611df8d743a066858ea38bca..1a046a79415f9edbfde3f7e01d7ab78177a8641b 100644 --- 
a/python/paddle/fluid/contrib/tests/test_calibration.py +++ b/python/paddle/fluid/contrib/tests/test_calibration.py @@ -290,7 +290,7 @@ class TestCalibrationForResnet50(unittest.TestCase): self.model, self.infer_iterations) (int8_throughput, int8_latency, int8_acc1) = self.run_program("calibration_out") - delta_value = np.abs(fp32_acc1 - int8_acc1) + delta_value = fp32_acc1 - int8_acc1 self.assertLess(delta_value, 0.01) print( "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}". diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 556ce71ee585fd24bc983b4fcedc2fbdfb016889..e4169c247f40f1944f98ddd185e55b404bdbf9e3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -644,10 +644,9 @@ class Operator(object): outputs={"Out": [var1]}) """ OP_WITHOUT_KERNEL_SET = { - 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', - 'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select', - 'checkpoint_notify', 'gen_nccl_id' + 'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad', + 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', + 'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id' } def __init__(self, diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py index 7f31ca1b9b70a05d22eca325b38fe2cb5ff15b03..7281b3ea4b961a14126023a14a2ba2f03c7d1387 100644 --- a/python/paddle/fluid/imperative/__init__.py +++ b/python/paddle/fluid/imperative/__init__.py @@ -29,9 +29,13 @@ from .tracer import * from . import profiler from .profiler import * +from . 
import checkpoint +from .checkpoint import * + __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ __all__ += nn.__all__ __all__ += tracer.__all__ __all__ += profiler.__all__ +__all__ += checkpoint.__all__ diff --git a/python/paddle/fluid/imperative/checkpoint.py b/python/paddle/fluid/imperative/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..37c43f29d2ae9214058238e4f834dbbcd9e42df1 --- /dev/null +++ b/python/paddle/fluid/imperative/checkpoint.py @@ -0,0 +1,187 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import collections +from .. import core +from ..framework import Variable, default_main_program + +__all__ = ['save_persistables', 'load_persistables'] + + +def save_persistables(vardict, dirname, filename=None): + """ + This function filters out all variables in layer.parameters from the + give `layer` and then trys to load these variables from the folder + `dirname` or the file `filename`. + + Use the `dirname` to specify the folder where persistable variables were + saved. If variables were saved in separate files, set `filename` None; + if all variables were saved in a single file, use `filename` to specify + the file name. + + Args: + vardict(dict of Parameters): The parameters will + be saved. If it is None, nothing + will be deal. + dirname(str): The directory path. 
+ filename(str|None): The file which saved all variables. If variables were + saved in differnet files, set it to None. + Default: None + + Returns: + + Examples: + .. code-block:: python + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + x_data = x_data.reshape((-1, num_steps, 1)) + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + param_path = "./my_paddle_model" + fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(), dirname=param_path, + layer=ptb_model) + """ + if isinstance(vardict, collections.OrderedDict): + _save_var_to_file(vardict, dirname, filename) + + +def load_persistables(vardict, dirname, filename=None): + """ + This function trys to load persistable variables from the folder + `dirname` or the file `filename`. + + Use the `dirname` to specify the folder where persistable variables were + saved. If variables were saved in separate files, set `filename` None; + if all variables were saved in a single file, use `filename` to specify + the file name. + + Args: + vardict(dict of Parameters): The parameters will be loaded. + dirname(str): The directory path. + filename(str|None): The file which saved all variables, this file path should be end with '.npz'. If variables were + saved in differnet files, set it to None. + Default: None + + Returns: + dict: The parameter-dict resumed from file + + Examples: + .. 
code-block:: python + my_layer = layer(fluid.imperative.Layer) + param_path = "./my_paddle_model" + + param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.parameters(), param_path) + param_1 = param_dict['PtbModel_0.w_1'] + + or: + my_layer = layer(fluid.imperative.Layer) + param_path = "./my_paddle_model" + filename = "model.file" + param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path, + filename=filename) + param_1 = param_dict['PtbModel_0.w_1'] + + """ + if isinstance(vardict, collections.OrderedDict): + return _load_var_from_file(vardict, dirname, filename) + + return {} + + +def _save_var_to_file(stat_dict, file_dir, file_name): + save_block = default_main_program().global_block() + save_var_map = {} + for each_var in stat_dict.items(): + save_var_map[each_var.name] = each_var + if file_name is None: + save_block.append_op( + type='save', + inputs={'X': [each_var]}, + outputs={}, + attrs={'file_path': os.path.join(file_dir, each_var.name)}) + + if file_name is not None: + save_var_list = [] + for name in sorted(save_var_map.keys()): + save_var_list.append(save_var_map[name]) + + save_block.append_op( + type='save_combine', + inputs={'X': save_var_list}, + outputs={}, + attrs={'file_path': os.path.join(file_dir, file_name)}) + + +def _load_var_from_file(stat_dict, file_dir, file_name): + load_block = default_main_program().global_block() + load_var_map = {} + + for each_var in stat_dict.items(): + assert isinstance(each_var, Variable) + if each_var.type == core.VarDesc.VarType.RAW: + continue + new_var = _clone_var_in_block_(load_block, each_var) + if file_name is None: + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [new_var]}, + attrs={'file_path': os.path.join(file_dir, each_var.name)}) + + load_var_map[new_var.name] = new_var + + if file_name is not None: + load_var_list = [] + for name in sorted(load_var_map.keys()): + load_var_list.append(load_var_map[name]) + + 
load_block.append_op( + type='load_combine', + inputs={}, + outputs={"Out": load_var_list}, + attrs={'file_path': os.path.join(file_dir, file_name)}) + for res_var in load_var_list: + load_var_map[res_var.name] = res_var + + return load_var_map + + +def _clone_var_in_block_(block, var): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=True) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 71d169a7dc36d5b2bd90e513f10c179006f89382..a8a1aac8b0c74bcfd57b674d01600672788b016a 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -212,6 +212,34 @@ class Layer(core.Layer): else: object.__delattr__(self, name) + def state_dict(self, destination=None, prefix='', include_sublayers=True): + if destination is None: + destination = collections.OrderedDict() + for name, data in self._parameters.items(): + if data is not None: + destination[prefix + name] = data + + if include_sublayers: + for layer_name, layer_item in self._sub_layers.items(): + if layer_item is not None: + destination_temp = destination.copy() + destination_temp.update( + layer_item.state_dict(destination_temp, prefix + + layer_name + ".", + include_sublayers)) + destination = destination_temp + return destination + + def load_dict(self, stat_dict, include_sublayers=True): + for name, item in self.__dict__.get('_parameters', None).items(): + if item.name in stat_dict: + self.__setattr__(name, stat_dict[item.name]) + + if include_sublayers: + for layer_name, layer_item in self._sub_layers.items(): + if layer_item is not None: + layer_item.load_dict(stat_dict) + class PyLayer(core.PyLayer): """Layers composed of user-defined python codes.""" diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 
1775159798414a98bede4a3db5b577fb5e47e611..326a84d82b5718dad898620a6d9e0490f7519448 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -896,7 +896,7 @@ def save_inference_model(dirname, True is supported. Returns: - None + target_var_name_list(list): The fetch variables' name list Raises: ValueError: If `feed_var_names` is not a list of basestring. @@ -949,11 +949,13 @@ def save_inference_model(dirname, # TODO(Superjomn) add an IR pass to remove 1-scale op. with program_guard(main_program): uniq_target_vars = [] - for var in target_vars: + for i, var in enumerate(target_vars): if isinstance(var, Variable): - var1 = layers.scale(var, 1.) - uniq_target_vars.append(var1) + var = layers.scale( + var, 1., name="save_infer_model/scale_{}".format(i)) + uniq_target_vars.append(var) target_vars = uniq_target_vars + target_var_name_list = [var.name for var in target_vars] # when a pserver and a trainer running on the same machine, mkdir may conflict try: @@ -1010,6 +1012,7 @@ def save_inference_model(dirname, params_filename = os.path.basename(params_filename) save_persistables(executor, dirname, main_program, params_filename) + return target_var_name_list def load_inference_model(dirname, diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index a2a808777ddc499570eb9ef92175787a14cf77ca..31effea3788c2dd1b0dab6f62194d27a2d7ce7e3 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -33,6 +33,7 @@ from .detection import * from . 
import metric_op from .metric_op import * from .learning_rate_scheduler import * +from .collective import * __all__ = [] __all__ += nn.__all__ diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py new file mode 100644 index 0000000000000000000000000000000000000000..a9bce77b9d4ae8d5b08c8c4433e5010f20383cc1 --- /dev/null +++ b/python/paddle/fluid/layers/collective.py @@ -0,0 +1,47 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +from ..layer_helper import LayerHelper, unique_name + + +def _allreduce(x, out=None, reduce_type="sum"): + helper = LayerHelper("allreduce", **locals()) + # Convert string reduce type to op int type + red_typ_int = 0 + if reduce_type == "sum": + red_typ_int = 0 + elif reduce_type == "prod": + red_typ_int = 1 + elif reduce_type == "max": + red_typ_int = 2 + elif reduce_type == "min": + red_typ_int = 3 + else: + raise TypeError("reduce type can only be [sum|prod|max|min]") + + if out is None: + out = helper.create_variable( + name=unique_name.generate(".".join([x.name, 'tmp'])), + shape=x.shape, + dtype=x.dtype, + type=x.type, + persistable=x.persistable, + stop_gradient=True) + helper.append_op( + type='allreduce', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={"reduce_type": red_typ_int}) + return out diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index e7f704515df947f107df6d83a644530a0e468430..3277766171d2d812f5fb0fd81556d7f979f0702f 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -28,21 +28,9 @@ import six from functools import reduce __all__ = [ - 'While', - 'Switch', - 'increment', - 'array_write', - 'create_array', - 'less_than', - 'equal', - 'array_read', - 'array_length', - 'IfElse', - 'DynamicRNN', - 'StaticRNN', - 'reorder_lod_tensor_by_rank', - 'Print', - 'is_empty', + 'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than', + 'equal', 'array_read', 'array_length', 'IfElse', 'DynamicRNN', 'StaticRNN', + 'reorder_lod_tensor_by_rank', 'Print', 'is_empty' ] @@ -1448,12 +1436,13 @@ class DynamicRNN(object): self.input_array = [] self.mem_link = [] - def step_input(self, x): + def step_input(self, x, level=0): """ Mark a sequence as a dynamic RNN input. Args: x(Variable): The input sequence. + level(int): The level of lod used to split steps. Default: 0. 
Returns: The current timestep in the input sequence. @@ -1471,7 +1460,8 @@ class DynamicRNN(object): parent_block.append_op( type='lod_rank_table', inputs={"X": x}, - outputs={"Out": self.lod_rank_table}) + outputs={"Out": self.lod_rank_table}, + attrs={"level": level}) self.max_seq_len = parent_block.create_var( name=unique_name.generate('dynamic_rnn_max_seq_len'), dtype='int64') diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index a458cebfb194a068d040a8919fd4abcb4b4bea80..734383655cf6a85015750ab432c0f6697dd6a9b8 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -174,6 +174,8 @@ def monkey_patch_variable(): ("__rtruediv__", "elementwise_div", True), ("__pow__", "elementwise_pow", False), ("__rpow__", "elementwise_pow", True), + ("__floordiv__", "elementwise_floordiv", False), + ("__mod__", "elementwise_mod", False), # for logical compare ("__eq__", "equal", False), ("__ne__", "not_equal", False), diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index dbe495b75c8876de3c3f0fb0abe0357089413254..e2c8be613fb2b27d33acbcafdabbf4c8a526f5d5 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1348,7 +1348,7 @@ def dropout(x, 1. downgrade_in_infer(default), downgrade the outcome at inference - train: out = input * mask - - inference: out = input * dropout_prob + - inference: out = input * (1.0 - dropout_prob) (mask is a tensor same shape with input, value is 0 or 1 ratio of 0 is dropout_prob) @@ -4901,6 +4901,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): if len(y_shape) > 2 and len(x_shape) > 2: for i, dim_x in enumerate(x_shape[:-2]): + # don't check neg shape + if dim_x < 0 or y_shape[i] < 0: + continue if dim_x != y_shape[i]: raise ValueError("Invalid inputs for matmul. 
x(%s), y(%s)" % (x.shape, y.shape)) @@ -9228,9 +9231,24 @@ def elementwise_pow(x, y, axis=-1, act=None, name=None): return _elementwise_op(LayerHelper('elementwise_pow', **locals())) +def elementwise_mod(x, y, axis=-1, act=None, name=None): + return _elementwise_op(LayerHelper('elementwise_mod', **locals())) + + +def elementwise_floordiv(x, y, axis=-1, act=None, name=None): + return _elementwise_op(LayerHelper('elementwise_floordiv', **locals())) + + for func in [ - elementwise_add, elementwise_div, elementwise_sub, elementwise_mul, - elementwise_max, elementwise_min, elementwise_pow + elementwise_add, + elementwise_div, + elementwise_sub, + elementwise_mul, + elementwise_max, + elementwise_min, + elementwise_pow, + elementwise_mod, + elementwise_floordiv, ]: op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) func.__doc__ = _generate_doc_string_( @@ -9706,7 +9724,12 @@ def sequence_reverse(x, name=None): return out -def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): +def affine_channel(x, + scale=None, + bias=None, + data_layout='NCHW', + name=None, + act=None): """ Applies a separate affine transformation to each channel of the input. Useful for replacing spatial batch norm with its equivalent fixed @@ -9725,6 +9748,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): data_layout (string, default NCHW): NCHW or NHWC. If input is 2D tensor, you can ignore data_layout. name (str, default None): The name of this layer. + act (str, default None): Activation to be applied to the output of this layer. Returns: out (Variable): A tensor of the same shape and data layout with x. 
@@ -9744,7 +9768,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): 'Bias': bias}, attrs={"data_layout": data_layout}, outputs={"Out": out}) - return out + return helper.append_activation(out) def similarity_focus(input, axis, indexes, name=None): diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index cb973986988c2909f5ef1e15dd32db3e83b1d269..a18e5b6a9c3fe69ee0bcadc150f07b72227df85e 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -25,10 +25,26 @@ from .layer_function_generator import templatedoc import numpy __all__ = [ - 'create_tensor', 'create_parameter', 'create_global_var', 'cast', - 'tensor_array_to_tensor', 'concat', 'sums', 'assign', - 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', - 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite' + 'create_tensor', + 'create_parameter', + 'create_global_var', + 'cast', + 'tensor_array_to_tensor', + 'concat', + 'sums', + 'assign', + 'fill_constant_batch_size_like', + 'fill_constant', + 'argmin', + 'argmax', + 'argsort', + 'ones', + 'zeros', + 'reverse', + 'has_inf', + 'has_nan', + 'isfinite', + 'range', ] @@ -764,3 +780,50 @@ def isfinite(x): out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out}) return out + + +def range(start, end, step, dtype): + """ + Return evenly spaced values within a given interval. + + Values are generated within the half-open interval [start, stop) (in other words, + the interval including start but excluding stop). + + args: + start(int|float|Variable): Start of interval. The interval includes this value. + end(int|float|Variable): End of interval. The interval does not include this + value, except in some cases where step is not an integer + and floating point round-off affects the length of out. + step(int|float|Variable): Spacing between values. 
For any output out, this is the + distance between two adjacent values, out[i+1] - out[i]. + The default step size is 1. + dtype(string): 'float32'|'int32'|..., the data type of the output tensor. + + returns: + Evenly spaced values within a given interval. + + examples: + + .. code-block:: python + + data = fluid.layers.range(0, 10, 2, 'int32') + + """ + helper = LayerHelper("range", **locals()) + + if not isinstance(start, Variable): + start = fill_constant([1], dtype, start) + if not isinstance(end, Variable): + end = fill_constant([1], dtype, end) + if not isinstance(step, Variable): + step = fill_constant([1], dtype, step) + + out = helper.create_variable_for_type_inference(dtype=start.dtype) + + helper.append_op( + type='range', + inputs={'Start': start, + 'End': end, + 'Step': step}, + outputs={'Out': [out]}) + return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index d501d02bd41349d57bdd9362bad44056075fb315..8918886a804847bc31ba6a3eeb175194bed7c4ec 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -70,6 +70,10 @@ class Optimizer(object): # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) self.helper = None + self._opti_name_list = [] + + def get_opti_var_name_list(self): + return self._opti_name_list def _create_global_learning_rate(self): lr = self._global_learning_rate() @@ -166,8 +170,13 @@ class Optimizer(object): if shape == None: shape = param.shape assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_" + name + var_name = unique_name.generate(var_name) + self._opti_name_list.append(var_name) + var = self.helper.create_global_variable( - name=unique_name.generate(name), + name=var_name, persistable=True, dtype=dtype or param.dtype, type=param.type, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 
a1cf5fad138f068c9eac5fe8d681c9f08b192270..cefa2b491970c380faeabe43c0cce54c36069eb9 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -70,6 +70,7 @@ list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) +list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl) list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) @@ -95,14 +96,16 @@ if(WITH_DISTRIBUTE) if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) - py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) - set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) + set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) + py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl) + set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000) # FIXME(typhoonzero): add these tests back - # py_test_modules(test_dist_transformer MODULES test_dist_transformer) - # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) + # py_test_modules(test_dist_transformer MODULES test_dist_transformer) + # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) endif(NOT APPLE) - py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) + # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() 
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) @@ -115,8 +118,8 @@ if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") - # change the timeout from 600 to 1200, because in debug mode, this test need more time. - set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200) + # change the timeout from 600 to 2200, because in debug mode, this test need more time. + set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200) endif() if (WITH_NGRAPH) diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py new file mode 100644 index 0000000000000000000000000000000000000000..88a3cd14c43334f2abed9c8b435b64d47a65dc85 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py @@ -0,0 +1,120 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "float32" +paddle.dataset.mnist.fetch() + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + return predict + + +class TestDistMnist2x2(TestDistRunnerBase): + def get_model(self, batch_size=2, single_device=False): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, 
total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + + # Optimization + # TODO(typhoonzero): fix distributed adam optimizer + # opt = fluid.optimizer.AdamOptimizer( + # learning_rate=0.001, beta1=0.9, beta2=0.999) + opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) + if single_device: + opt.minimize(avg_cost) + else: + # multi device or distributed multi device + params_grads = opt.backward(avg_cost) + data_parallel_param_grads = [] + for p, g in params_grads: + # NOTE: scale will be done on loss scale in multi_devices_graph_pass using nranks. + grad_reduce = fluid.layers.collective._allreduce(g) + data_parallel_param_grads.append([p, grad_reduce]) + opt.apply_gradients(data_parallel_param_grads) + + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +if __name__ == "__main__": + runtime_main(TestDistMnist2x2) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0b6556746cd91676d153d862126dd48661fa281d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py @@ -0,0 +1,124 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest + + +class TestConcatOp(OpTest): + def setUp(self): + self.op_type = "concat" + self.use_mkldnn = True + self._cpu_only = True + self.init_axis() + self.init_shape() + self.init_test_data() + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = {'axis': self.axis, 'use_mkldnn': True} + + self.output = np.concatenate( + (self.x0, self.x1, self.x2), axis=self.axis).astype('int') + + self.outputs = {'Out': self.output} + + def test_check_output(self): + self.check_output() + +#--------------------test concat s8 in with axis 0-------------------- + + def init_test_data(self): + self.x0 = (np.random.randint(0, 100, self.x0_shape) - 50).astype('int8') + self.x1 = (np.random.randint(0, 80, self.x1_shape) - 30).astype('int8') + self.x2 = (np.random.randint(0, 110, self.x2_shape) - 80).astype('int8') + + def init_axis(self): + self.axis = 0 + + def init_shape(self): + self.x0_shape = [2, 2, 1, 2] + self.x1_shape = [1, 2, 1, 2] + self.x2_shape = [3, 2, 1, 2] + + +#--------------------test concat u8 in with axis 0-------------------- + + +class TestConcatOp2(TestConcatOp): + def init_test_data(self): + self.x0 = (np.random.randint(0, 100, self.x0_shape)).astype('uint8') + self.x1 = (np.random.randint(0, 50, self.x1_shape)).astype('uint8') + self.x2 = (np.random.randint(0, 80, self.x2_shape)).astype('uint8') + + def init_axis(self): + self.axis = 0 + + def init_shape(self): + self.x0_shape = [2, 1, 5, 5] + self.x1_shape = [1, 1, 5, 5] + self.x2_shape = [3, 1, 5, 5] + + +def create_test_int8_class(parent): + + #--------------------test concat s8/u8 in with axis 1-------------------- + + class TestAxis1Case(parent): + def init_axis(self): + self.axis = 1 + + def init_shape(self): + self.x0_shape = [1, 1, 5, 
5] + self.x1_shape = [1, 2, 5, 5] + self.x2_shape = [1, 3, 5, 5] + +#--------------------test concat s8/u8 in with axis 2-------------------- + + class TestAxis2Case(parent): + def init_axis(self): + self.axis = 2 + + def init_shape(self): + self.x0_shape = [2, 3, 4, 5] + self.x1_shape = [2, 3, 5, 5] + self.x2_shape = [2, 3, 6, 5] + +#--------------------test concat s8/u8 in with axis 3-------------------- + + class TestAxis3Case(parent): + def init_axis(self): + self.axis = 3 + + def init_shape(self): + self.x0_shape = [2, 3, 5, 5] + self.x1_shape = [2, 3, 5, 6] + self.x2_shape = [2, 3, 5, 7] + + cls_name_1 = "{0}_axis_{1}".format(parent.__name__, "1") + cls_name_2 = "{0}_axis_{1}".format(parent.__name__, "2") + cls_name_3 = "{0}_axis_{1}".format(parent.__name__, "3") + TestAxis1Case.__name__ = cls_name_1 + TestAxis2Case.__name__ = cls_name_2 + TestAxis3Case.__name__ = cls_name_3 + globals()[cls_name_1] = TestAxis1Case + globals()[cls_name_2] = TestAxis2Case + globals()[cls_name_3] = TestAxis3Case + +create_test_int8_class(TestConcatOp) +create_test_int8_class(TestConcatOp2) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..86961b8c366c69a210e47ab5d1ece6ba85d1d262 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py @@ -0,0 +1,20 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from paddle.fluid.tests.unittests.test_softmax_with_cross_entropy_op import TestSoftmaxWithCrossEntropyOp, TestSoftmaxWithCrossEntropyOp2, TestSoftmaxWithCrossEntropyOp3 + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index a94487e67dc90d4df935867f841bc567c37c8aa2..61fd9af1275865f2d03e759199c219b36d3a0b5b 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase): use_ir_memory_optimize=True, enable_inplace=True, fuse_elewise_add_act_ops=False, + fuse_all_reduce_ops=False, fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False, @@ -80,6 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize + build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops # python memory optimization is conflict with inplace pass. # Use ir graph memory optimization after inplace pass is the correct way. 
build_strategy.enable_inplace = False if memory_opt else enable_inplace diff --git a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py new file mode 100644 index 0000000000000000000000000000000000000000..fbeff20c63b2f4a3f01ac4131ac7063aff0204cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py @@ -0,0 +1,35 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + + +class TestDistMnistNCCL2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._nccl2_reduce_layer = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_allreduce_op.py", delta=1e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index f4d14d4024923a75ef86cd18179b8bd9eed44913..969f5cb63c9dd2a773be9530abd2a49714202cd1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -33,7 +33,10 @@ DEFAULT_BATCH_SIZE = 2 class TestDistRunnerBase(object): - def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1): + def get_model(self, + batch_size=DEFAULT_BATCH_SIZE, + lr=0.1, + single_device=False): raise NotImplementedError( "get_model should be implemented by child classes.") @@ -76,8 +79,12 @@ class TestDistRunnerBase(object): def run_trainer(self, args): self.lr = args.lr - test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ - self.get_model(batch_size=args.batch_size) + if args.nccl2_reduce_layer_local_run: + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ + self.get_model(batch_size=args.batch_size, single_device=True) + else: + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ + self.get_model(batch_size=args.batch_size) if args.mem_opt: fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) @@ -87,7 +94,7 @@ class TestDistRunnerBase(object): args.endpoints, args.trainers, args.sync_mode, args.dc_asgd) trainer_prog = t.get_trainer_program() - elif args.update_method == "nccl2": + elif 
args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer": # transpile for nccl2 config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" @@ -103,16 +110,17 @@ class TestDistRunnerBase(object): trainer_prog = fluid.default_main_program() if args.use_cuda: - place = fluid.CUDAPlace(0) + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) else: place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - strategy = fluid.ExecutionStrategy() - strategy.num_threads = 1 - strategy.allow_op_delay = False + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.num_threads = 1 + exec_strategy.allow_op_delay = False build_stra = fluid.BuildStrategy() # FIXME force disable enable_inplace and memory_optimize @@ -124,23 +132,25 @@ class TestDistRunnerBase(object): else: build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + pass_builder = None if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") mypass.set("num_repeats", args.batch_merge_repeat) - if args.update_method == "nccl2": + if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer": build_stra.num_trainers = len(args.endpoints.split(",")) build_stra.trainer_id = args.trainer_id else: + # case args.update_method == "nccl2_reduce_layer": build_stra.num_trainers = 1 build_stra.trainer_id = 0 binary = compiler.CompiledProgram(trainer_prog).with_data_parallel( loss_name=avg_cost.name, build_strategy=build_stra, - exec_strategy=strategy) + exec_strategy=exec_strategy) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() @@ -182,7 +192,7 @@ def runtime_main(test_class): '--update_method', type=str, default="local", - choices=["pserver", "nccl2", "local"]) + choices=["pserver", "nccl2", "local", "nccl2_reduce_layer"]) 
parser.add_argument('--trainer_id', type=int, required=False, default=0) parser.add_argument('--trainers', type=int, required=False, default=1) parser.add_argument( @@ -198,6 +208,11 @@ def runtime_main(test_class): parser.add_argument('--lr', required=False, type=float, default=0.001) parser.add_argument( '--batch_merge_repeat', required=False, type=int, default=1) + parser.add_argument( + '--nccl2_reduce_layer_local_run', + required=False, + type=bool, + default=False) args = parser.parse_args() @@ -242,6 +257,12 @@ class TestDistBase(unittest.TestCase): self._dc_asgd = False # must use with async mode self._use_reader_alloc = True self._nccl2_mode = False + self._mp_mode = False + # FIXME(typhoonzero): I added this stupid argument to enable + # testing allreduce layers, which users can call layers.allreduce + # to accumulate tensors at anywhere. Find a better way to do this + # test, reduce check this argument everywhere. + self._nccl2_reduce_layer = False self._lr = 0.001 self._setup_config() self._after_setup_config() @@ -307,10 +328,16 @@ class TestDistBase(unittest.TestCase): cmd += " --batch_size %d" % batch_size if batch_merge_repeat > 1: cmd += " --batch_merge_repeat %d" % batch_merge_repeat + if self._nccl2_reduce_layer: + cmd += " --nccl2_reduce_layer_local_run 1" if self.__use_cuda: cmd += " --use_cuda" - env_local = {"CUDA_VISIBLE_DEVICES": "0"} + env_local = { + "CUDA_VISIBLE_DEVICES": "0", + "PADDLE_TRAINERS_NUM": "1", + "PADDLE_TRAINER_ID": "0" + } else: env_local = {'CPU_NUM': '1'} @@ -427,29 +454,30 @@ class TestDistBase(unittest.TestCase): sys.stderr.write("ps1 stderr: %s\n" % fn.read()) # print log - if stat0 == 0: - sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out)) with open("/tmp/tr0_err.log", "r") as fn: sys.stderr.write('trainer 0 stderr: %s\n' % fn.read()) - if stat1 == 0: - sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) with open("/tmp/tr1_err.log", "r") as fn: sys.stderr.write('trainer 1 stderr: %s\n' 
% fn.read()) return pickle.loads(tr0_out), pickle.loads(tr1_out) - def _run_cluster_nccl2(self, model, envs, check_error_log): + def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer, + check_error_log): # NOTE: we reuse ps_endpoints as nccl2 worker endpoints worker_endpoints = self._ps_endpoints.split(",") w0_ep, w1_ep = worker_endpoints + if nccl2_reduce_layer: + update_method = "nccl2_reduce_layer" + else: + update_method = "nccl2" - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, w0_ep, self._lr) + 0, w0_ep, update_method, self._lr) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, w1_ep, self._lr) + 1, w1_ep, update_method, self._lr) if self._mem_opt: tr0_cmd += " --mem_opt" @@ -463,12 +491,25 @@ class TestDistBase(unittest.TestCase): if self.__use_cuda: tr0_cmd += " --use_cuda" tr1_cmd += " --use_cuda" - env0 = {"CUDA_VISIBLE_DEVICES": "0"} - env1 = {"CUDA_VISIBLE_DEVICES": "1"} + env0 = { + "CUDA_VISIBLE_DEVICES": "0", + # for test nccl2 layer + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ID": "0" + } + env1 = { + "CUDA_VISIBLE_DEVICES": "1", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ID": "1" + } else: env0 = {'CPU_NUM': '1'} env1 = {'CPU_NUM': '1'} + if self._mp_mode: + env0 = {"FLAGS_selected_gpus": "0"} + env1 = {"FLAGS_selected_gpus": "1"} + env0.update(envs) env1.update(envs) @@ -498,8 +539,6 @@ class TestDistBase(unittest.TestCase): # print log sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) - sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out) - sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out) return pickle.loads(tr0_out), pickle.loads(tr1_out) @@ -528,10 +567,14 @@ class 
TestDistBase(unittest.TestCase): local_losses\ = self._run_local(model_file, required_envs, - check_error_log) + check_error_log) if self._nccl2_mode: - tr0_losses, tr1_losses = self._run_cluster_nccl2( - model_file, required_envs, check_error_log) + if self._nccl2_reduce_layer: + tr0_losses, tr1_losses = self._run_cluster_nccl2( + model_file, required_envs, True, check_error_log) + else: + tr0_losses, tr1_losses = self._run_cluster_nccl2( + model_file, required_envs, False, check_error_log) else: tr0_losses, tr1_losses = self._run_cluster( model_file, required_envs, check_error_log) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 49a2ca40e3cb1dd35027345e9c38eb8b6912d2cd..030860ec79233ba6c1482ce635fa6907c1650198 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py new file mode 100644 index 0000000000000000000000000000000000000000..38f7bb80d2f9144800ef8f8fb1402dcf86925067 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py @@ -0,0 +1,63 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase +import os + + +def skip_ci(func): + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + + def __func__(*args, **kwargs): + if on_ci: + return + return func(*args, **kwargs) + + return __func__ + + +class TestDistSeResneXtNCCL(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reader_alloc = False + self._nccl2_mode = True + + @skip_ci + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_se_resnext.py", delta=1e-5) + + +class TestDistSeResneXtNCCLMP(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reader_alloc = False + self._nccl2_mode = True + self._mp_mode = True + + @skip_ci + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "dist_se_resnext.py", + delta=1e-5, + need_envs={"NCCL_P2P_DISABLE": "1"}) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py new file mode 100644 index 0000000000000000000000000000000000000000..104e896b6e440f5657a90e0ce741b49f72ba75c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py @@ -0,0 +1,69 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest + +import random + + +class TestElementwiseModOp(OpTest): + def init_kernel_type(self): + self.use_mkldnn = False + + def setUp(self): + self.op_type = "elementwise_floordiv" + self.dtype = np.int32 + self.axis = -1 + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def test_check_output(self): + self.check_output() + + def init_input_output(self): + self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype) + self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + def init_dtype(self): + pass + + def init_axis(self): + pass + + +class TestElementwiseModOp_scalar(TestElementwiseModOp): + def init_input_output(self): + scale_x = random.randint(0, 100000000) + scale_y = random.randint(1, 100000000) + self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype) + self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py new file mode 100644 index 0000000000000000000000000000000000000000..a354ba0177ae70ba4f3a1565360f96a55edd33b6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py @@ -0,0 +1,69 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest + +import random + + +class TestElementwiseModOp(OpTest): + def init_kernel_type(self): + self.use_mkldnn = False + + def setUp(self): + self.op_type = "elementwise_mod" + self.dtype = np.int32 + self.axis = -1 + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def test_check_output(self): + self.check_output() + + def init_input_output(self): + self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype) + self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) + self.out = np.mod(self.x, self.y) + + def init_dtype(self): + pass + + def init_axis(self): + pass + + +class 
TestElementwiseModOp_scalar(TestElementwiseModOp): + def init_input_output(self): + scale_x = random.randint(0, 100000000) + scale_y = random.randint(1, 100000000) + self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype) + self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype) + self.out = np.mod(self.x, self.y) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 32cb23cbfa9bdef4728e85d0014123652e4aefea..0812b02b47db7fa2d43e1d3bbd0a3f7b59911326 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -31,15 +31,27 @@ def dequantize_max_abs(x, scale, max_range): return y -def channel_wise_quantize_max_abs(x, quant_bit=8): +def channel_wise_quantize_max_abs(x, quant_bit=8, use_second_dim=False): scales = [] - for i in range(x.shape[0]): - scales.append(np.max(np.abs(x[i])).astype("float32")) - - y = x.copy() - max_range = math.pow(2, quant_bit - 1) - 1 - for i, scale in enumerate(scales): - y[i] = np.round(y[i] / scale * max_range) + if not use_second_dim: + for i in range(x.shape[0]): + scales.append(np.max(np.abs(x[i])).astype("float32")) + y = x.copy() + max_range = math.pow(2, quant_bit - 1) - 1 + for i, scale in enumerate(scales): + y[i] = np.round(x[i] / scale * max_range) + else: + for i in range(x.shape[0]): + s = [] + for j in range(x.shape[1]): + s.append(np.max(np.abs(x[i][j])).astype("float32")) + scales.append(s) + scales = np.amax(np.array(scales), axis=0) + y = x.copy() + max_range = math.pow(2, quant_bit - 1) - 1 + for i in range(x.shape[0]): + for j, scale in enumerate(scales): + y[i][j] = np.round(x[i][j] / scale * max_range) return y, scales @@ -47,10 +59,16 @@ def channel_wise_dequantize_max_abs(x, scales, quant_bits, activation_scale=None): - y = x.copy() - for i in range(x.shape[0]): - y[i] = 
(scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i] - if activation_scale is not None: + if activation_scale is None: + y = x.copy() + for i in range(x.shape[0]): + y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * x[i] + else: + y = x.copy() + for i in range(x.shape[0]): + for j in range(x.shape[1]): + y[i][j] = (scales[j] / + (math.pow(2, quant_bits[0] - 1) - 1)) * x[i][j] y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1) return y @@ -65,7 +83,8 @@ class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest): self.set_args() self.op_type = "fake_channel_wise_dequantize_max_abs" x = np.random.randn(4, 3, 64, 64).astype(self.data_type) - yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0]) + yq, scales = channel_wise_quantize_max_abs( + x, self.quant_bits[0], use_second_dim=True) ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits, self.activation_scale) diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index cf8f01edb9a6a2b6d91080248553491c54e7707b..07038b0441d0dc37a42cbf2058c1b5f41b47a5da 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -53,7 +53,7 @@ class TestFakeChannelWiseQuantizeOp(OpTest): self.outputs = { 'Out': outputs, - 'OutScales': np.array(scales).astype("float32"), + 'OutScale': np.array(scales).astype("float32"), } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8669bbc6f3ea7b3f3340793712a221b0bf8c6a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py @@ -0,0 +1,121 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + + +def simple_fc_net(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(2): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + 
os.environ['CPU_NUM'] = str(4) + + def _init_data(self, random=True): + np.random.seed(5) + if random: + img = np.random.random(size=[32, 784]).astype(np.float32) + else: + img = np.ones(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_fuse_all_reduce_ops(self, model, use_cuda, random_data=True): + if use_cuda and not core.is_compiled_with_cuda(): + return + img, label = self._init_data(random_data) + + def _optimizer(learning_rate=1e-6): + optimizer = fluid.optimizer.SGD( + learning_rate=learning_rate, + regularization=fluid.regularizer.L2Decay(1e-6)) + return optimizer + + not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_all_reduce_ops=False, + memory_opt=False, + optimizer=_optimizer) + fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_all_reduce_ops=True, + memory_opt=False, + optimizer=_optimizer) + + for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_with_fuse_op(self): + self._compare_fuse_all_reduce_ops(simple_fc_net, True) + self._compare_fuse_all_reduce_ops(simple_fc_net, False) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fuse_all_reduce_ops(fc_with_batchnorm, True) + self._compare_fuse_all_reduce_ops(fc_with_batchnorm, False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..62c25f734598e35b7c668d1ec1b89b5c57449f73 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py @@ -0,0 +1,163 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.imperative.base import to_variable + + +class SimpleImgConvPool(fluid.imperative.Layer): + def __init__(self, + name_scope, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__(name_scope) + + self._conv2d = Conv2D( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + self.full_name(), + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) + + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x + + +class MNIST(fluid.imperative.Layer): + def 
__init__(self, name_scope): + super(MNIST, self).__init__(name_scope) + + self._simple_img_conv_pool_1 = SimpleImgConvPool( + self.full_name(), 1, 20, 5, 2, 2, act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + self.full_name(), 20, 50, 5, 2, 2, act="relu") + + pool_2_shape = 50 * 4 * 4 + SIZE = 10 + scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 + self._fc = FC(self.full_name(), + 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") + + def forward(self, inputs): + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) + x = self._fc(x) + return x + + +class TestImperativeCheckpoint(unittest.TestCase): + def save_load_persistables(self): + seed = 90 + epoch_num = 1 + + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mnist = MNIST("mnist") + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + dy_param_init_value = {} + + step = 0 + for epoch in range(epoch_num): + for batch_id, data in enumerate(train_reader()): + dy_x_data = np.array( + [x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(128, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + + cost = mnist(img) + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + + dy_out = avg_loss._numpy() + + avg_loss._backward() + sgd.minimize(avg_loss) + fluid.imperative.save_persistables(mnist, "save_dir") + mnist.clear_gradients() + + for param in mnist.parameters(): + dy_param_init_value[param.name] = param._numpy() + + mnist.load_dict( + fluid.imperative.load_persistables(mnist, "save_dir")) + + restore = mnist.parameters() + + 
self.assertEqual(len(dy_param_init_value), len(restore)) + for value in restore: + self.assertTrue( + np.allclose(value, dy_param_init_value[value.name])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + + step += 1 + + if step > 20: + break + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py new file mode 100644 index 0000000000000000000000000000000000000000..af80ca6ce77a4ec187dd52863c2fe2ba278d5023 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -0,0 +1,196 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import random +import sys + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from test_imperative_base import new_program_scope +from paddle.fluid.imperative.base import to_variable + +NUM_USERS = 100 +NUM_ITEMS = 1000 + +BATCH_SIZE = 32 +NUM_BATCHES = 2 + + +class MLP(fluid.imperative.Layer): + def __init__(self, name_scope): + super(MLP, self).__init__(name_scope) + self._user_latent = fluid.imperative.FC(self.full_name(), 256) + self._item_latent = fluid.imperative.FC(self.full_name(), 256) + + self._user_layers = [] + self._item_layers = [] + self._hid_sizes = [128, 64] + for i in range(len(self._hid_sizes)): + self._user_layers.append( + self.add_sublayer( + 'user_layer_%d' % i, + fluid.imperative.FC( + self.full_name(), self._hid_sizes[i], act='relu'))) + self._item_layers.append( + self.add_sublayer( + 'item_layer_%d' % i, + fluid.imperative.FC( + self.full_name(), self._hid_sizes[i], act='relu'))) + + def forward(self, users, items): + users = self._user_latent(users) + items = self._item_latent(items) + + for ul, il in zip(self._user_layers, self._item_layers): + users = ul(users) + items = il(items) + return fluid.layers.elementwise_mul(users, items) + + +class DMF(fluid.imperative.Layer): + def __init__(self, name_scope): + super(DMF, self).__init__(name_scope) + self._user_latent = fluid.imperative.FC(self.full_name(), 256) + self._item_latent = fluid.imperative.FC(self.full_name(), 256) + self._match_layers = [] + self._hid_sizes = [128, 64] + for i in range(len(self._hid_sizes)): + self._match_layers.append( + self.add_sublayer( + 'match_layer_%d' % i, + fluid.imperative.FC( + self.full_name(), self._hid_sizes[i], act='relu'))) + self._mat + + def forward(self, users, items): + users = self._user_latent(users) + items = self._item_latent(items) + match_vec = fluid.layers.concat( + [users, items], axis=len(users.shape) - 1) + for l in self._match_layers: + match_vec = l(match_vec) + 
return match_vec + + +class DeepCF(fluid.imperative.Layer): + def __init__(self, name_scope): + super(DeepCF, self).__init__(name_scope) + + self._user_emb = fluid.imperative.Embedding(self.full_name(), + [NUM_USERS, 256]) + self._item_emb = fluid.imperative.Embedding(self.full_name(), + [NUM_ITEMS, 256]) + + self._mlp = MLP(self.full_name()) + self._dmf = DMF(self.full_name()) + self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid') + + def forward(self, users, items): + users_emb = self._user_emb(users) + items_emb = self._item_emb(items) + + mlp_predictive = self._mlp(users_emb, items_emb) + dmf_predictive = self._dmf(users_emb, items_emb) + predictive = fluid.layers.concat( + [mlp_predictive, dmf_predictive], + axis=len(mlp_predictive.shape) - 1) + prediction = self._match_fc(predictive) + return prediction + + +def get_data(): + user_ids = [] + item_ids = [] + labels = [] + for uid in range(NUM_USERS): + for iid in range(NUM_ITEMS): + # 10% positive + label = float(random.randint(1, 10) == 1) + user_ids.append(uid) + item_ids.append(iid) + labels.append(label) + indices = np.arange(NUM_USERS * NUM_ITEMS) + np.random.shuffle(indices) + users_np = np.array(user_ids, dtype=np.int64)[indices] + items_np = np.array(item_ids, dtype=np.int64)[indices] + labels_np = np.array(labels, dtype=np.float32)[indices] + return np.expand_dims(users_np, -1), \ + np.expand_dims(items_np, -1), \ + np.expand_dims(labels_np, -1) + + +class TestImperativeDeepCF(unittest.TestCase): + def test_gan_float32(self): + seed = 90 + users_np, items_np, labels_np = get_data() + + startup = fluid.Program() + startup.random_seed = seed + main = fluid.Program() + main.random_seed = seed + + scope = fluid.core.Scope() + with new_program_scope(main=main, startup=startup, scope=scope): + users = fluid.layers.data('users', [1], dtype='int64') + items = fluid.layers.data('items', [1], dtype='int64') + labels = fluid.layers.data('labels', [1], dtype='float32') + + deepcf = 
DeepCF('deepcf') + prediction = deepcf(users, items) + loss = fluid.layers.reduce_sum( + fluid.layers.log_loss(prediction, labels)) + adam = fluid.optimizer.AdamOptimizer(0.01) + adam.minimize(loss) + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + exe.run(startup) + for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): + static_loss = exe.run( + main, + feed={ + users.name: users_np[slice:slice + BATCH_SIZE], + items.name: items_np[slice:slice + BATCH_SIZE], + labels.name: labels_np[slice:slice + BATCH_SIZE] + }, + fetch_list=[loss])[0] + sys.stderr.write('static loss %s\n' % static_loss) + + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + deepcf = DeepCF('deepcf') + for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): + prediction = deepcf( + to_variable(users_np[slice:slice + BATCH_SIZE]), + to_variable(items_np[slice:slice + BATCH_SIZE])) + loss = fluid.layers.reduce_sum( + fluid.layers.log_loss(prediction, + to_variable(labels_np[slice:slice + + BATCH_SIZE]))) + loss._backward() + adam = fluid.optimizer.AdamOptimizer(0.01) + adam.minimize(loss) + deepcf.clear_gradients() + dy_loss = loss._numpy() + + self.assertEqual(static_loss, dy_loss) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index a80202d6dddacaa4cb6fa3efd3c3dfd5b0ab4400..6024fb5f816d10cedad36272e353704797526676 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer): return self._fc3(x) -class TestImperativeMnist(unittest.TestCase): +class TestImperativeGAN(unittest.TestCase): def test_gan_float32(self): seed = 90 diff --git 
a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 885ee170e8032ef865ebfdd646fed1e995e9e60b..1672c3600f389d87e85f965f96122065137cf0ac 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1240,6 +1240,14 @@ class TestBook(unittest.TestCase): print(str(program)) + def test_range(self): + program = Program() + with program_guard(program): + layers.range(0, 10, 2, 'int32') + layers.range(0.1, 10.0, 0.2, 'float32') + + print(str(program)) + def test_spectral_norm(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 9548598d75367ed1f1863d1f6ae50b83d58f8c7f..1f23fae92c9d8148efb25facb602cdc4d485865b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -13,6 +13,9 @@ # limitations under the License. 
from __future__ import print_function +import os +os.environ['FLAGS_fuse_parameter_memory_size'] = "131072" +os.environ['FLAGS_fuse_parameter_groups_size'] = "3" import paddle.fluid as fluid import paddle.fluid.layers.ops as ops @@ -22,7 +25,6 @@ import paddle.fluid.core as core from parallel_executor_test_base import TestParallelExecutorBase import unittest import math -import os import numpy as np # FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor @@ -312,17 +314,59 @@ class TestResnet(TestParallelExecutorBase): self.assertAlmostEquals( np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) + def _compare_with_fused_all_reduce(self, + model, + use_cuda, + iter=20, + delta2=1e-5): + if use_cuda and not core.is_compiled_with_cuda(): + return + + global remove_bn + remove_bn = True + + img, label = self._init_data(batch_size=batch_size) + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + fuse_all_reduce_ops=False, + optimizer=optimizer) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + fuse_all_reduce_ops=True, + optimizer=optimizer) + + for loss in zip(all_reduce_first_loss, reduce_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) + for loss in zip(all_reduce_last_loss, reduce_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + def test_seresnext_with_learning_rate_decay(self): self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True) self._check_resnet_convergence( model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) - def test_seresnext_with_new_strategy(self): + def test_seresnext_with_reduce(self): self._compare_reduce_and_allreduce( model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2) 
self._compare_reduce_and_allreduce( model=SE_ResNeXt50Small, use_cuda=False, iter=5) + def test_seresnext_with_fused_all_reduce(self): + self._compare_with_fused_all_reduce( + model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3) + self._compare_with_fused_all_reduce( + model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_range.py b/python/paddle/fluid/tests/unittests/test_range.py new file mode 100644 index 0000000000000000000000000000000000000000..f129ae78cbf7e2ccd5d974de265b8e95d1391df8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_range.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestRangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + 'Start': np.array([self.case[0]]).astype(self.dtype), + 'End': np.array([self.case[1]]).astype(self.dtype), + 'Step': np.array([self.case[2]]).astype(self.dtype) + } + + self.outputs = { + 'Out': np.arange(self.case[0], self.case[1], + self.case[2]).astype(self.dtype) + } + + def init_config(self): + self.dtype = np.float32 + self.case = (0, 1, 0.2) + + def test_check_output(self): + self.check_output() + + +class TestFloatRangeOpCase0(TestRangeOp): + def init_config(self): + self.dtype = np.float32 + self.case = (0, 5, 1) + + +class TestInt32RangeOpCase0(TestRangeOp): + def init_config(self): + self.dtype = np.int32 + self.case = (0, 5, 2) + + +class TestInt32RangeOpCase1(TestRangeOp): + def init_config(self): + self.dtype = np.int32 + self.case = (10, 1, -2) + + +class TestInt32RangeOpCase2(TestRangeOp): + def init_config(self): + self.dtype = np.int32 + self.case = (-1, -10, -2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/requirements.txt b/python/requirements.txt index 36bd5d4261cc7aa78d26b8c8ddfd87abd4f4e2e2..ce56462fac9c69df79c3c542202d21c0c67a91b8 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -12,3 +12,4 @@ six funcsigs pyyaml decorator +prettytable diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index c2fd743f62f536ab7443ca215d100478021d8f7c..c37a9a92e654e2d0c7d1b3decca0a34a3f34863b 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -52,7 +52,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ 
LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python -RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \ +RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]