From 00f971c199ee31b253e74d6c5a8c5c9f09882b20 Mon Sep 17 00:00:00 2001
From: Bai Yifan
Date: Fri, 10 Jan 2020 22:22:02 +0800
Subject: [PATCH] remove return value in dist.merge() (#33)

---
 demo/distillation/distillation_demo.py   | 22 +++++-------
 docs/docs/api/single_distiller_api.md    | 46 ++++++++++++------------
 docs/docs/tutorials/distillation_demo.md |  2 +-
 paddleslim/dist/single_distiller.py      | 14 ++++----
 4 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/demo/distillation/distillation_demo.py b/demo/distillation/distillation_demo.py
index 3f47553e..b3467e48 100644
--- a/demo/distillation/distillation_demo.py
+++ b/demo/distillation/distillation_demo.py
@@ -150,7 +150,9 @@ def compress(args):
     # print(v.name, v.shape)
     exe.run(t_startup)
-    _download('http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar', '.')
+    _download(
+        'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar',
+        '.')
     _decompress('./ResNet50_pretrained.tar')
     assert args.teacher_pretrained_model and os.path.exists(
         args.teacher_pretrained_model
     )
@@ -168,21 +170,17 @@ def compress(args):
         predicate=if_exist)

     data_name_map = {'image': 'image'}
-    main = merge(
-        teacher_program,
-        student_program,
-        data_name_map,
-        place)
-
-    with fluid.program_guard(main, s_startup):
-        l2_loss = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", main)
+    merge(teacher_program, student_program, data_name_map, place)
+
+    with fluid.program_guard(student_program, s_startup):
+        l2_loss = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", student_program)
         loss = avg_cost + l2_loss
         opt = create_optimizer(args)
         opt.minimize(loss)
     exe.run(s_startup)
     build_strategy = fluid.BuildStrategy()
     build_strategy.fuse_all_reduce_ops = False
-    parallel_main = fluid.CompiledProgram(main).with_data_parallel(
+    parallel_main = fluid.CompiledProgram(student_program).with_data_parallel(
         loss_name=loss.name, build_strategy=build_strategy)

     for epoch_id in range(args.num_epochs):
@@ -190,9 +188,7 @@ def compress(args):
             loss_1, loss_2, loss_3 = exe.run(
                 parallel_main,
                 feed=data,
-                fetch_list=[
-                    loss.name, avg_cost.name, l2_loss.name
-                ])
+                fetch_list=[loss.name, avg_cost.name, l2_loss.name])
             if step_id % args.log_period == 0:
                 _logger.info(
                     "train_epoch {} step {} loss {:.6f}, class loss {:.6f}, l2 loss {:.6f}".
diff --git a/docs/docs/api/single_distiller_api.md b/docs/docs/api/single_distiller_api.md
index c3685f7a..e6d382d6 100644
--- a/docs/docs/api/single_distiller_api.md
+++ b/docs/docs/api/single_distiller_api.md
@@ -1,7 +1,7 @@
 ## merge
 
-paddleslim.dist.merge(teacher_program, student_program, data_name_map, place, scope=fluid.global_scope(), name_prefix='teacher_') [[source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L19)
+paddleslim.dist.merge(teacher_program, student_program, data_name_map, place, scope=fluid.global_scope(), name_prefix='teacher_') [[source]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L19)
 
-: merge fuses two paddle programs (teacher_program, student_program) into a single program and returns the merged program. In the merged program, distillation loss functions can be added between suitable teacher and student feature maps, so that the teacher model's dark knowledge (Dark Knowledge) guides the learning of the student model.
+: merge fuses teacher_program into student_program. In the merged program, distillation loss functions can be added between suitable teacher and student feature maps, so that the teacher model's dark knowledge (Dark Knowledge) guides the learning of the student model.
 
 **Parameters:**
@@ -12,7 +12,7 @@ paddleslim.dist.merge(teacher_program, student_program, data_name_map, place, sc
 - **scope**(Scope) - The variable scope used by the program. If not specified, the default global scope is used. Default: [*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/global_scope_cn.html#global-scope)
 - **name_prefix**(str) - The name prefix name_prefix that the merge operation uniformly adds to the teacher's [*Variables*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_guides/low_level/program.html#variable). Default: 'teacher_'
 
-**Returns:** The program obtained by merging student_program and teacher_program
+**Returns:** None
 
 !!! note "Note"
     *data_name_map* is a mapping **from teacher_var names to student_var names**; if it is written the other way around, the merge may not be performed correctly.
 
 **Example:**
 
 ```python
 import paddle.fluid as fluid
 import paddleslim.dist as dist
 student_program = fluid.Program()
 with fluid.program_guard(student_program):
     x = fluid.layers.data(name='x', shape=[1, 28, 28])
     conv = fluid.layers.conv2d(x, 32, 1)
     out = fluid.layers.conv2d(conv, 64, 3, padding=1)
 teacher_program = fluid.Program()
 with fluid.program_guard(teacher_program):
     y = fluid.layers.data(name='y', shape=[1, 28, 28])
     conv = fluid.layers.conv2d(y, 32, 1)
     conv = fluid.layers.conv2d(conv, 32, 1)
     out = fluid.layers.conv2d(conv, 64, 3, padding=1)
 data_name_map = {'y':'x'}
 USE_GPU = False
 place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = dist.merge(teacher_program, student_program,
-                          data_name_map, place)
+dist.merge(teacher_program, student_program,
+           data_name_map, place)
 ```
@@ -76,10 +76,10 @@ with fluid.program_guard(teacher_program):
 data_name_map = {'y':'x'}
 USE_GPU = False
 place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
+merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(student_program):
     distillation_loss = dist.fsp_loss('teacher_t1.tmp_1', 'teacher_t2.tmp_1',
-                                      's1.tmp_1', 's2.tmp_1', main_program)
+                                      's1.tmp_1', 's2.tmp_1', student_program)
 ```
 
 ## l2_loss
@@ -91,7 +91,7 @@ paddleslim.dist.l2_loss(teacher_var_name, student_var_name, program=fluid.defaul
 
 **Parameters:**
 
-- **teacher_var_name**(str): Name of teacher_var.
+- **teacher_var_name**(str): Name of teacher_var.
 - **student_var_name**(str): Name of student_var.
 - **program**(Program): The fluid program used for distillation training. Default: [*fluid.default_main_program()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_cn/fluid_cn.html#default-main-program)
@@ -116,10 +116,10 @@ with fluid.program_guard(teacher_program):
 data_name_map = {'y':'x'}
 USE_GPU = False
 place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
+merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(student_program):
     distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
-                                     main_program)
+                                     student_program)
 ```
 
 ## soft_label_loss
@@ -131,11 +131,11 @@ paddleslim.dist.soft_label_loss(teacher_var_name, student_var_name, program=flui
 
 **Parameters:**
 
-- **teacher_var_name**(str): Name of teacher_var.
-- **student_var_name**(str): Name of student_var.
+- **teacher_var_name**(str): Name of teacher_var.
+- **student_var_name**(str): Name of student_var.
 - **program**(Program): The fluid program used for distillation training. Default: [*fluid.default_main_program()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_cn/fluid_cn.html#default-main-program)
-- **teacher_temperature**(float): Temperature used to soften teacher_var; the higher the temperature, the smoother the resulting feature map
-- **student_temperature**(float): Temperature used to soften student_var; the higher the temperature, the smoother the resulting feature map
+- **teacher_temperature**(float): Temperature used to soften teacher_var; the higher the temperature, the smoother the resulting feature map
+- **student_temperature**(float): Temperature used to soften student_var; the higher the temperature, the smoother the resulting feature map
 
 **Returns:** The soft_label_loss obtained by combining teacher_var and student_var
@@ -158,10 +158,10 @@ with fluid.program_guard(teacher_program):
 data_name_map = {'y':'x'}
 USE_GPU = False
 place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
+merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(student_program):
     distillation_loss = dist.soft_label_loss('teacher_t2.tmp_1',
-                                             's2.tmp_1', main_program, 1., 1.)
+                                             's2.tmp_1', student_program, 1., 1.)
 ```
 
 ## loss
@@ -173,7 +173,7 @@ paddleslim.dist.loss(loss_func, program=fluid.default_main_program(), **kwargs)
 
 **Parameters:**
 
-- **loss_func**(python function): A custom loss function whose inputs are a teacher var and a student var and whose output is the custom loss
+- **loss_func**(python function): A custom loss function whose inputs are a teacher var and a student var and whose output is the custom loss
 - **program**(Program): The fluid program used for distillation training. Default: [*fluid.default_main_program()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_cn/fluid_cn.html#default-main-program)
 - **\**kwargs**: Mapping from loss_func input names to the corresponding variable names
@@ -198,15 +198,15 @@ with fluid.program_guard(teacher_program):
 data_name_map = {'y':'x'}
 USE_GPU = False
 place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = merge(teacher_program, student_program, data_name_map, place)
+merge(teacher_program, student_program, data_name_map, place)
 def adaptation_loss(t_var, s_var):
     teacher_channel = t_var.shape[1]
     s_hint = fluid.layers.conv2d(s_var, teacher_channel, 1)
     hint_loss = fluid.layers.reduce_mean(fluid.layers.square(s_hint - t_var))
     return hint_loss
-with fluid.program_guard(main_program):
-    distillation_loss = dist.loss(main_program, adaptation_loss,
-                                  t_var='teacher_t2.tmp_1', s_var='s2.tmp_1')
+with fluid.program_guard(student_program):
+    distillation_loss = dist.loss(adaptation_loss, student_program,
+                                  t_var='teacher_t2.tmp_1', s_var='s2.tmp_1')
 ```
 
 !!! note "Notes"

diff --git a/docs/docs/tutorials/distillation_demo.md b/docs/docs/tutorials/distillation_demo.md
index c565eee7..7f8d7eb8 100644
--- a/docs/docs/tutorials/distillation_demo.md
+++ b/docs/docs/tutorials/distillation_demo.md
@@ -86,7 +86,7 @@ The merge step involves many operations; for details, see the [merge API docs](https://paddlep
 ```python
 data_name_map = {'data': 'image'}
-student_program = merge(teacher_program, student_program, data_name_map, place)
+merge(teacher_program, student_program, data_name_map, place)
 ```
 
 ### 5. Add the distillation loss

diff --git a/paddleslim/dist/single_distiller.py b/paddleslim/dist/single_distiller.py
index 8f5dcaeb..5e04134d 100644
--- a/paddleslim/dist/single_distiller.py
+++ b/paddleslim/dist/single_distiller.py
@@ -34,7 +34,6 @@ def merge(teacher_program,
                        paddle run on which device.
         scope(Scope): The input scope
         name_prefix(str): Name prefix added for all vars of the teacher program.
-        Return(Program): Merged program.
     """
     teacher_program = teacher_program.clone(for_test=True)
     for teacher_var in teacher_program.list_vars():
@@ -51,7 +50,7 @@ def merge(teacher_program,
             old_var = scope.var(teacher_var.name).get_tensor()
             renamed_var = scope.var(new_name).get_tensor()
             renamed_var.set(np.array(old_var), place)
-
+
             # program var rename
             renamed_var = teacher_program.global_block()._rename_var(
                 teacher_var.name, new_name)
@@ -84,11 +83,13 @@ def merge(teacher_program,
                 attrs[attr_name] = op.attr(attr_name)
             student_program.global_block().append_op(
                 type=op.type, inputs=inputs, outputs=outputs, attrs=attrs)
-    return student_program


-def fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name,
-             student_var2_name, program=fluid.default_main_program()):
+def fsp_loss(teacher_var1_name,
+             teacher_var2_name,
+             student_var1_name,
+             student_var2_name,
+             program=fluid.default_main_program()):
     """
     Combine variables from student model and teacher model by fsp-loss.
     Args:
@@ -115,7 +116,8 @@ def fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name,
     return fsp_loss


-def l2_loss(teacher_var_name, student_var_name,
+def l2_loss(teacher_var_name,
+            student_var_name,
             program=fluid.default_main_program()):
     """
     Combine variables from student model and teacher model by l2-loss.
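For context, a minimal caller-side sketch of the migration this patch implies. This is a sketch assuming the paddle.fluid 1.x API used throughout the patch; the variable names passed to l2_loss ('teacher_t2.tmp_1', 's2.tmp_1') are illustrative placeholders taken from the docs above, not names guaranteed to exist in these toy programs.

```python
import paddle.fluid as fluid
import paddleslim.dist as dist

# Build toy student/teacher programs (mirrors the docs' example above).
student_program = fluid.Program()
with fluid.program_guard(student_program):
    x = fluid.layers.data(name='x', shape=[1, 28, 28])
    out = fluid.layers.conv2d(fluid.layers.conv2d(x, 32, 1), 64, 3, padding=1)

teacher_program = fluid.Program()
with fluid.program_guard(teacher_program):
    y = fluid.layers.data(name='y', shape=[1, 28, 28])
    out = fluid.layers.conv2d(fluid.layers.conv2d(y, 32, 1), 64, 3, padding=1)

data_name_map = {'y': 'x'}  # teacher input name -> student input name
place = fluid.CPUPlace()

# Before this patch:
#     main = dist.merge(teacher_program, student_program, data_name_map, place)
# After this patch, merge() returns None: teacher vars/ops are copied into
# student_program in place, with teacher var names prefixed by 'teacher_'.
dist.merge(teacher_program, student_program, data_name_map, place)

# Distillation losses are therefore added directly to student_program.
with fluid.program_guard(student_program):
    # Placeholder var names; use the actual teacher/student var names here.
    distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
                                     student_program)
```

The same pattern applies to fsp_loss, soft_label_loss, and loss, as the doc changes above show.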