Commit 00f971c1 authored by Bai Yifan, committed by whs

remove return value in dist.merge() (#33)

Parent 74fb067f
@@ -150,7 +150,9 @@ def compress(args):
     # print(v.name, v.shape)
     exe.run(t_startup)
-    _download('http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar', '.')
+    _download(
+        'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar',
+        '.')
     _decompress('./ResNet50_pretrained.tar')
     assert args.teacher_pretrained_model and os.path.exists(
         args.teacher_pretrained_model
@@ -168,21 +170,17 @@ def compress(args):
         predicate=if_exist)
     data_name_map = {'image': 'image'}
-    main = merge(
-        teacher_program,
-        student_program,
-        data_name_map,
-        place)
-    with fluid.program_guard(main, s_startup):
-        l2_loss = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", main)
+    merge(teacher_program, student_program, data_name_map, place)
+    with fluid.program_guard(student_program, s_startup):
+        l2_loss = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", student_program)
     loss = avg_cost + l2_loss
     opt = create_optimizer(args)
     opt.minimize(loss)
     exe.run(s_startup)
     build_strategy = fluid.BuildStrategy()
     build_strategy.fuse_all_reduce_ops = False
-    parallel_main = fluid.CompiledProgram(main).with_data_parallel(
+    parallel_main = fluid.CompiledProgram(student_program).with_data_parallel(
         loss_name=loss.name, build_strategy=build_strategy)
     for epoch_id in range(args.num_epochs):
@@ -190,9 +188,7 @@ def compress(args):
             loss_1, loss_2, loss_3 = exe.run(
                 parallel_main,
                 feed=data,
-                fetch_list=[
-                    loss.name, avg_cost.name, l2_loss.name
-                ])
+                fetch_list=[loss.name, avg_cost.name, l2_loss.name])
             if step_id % args.log_period == 0:
                 _logger.info(
                     "train_epoch {} step {} loss {:.6f}, class loss {:.6f}, l2 loss {:.6f}".
......
## merge

paddleslim.dist.merge(teacher_program, student_program, data_name_map, place, scope=fluid.global_scope(), name_prefix='teacher_') [[source code]](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/dist/single_distiller.py#L19)

-: merge fuses the two Paddle programs (teacher_program, student_program) into one program and returns the merged program. In the merged program, distillation loss functions can be added between suitable teacher and student feature maps, so that the Dark Knowledge of the teacher model guides the learning of the student model.
+: merge fuses teacher_program into student_program. In the merged program, distillation loss functions can be added between suitable teacher and student feature maps, so that the Dark Knowledge of the teacher model guides the learning of the student model.

**Parameters:**
@@ -12,7 +12,7 @@ paddleslim.dist.merge(teacher_program, student_program, data_name_map, place, sc
- **scope** (Scope): The variable scope used by the program. If not specified, the default global scope is used. Default: [*fluid.global_scope()*](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/global_scope_cn.html#global-scope)
- **name_prefix** (str): The name prefix that the merge operation uniformly adds to the teacher's [*Variables*](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.3/api_guides/low_level/program.html#variable). Default: 'teacher_'

-**Returns:** the program obtained by merging student_program and teacher_program
+**Returns:**

!!! note "Note"
    *data_name_map* is a mapping **from teacher_var name to student_var name**; if it is written the other way round, the merge may not be performed correctly.
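For instance (an illustrative restatement of this note, not part of the commit): if the teacher program feeds a variable named 'y' and the student program feeds one named 'x', the key must be the teacher-side name:

```python
# keys are teacher_var names, values are student_var names
data_name_map = {'y': 'x'}
```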
@@ -37,7 +37,7 @@ with fluid.program_guard(teacher_program):
data_name_map = {'y':'x'}
USE_GPU = False
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = dist.merge(teacher_program, student_program,
+dist.merge(teacher_program, student_program,
           data_name_map, place)
```
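Since this commit removes the return value, existing call sites that captured it need a small migration. A minimal before/after sketch (illustrative, not part of the commit; it assumes teacher_program, student_program, data_name_map, and place are set up as in the example above):

```python
import paddle.fluid as fluid
from paddleslim import dist

# Before this commit: the fused program was returned.
# main_program = dist.merge(teacher_program, student_program,
#                           data_name_map, place)

# After this commit: merge() fuses the teacher graph into student_program
# in place and returns None, so student_program is the fused program.
dist.merge(teacher_program, student_program, data_name_map, place)
with fluid.program_guard(student_program):
    pass  # add distillation losses here, passing student_program explicitly
```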
@@ -76,8 +76,8 @@ with fluid.program_guard(teacher_program):
data_name_map = {'y':'x'}
USE_GPU = False
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
+merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(student_program):
    distillation_loss = dist.fsp_loss('teacher_t1.tmp_1', 'teacher_t2.tmp_1',
                                      's1.tmp_1', 's2.tmp_1', main_program)
```
@@ -116,8 +116,8 @@ with fluid.program_guard(teacher_program):
data_name_map = {'y':'x'}
USE_GPU = False
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
+merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(student_program):
    distillation_loss = dist.l2_loss('teacher_t2.tmp_1', 's2.tmp_1',
                                     main_program)
```
@@ -158,8 +158,8 @@ with fluid.program_guard(teacher_program):
data_name_map = {'y':'x'}
USE_GPU = False
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = merge(teacher_program, student_program, data_name_map, place)
-with fluid.program_guard(main_program):
+merge(teacher_program, student_program, data_name_map, place)
+with fluid.program_guard(student_program):
    distillation_loss = dist.soft_label_loss('teacher_t2.tmp_1',
                                             's2.tmp_1', main_program, 1., 1.)
```
@@ -198,13 +198,13 @@ with fluid.program_guard(teacher_program):
data_name_map = {'y':'x'}
USE_GPU = False
place = fluid.CUDAPlace(0) if USE_GPU else fluid.CPUPlace()
-main_program = merge(teacher_program, student_program, data_name_map, place)
+merge(teacher_program, student_program, data_name_map, place)
def adaptation_loss(t_var, s_var):
    teacher_channel = t_var.shape[1]
    s_hint = fluid.layers.conv2d(s_var, teacher_channel, 1)
    hint_loss = fluid.layers.reduce_mean(fluid.layers.square(s_hint - t_var))
    return hint_loss
-with fluid.program_guard(main_program):
+with fluid.program_guard(student_program):
    distillation_loss = dist.loss(main_program, adaptation_loss,
                                  t_var='teacher_t2.tmp_1', s_var='s2.tmp_1')
```
......
@@ -86,7 +86,7 @@ The merge step involves quite a few operations; for details see the [merge API docs](https://paddlep
```python
data_name_map = {'data': 'image'}
-student_program = merge(teacher_program, student_program, data_name_map, place)
+merge(teacher_program, student_program, data_name_map, place)
```
### 5. Add the distillation loss
......
@@ -34,7 +34,6 @@ def merge(teacher_program,
                      paddle run on which device.
        scope(Scope): The input scope
        name_prefix(str): Name prefix added for all vars of the teacher program.
-       Return(Program): Merged program.
    """
    teacher_program = teacher_program.clone(for_test=True)
    for teacher_var in teacher_program.list_vars():
@@ -84,11 +83,13 @@ def merge(teacher_program,
                attrs[attr_name] = op.attr(attr_name)
            student_program.global_block().append_op(
                type=op.type, inputs=inputs, outputs=outputs, attrs=attrs)
-   return student_program


-def fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name,
-             student_var2_name, program=fluid.default_main_program()):
+def fsp_loss(teacher_var1_name,
+             teacher_var2_name,
+             student_var1_name,
+             student_var2_name,
+             program=fluid.default_main_program()):
    """
    Combine variables from student model and teacher model by fsp-loss.
    Args:
@@ -115,7 +116,8 @@ def fsp_loss(teacher_var1_name, teacher_var2_name, student_var1_name,
    return fsp_loss


-def l2_loss(teacher_var_name, student_var_name,
+def l2_loss(teacher_var_name,
+            student_var_name,
            program=fluid.default_main_program()):
    """
    Combine variables from student model and teacher model by l2-loss.
......
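To summarize the new contract in single_distiller.py: merge() now returns None and fuses the teacher graph into student_program itself, so the loss helpers receive student_program explicitly. A minimal sketch (illustrative only; program construction and the variable names 'teacher_fc_0.tmp_0' / 'fc_0.tmp_0' follow the demo diff above):

```python
import paddle.fluid as fluid
from paddleslim.dist import merge, l2_loss

# teacher_program, student_program, data_name_map and place are assumed
# to be built as in the demo above.
ret = merge(teacher_program, student_program, data_name_map, place)
assert ret is None  # merge() no longer returns the fused program

with fluid.program_guard(student_program):
    # pass the (mutated) student_program where the merged program is needed
    distill_loss = l2_loss('teacher_fc_0.tmp_0', 'fc_0.tmp_0',
                           student_program)
```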