PaddlePaddle / PALM
Commit 255f2d28, authored on Sep 29, 2019 by xixiaoyao

delete fp16

Parent: 696ef944
Showing 6 changed files with 19 additions and 54 deletions (+19 -54)
mtl_config.yaml                    +0  -1
mtl_run.py                         +3  -3
optimizer/bert_optimizer.py        +15 -44
paradigm/answer_matching.py        +0  -3
paradigm/mask_language_model.py    +1  -1
paradigm/reading_comprehension.py  +0  -2
mtl_config.yaml

@@ -31,6 +31,5 @@ max_seq_len: 512
 use_ema: True
 ema_decay: 0.9999
 random_seed: 0
-use_fp16: False
 loss_scaling: 1.0
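With use_fp16 removed from mtl_config.yaml (while loss_scaling stays), any code that still reads the flag has to tolerate its absence. A minimal sketch, assuming the file is parsed with PyYAML; this loader is illustrative, not the repository's config reader:

import yaml

# Illustrative only: load the YAML config and default the removed key.
with open("mtl_config.yaml") as f:
    config = yaml.safe_load(f)

# After this commit the key is gone, so fall back to False explicitly.
use_fp16 = config.get("use_fp16", False)
loss_scaling = config.get("loss_scaling", 1.0)
print(use_fp16, loss_scaling)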
mtl_run.py

@@ -275,14 +275,14 @@ def train(multitask_config):
             exe,
             args.pretrain_model_path,
             main_program=startup_prog,
-            use_fp16=args.use_fp16)
+            use_fp16=False)
     if args.checkpoint_path:
         if os.path.exists(args.checkpoint_path):
             init_checkpoint(
                 exe,
                 args.checkpoint_path,
                 main_program=startup_prog,
-                use_fp16=args.use_fp16)
+                use_fp16=False)
         else:
             os.makedirs(args.checkpoint_path)
@@ -294,7 +294,7 @@ def train(multitask_config):
             exe,
             args.checkpoint_path,
             main_program=test_prog,
-            use_fp16=args.use_fp16)
+            use_fp16=False)
     if args.do_train:
         print('start training...')
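Every init_pretraining_params / init_checkpoint call site now passes use_fp16=False. For orientation, a minimal sketch of what a checkpoint-restoring helper along these lines typically does in Paddle Fluid 1.x; load_checkpoint is a hypothetical name, not the repository's init_checkpoint, and it skips the fp16 master-weight handling the flag used to control:

import os
import paddle.fluid as fluid

def load_checkpoint(exe, checkpoint_path, main_program):
    # Illustrative sketch: restore persistable variables previously saved
    # with fluid.io.save_persistables; no fp16-specific handling here.
    assert os.path.exists(checkpoint_path), checkpoint_path
    fluid.io.load_persistables(exe, checkpoint_path, main_program=main_program)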
optimizer/bert_optimizer.py

@@ -19,8 +19,6 @@ from __future__ import print_function
 import numpy as np
 import paddle.fluid as fluid
-from utils.fp16 import create_master_params_grads, master_param_to_train_param
 
 def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
     """ Applies linear warmup of learning rate from 0 and decay to 0."""
@@ -73,8 +71,6 @@ def optimization(loss, programs, args):
     clip_norm_thres = 1.0
     # When using mixed precision training, scale the gradient clip threshold
     # by loss_scaling
-    if args.use_fp16 and args.loss_scaling > 1.0:
-        clip_norm_thres *= args.loss_scaling
     fluid.clip.set_gradient_clip(
         clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
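The removed branch multiplied clip_norm_thres by args.loss_scaling because loss scaling multiplies every gradient by the same constant, so the global norm grows by that factor too; clipping scaled gradients against an unscaled threshold would clip almost everything. A small NumPy sketch of the arithmetic with made-up values:

import numpy as np

loss_scaling = 128.0
clip_norm_thres = 1.0

true_grads = [np.array([0.3, -0.4]), np.array([0.5])]
scaled_grads = [g * loss_scaling for g in true_grads]  # what scaled backward produces

def global_norm(grads):
    return np.sqrt(sum(np.sum(g * g) for g in grads))

# The scaled global norm is exactly loss_scaling times the true one,
# so the clip threshold must be scaled by the same factor to stay equivalent.
print(global_norm(true_grads))         # ~0.707 -> not clipped at threshold 1.0
print(global_norm(scaled_grads))       # ~90.5  -> would be clipped hard
print(clip_norm_thres * loss_scaling)  # 128.0  -> the equivalent scaled threshold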
@@ -89,44 +85,19 @@ def optimization(loss, programs, args):
     param_list = dict()
 
-    if args.use_fp16:
-        param_grads = optimizer.backward(loss)
-        master_param_grads = create_master_params_grads(
-            param_grads, train_program, startup_prog, args.loss_scaling)
-        for param, _ in master_param_grads:
-            param_list[param.name] = param * 1.0
-            param_list[param.name].stop_gradient = True
-        optimizer.apply_gradients(master_param_grads)
-
-        if args.weight_decay > 0:
-            for param, grad in master_param_grads:
-                if exclude_from_weight_decay(param.name.rstrip(".master")):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
-
-        master_param_to_train_param(master_param_grads, param_grads,
-                                    train_program)
-    else:
-        for param in train_program.global_block().all_parameters():
-            param_list[param.name] = param * 1.0
-            param_list[param.name].stop_gradient = True
-
-        _, param_grads = optimizer.minimize(loss)
-
-        if args.weight_decay > 0:
-            for param, grad in param_grads:
-                if exclude_from_weight_decay(param.name):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * args.weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
+    for param in train_program.global_block().all_parameters():
+        param_list[param.name] = param * 1.0
+        param_list[param.name].stop_gradient = True
+
+    _, param_grads = optimizer.minimize(loss)
+
+    if args.weight_decay > 0:
+        for param, grad in param_grads:
+            if exclude_from_weight_decay(param.name):
+                continue
+            with param.block.program._optimized_guard(
+                [param, grad]), fluid.framework.name_scope("weight_decay"):
+                updated_param = param - param_list[
+                    param.name] * args.weight_decay * scheduled_lr
+                fluid.layers.assign(output=param, input=updated_param)
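The surviving branch snapshots every parameter into param_list with stop_gradient = True, runs optimizer.minimize, then applies weight decay against the pre-step snapshot under _optimized_guard. A NumPy sketch of that update order, using a plain SGD step and made-up numbers in place of the Fluid program:

import numpy as np

weight_decay = 0.01
scheduled_lr = 1e-4

param = np.array([0.5, -1.2, 2.0])
grad = np.array([0.1, 0.0, -0.3])

# 1) Snapshot the parameter before the optimizer step (param_list[param.name]).
snapshot = param.copy()

# 2) Optimizer step (plain SGD stands in for the step done by minimize()).
param = param - scheduled_lr * grad

# 3) Decoupled weight decay against the pre-step snapshot, mirroring
#    updated_param = param - param_list[name] * weight_decay * scheduled_lr.
param = param - snapshot * weight_decay * scheduled_lr
print(param)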
paradigm/answer_matching.py

@@ -25,9 +25,6 @@ def compute_loss(output_tensors, args=None):
         logits=logits, label=labels, return_softmax=True)
     loss = fluid.layers.mean(x=ce_loss)
-    if args.use_fp16 and args.loss_scaling > 1.0:
-        loss *= args.loss_scaling
     return loss
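After the deletion, compute_loss here reduces to softmax cross-entropy followed by a mean. For intuition, a NumPy sketch of what fluid.layers.softmax_with_cross_entropy(..., return_softmax=True) and fluid.layers.mean compute; this is a conceptual equivalent, not the Fluid implementation:

import numpy as np

def softmax_with_cross_entropy(logits, labels):
    # Row-wise softmax and per-example negative log-likelihood.
    shifted = logits - logits.max(axis=1, keepdims=True)
    softmax = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
    nll = -np.log(softmax[np.arange(len(labels)), labels])
    return nll, softmax

logits = np.array([[2.0, 0.5], [0.1, 1.5]])
labels = np.array([0, 1])
ce_loss, probs = softmax_with_cross_entropy(logits, labels)
loss = ce_loss.mean()  # fluid.layers.mean(x=ce_loss)
print(loss, probs)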
paradigm/mask_language_model.py

@@ -42,7 +42,7 @@ def create_model(reader_input, base_model=None, is_training=True, args=None):
     _hidden_act = config['hidden_act']
     _word_emb_name = "word_embedding"
-    _dtype = "float16" if args.use_fp16 else "float32"
+    _dtype = "float32"
     _param_initializer = fluid.initializer.TruncatedNormal(
         scale=config['initializer_range'])
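_dtype is now pinned to "float32". The reason the fp16 path needed the loss-scaling machinery deleted elsewhere in this commit is float16's narrow range and coarse precision: small gradient values underflow to zero unless the loss is scaled up first. A quick NumPy illustration:

import numpy as np

for dtype in (np.float16, np.float32):
    info = np.finfo(dtype)
    print(dtype.__name__, "eps:", info.eps, "max:", info.max, "tiny:", info.tiny)

# Tiny gradients vanish in half precision unless scaled up beforehand.
small_grad = np.float32(1e-8)
print(np.float16(small_grad))            # 0.0: underflows in float16
print(np.float16(small_grad * 1024.0))   # non-zero once scaled up first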
paradigm/reading_comprehension.py

@@ -35,8 +35,6 @@ def compute_loss(output_tensors, args=None):
     start_loss = _compute_single_loss(start_logits, start_positions)
     end_loss = _compute_single_loss(end_logits, end_positions)
     total_loss = (start_loss + end_loss) / 2.0
-    if args.use_fp16 and args.loss_scaling > 1.0:
-        total_loss = total_loss * args.loss_scaling
     return total_loss
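The deleted lines multiplied total_loss by args.loss_scaling when fp16 was enabled. Scaling the loss scales every gradient by the same constant, which has to be divided back out before the parameter update; presumably the removed utils.fp16 helpers handled that unscaling. A NumPy sketch of the round trip with made-up values:

import numpy as np

loss_scaling = 512.0
w = np.array([0.2, -0.7])
x = np.array([1.0, 2.0])

# For loss = (w @ x) ** 2, the gradient w.r.t. w is 2 * (w @ x) * x.
grad = 2.0 * (w @ x) * x
scaled_grad = loss_scaling * grad      # gradient of the scaled loss
restored = scaled_grad / loss_scaling  # unscale before the optimizer update

print(np.allclose(grad, restored))     # True: scaling is a no-op once undone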