Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
models
提交
77f6c273
M
models
项目概览
PaddlePaddle
/
models
大约 2 年 前同步成功
通知
232
Star
6828
Fork
2962
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
602
列表
看板
标记
里程碑
合并请求
255
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
602
Issue
602
列表
看板
标记
里程碑
合并请求
255
合并请求
255
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
“4e23ba325db40a212ed30165143bcb5301bd106c”上不存在“paddle/phi/kernels/determinant_kernel.h”
未验证
提交
77f6c273
编写于
10月 28, 2018
作者:
G
gongweibao
提交者:
GitHub
10月 28, 2018
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Cleanup transformer train code! (#1392)
上级
514fab8b
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
75 addition
and
24 deletion
+75
-24
fluid/neural_machine_translation/transformer/local_dist.sh
fluid/neural_machine_translation/transformer/local_dist.sh
+64
-0
fluid/neural_machine_translation/transformer/train.py
fluid/neural_machine_translation/transformer/train.py
+11
-24
未找到文件。
fluid/neural_machine_translation/transformer/local_dist.sh
0 → 100755
浏览文件 @
77f6c273
#!/bin/bash
# Launch a one-machine distributed run of train.py: first a parameter server,
# then (after a fixed delay) a trainer. Both processes are started in the
# background on 127.0.0.1 and write stdout/stderr to pserver.log and
# trainer.log respectively. The script itself returns right after launching
# the trainer; neither process is waited on or stopped here.
set -x

unset http_proxy
unset https_proxy

# Environment shared by both roles. Only TRAINING_ROLE differs between the
# pserver and trainer launches, so everything else is exported once.
export PADDLE_PORT=30134
export PADDLE_PSERVERS=127.0.0.1
export PADDLE_IS_LOCAL=0
export PADDLE_INIT_TRAINER_COUNT=1
export POD_IP=127.0.0.1
export PADDLE_TRAINER_ID=0
export PADDLE_TRAINERS_NUM=1

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/:/workspace/brpc
export PYTHONPATH=$PYTHONPATH:/paddle/build/build_reader_RelWithDebInfo_gpu/python

# Command-line options passed unchanged to train.py for both roles. Keeping
# them in a single array guarantees the pserver and trainer cannot drift apart.
# '\x01' is passed as the literal four characters backslash-x-0-1 (single
# quotes); any decoding is left to train.py.
train_args=(
    --src_vocab_fpath 'cluster_test_data_en_fr/thirdparty/vocab.wordpiece.en-fr'
    --trg_vocab_fpath 'cluster_test_data_en_fr/thirdparty/vocab.wordpiece.en-fr'
    --special_token '<s>' '<e>' '<unk>'
    --token_delimiter '\x01'
    --train_file_pattern 'cluster_test_data_en_fr/train/train.wordpiece.en-fr.0'
    --val_file_pattern 'cluster_test_data_en_fr/thirdparty/newstest2014.wordpiece.en-fr'
    --use_token_batch True
    --batch_size 3200
    --sort_type pool
    --pool_size 200000
    --local False
)

#pserver
export TRAINING_ROLE=PSERVER
#GLOG_v=7 GLOG_logtostderr=1
CUDA_VISIBLE_DEVICES=4,5,6,7 python -u train.py "${train_args[@]}" > pserver.log 2>&1 &
# $! is the PID of the python process just sent to the background.
pserver_pid=$!
echo "$pserver_pid"

# Fixed delay between starting the pserver and starting the trainer.
sleep 30s

#trainer
export TRAINING_ROLE=TRAINER
CUDA_VISIBLE_DEVICES=4,5,6,7 python -u train.py "${train_args[@]}" > trainer.log 2>&1 &

#sleep 80
#kill -9 $pserver_pid
fluid/neural_machine_translation/transformer/train.py
浏览文件 @
77f6c273
...
@@ -643,7 +643,7 @@ def train(args):
...
@@ -643,7 +643,7 @@ def train(args):
if
args
.
sync
:
if
args
.
sync
:
lr_decay
=
fluid
.
layers
.
learning_rate_scheduler
.
noam_decay
(
lr_decay
=
fluid
.
layers
.
learning_rate_scheduler
.
noam_decay
(
ModelHyperParams
.
d_model
,
TrainTaskConfig
.
warmup_steps
)
ModelHyperParams
.
d_model
,
TrainTaskConfig
.
warmup_steps
)
print
(
"before adam"
)
logging
.
info
(
"before adam"
)
with
fluid
.
default_main_program
().
_lr_schedule_guard
():
with
fluid
.
default_main_program
().
_lr_schedule_guard
():
learning_rate
=
lr_decay
*
TrainTaskConfig
.
learning_rate
learning_rate
=
lr_decay
*
TrainTaskConfig
.
learning_rate
...
@@ -661,7 +661,7 @@ def train(args):
...
@@ -661,7 +661,7 @@ def train(args):
fluid
.
memory_optimize
(
train_prog
)
fluid
.
memory_optimize
(
train_prog
)
if
args
.
local
:
if
args
.
local
:
print
(
"local start_up:"
)
logging
.
info
(
"local start_up:"
)
train_loop
(
exe
,
train_prog
,
startup_prog
,
dev_count
,
sum_cost
,
avg_cost
,
train_loop
(
exe
,
train_prog
,
startup_prog
,
dev_count
,
sum_cost
,
avg_cost
,
token_num
,
predict
,
pyreader
)
token_num
,
predict
,
pyreader
)
else
:
else
:
...
@@ -677,9 +677,9 @@ def train(args):
...
@@ -677,9 +677,9 @@ def train(args):
if
trainer_id
==
0
:
if
trainer_id
==
0
:
logging
.
info
(
"train_id == 0, sleep 60s"
)
logging
.
info
(
"train_id == 0, sleep 60s"
)
time
.
sleep
(
60
)
time
.
sleep
(
60
)
print
(
"trainers_num:"
,
trainers_num
)
logging
.
info
(
"trainers_num:{}"
.
format
(
trainers_num
)
)
print
(
"worker_endpoints:"
,
worker_endpoints
)
logging
.
info
(
"worker_endpoints:{}"
.
format
(
worker_endpoints
)
)
print
(
"current_endpoint:"
,
current_endpoint
)
logging
.
info
(
"current_endpoint:{}"
.
format
(
current_endpoint
)
)
append_nccl2_prepare
(
trainer_id
,
worker_endpoints
,
current_endpoint
)
append_nccl2_prepare
(
trainer_id
,
worker_endpoints
,
current_endpoint
)
train_loop
(
exe
,
train_loop
(
exe
,
fluid
.
default_main_program
(),
dev_count
,
sum_cost
,
fluid
.
default_main_program
(),
dev_count
,
sum_cost
,
...
@@ -696,11 +696,11 @@ def train(args):
...
@@ -696,11 +696,11 @@ def train(args):
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
print
(
"pserver_endpoints"
,
pserver_endpoints
)
logging
.
info
(
"pserver_endpoints:{}"
.
format
(
pserver_endpoints
)
)
print
(
"current_endpoint"
,
current_endpoint
)
logging
.
info
(
"current_endpoint:{}"
.
format
(
current_endpoint
)
)
print
(
"trainer_id"
,
trainer_id
)
logging
.
info
(
"trainer_id:{}"
.
format
(
trainer_id
)
)
print
(
"pserver_ips"
,
pserver_ips
)
logging
.
info
(
"pserver_ips:{}"
.
format
(
pserver_ips
)
)
print
(
"port"
,
port
)
logging
.
info
(
"port:{}"
.
format
(
port
)
)
t
=
fluid
.
DistributeTranspiler
()
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
t
.
transpile
(
...
@@ -715,30 +715,17 @@ def train(args):
...
@@ -715,30 +715,17 @@ def train(args):
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
os
.
getenv
(
current_endpoint
=
os
.
getenv
(
"POD_IP"
)
+
":"
+
os
.
getenv
(
"PADDLE_PORT"
)
"PADDLE_PORT"
)
if
not
current_endpoint
:
if
not
current_endpoint
:
print
(
"need env SERVER_ENDPOINT"
)
logging
.
critical
(
"need env SERVER_ENDPOINT"
)
exit
(
1
)
exit
(
1
)
pserver_prog
=
t
.
get_pserver_program
(
current_endpoint
)
pserver_prog
=
t
.
get_pserver_program
(
current_endpoint
)
pserver_startup
=
t
.
get_startup_program
(
current_endpoint
,
pserver_startup
=
t
.
get_startup_program
(
current_endpoint
,
pserver_prog
)
pserver_prog
)
print
(
"pserver start:"
)
program_to_code
(
pserver_startup
)
print
(
"pserver train:"
)
program_to_code
(
pserver_prog
)
#sys.exit(0)
exe
.
run
(
pserver_startup
)
exe
.
run
(
pserver_startup
)
exe
.
run
(
pserver_prog
)
exe
.
run
(
pserver_prog
)
elif
training_role
==
"TRAINER"
:
elif
training_role
==
"TRAINER"
:
logging
.
info
(
"distributed: trainer started"
)
logging
.
info
(
"distributed: trainer started"
)
trainer_prog
=
t
.
get_trainer_program
()
trainer_prog
=
t
.
get_trainer_program
()
'''
print("trainer start:")
program_to_code(pserver_startup)
print("trainer train:")
program_to_code(trainer_prog)
sys.exit(0)
'''
train_loop
(
exe
,
train_prog
,
startup_prog
,
dev_count
,
sum_cost
,
train_loop
(
exe
,
train_prog
,
startup_prog
,
dev_count
,
sum_cost
,
avg_cost
,
token_num
,
predict
,
pyreader
)
avg_cost
,
token_num
,
predict
,
pyreader
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录