Commit d5d4806f authored by liyang109

ce

Parent 62b45275
import os
import sys

sys.path.insert(0, os.environ['ceroot'])
from kpi import CostKpi, DurationKpi

# NOTE: kpi.py should be shared across models in some way!
train_cost_card1_kpi = CostKpi('train_cost_card1', 0.02, 0, actived=True)
test_cost_card1_kpi = CostKpi('test_cost_card1', 0.008, 0, actived=True)
train_duration_card1_kpi = DurationKpi(
    'train_duration_card1', 0.06, 0, actived=True)
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True)
test_cost_card4_kpi = CostKpi('test_cost_card4', 0.008, 0, actived=True)
train_duration_card4_kpi = DurationKpi(
    'train_duration_card4', 0.06, 0, actived=True)

tracking_kpis = [
    train_cost_card1_kpi,
    test_cost_card1_kpi,
    train_duration_card1_kpi,
    train_cost_card4_kpi,
    test_cost_card4_kpi,
    train_duration_card4_kpi,
]


def parse_log(log):
    '''
    This method should be implemented by model developers.

    Each KPI line in the log is expected to hold three tab-separated
    fields, prefixed with the literal tag "kpis", for example:

        kpis\ttrain_cost_card1\t1.0
        kpis\ttest_cost_card1\t1.0
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        if len(fs) == 3 and fs[0] == 'kpis':
            print("-----%s" % fs)
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value


def log_to_ce(log):
    # Index the tracked KPIs by name, then record and persist every
    # (name, value) pair parsed out of the log.
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for kpi_name, kpi_value in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()


if __name__ == '__main__':
    log = sys.stdin.read()
    print("*****")
    print(log)
    print("****")
    log_to_ce(log)
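A quick way to sanity-check parse_log above is to feed it a fabricated log. The snippet below is a hypothetical smoke test, not part of this commit; the KPI values are invented, but the kpis\t<name>\t<value> line shape matches what parse_log actually accepts:

# Hypothetical smoke test for parse_log (assumes the definitions above
# are importable, e.g. `from _ce import parse_log`).
sample_log = '\n'.join([
    'ordinary trainer output is ignored',
    'kpis\ttrain_cost_card1\t4.25',       # invented value
    'kpis\ttest_cost_card1\t4.10',        # invented value
    'kpis\ttrain_duration_card1\t120.5',  # invented value
])

# Prints each (name, value) pair parsed from the tagged kpi lines;
# non-kpi lines are silently skipped.
for name, value in parse_log(sample_log):
    print(name, value)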
#!/bin/bash

train() {
    DATA_PATH=./dataset/wmt16
    # The bare key-value pairs after the flags are passed through to
    # train.py as config overrides. No trailing '&' here: callers
    # background the call themselves and may pipe its output.
    python train.py \
        --src_vocab_fpath $DATA_PATH/en_10000.dict \
        --trg_vocab_fpath $DATA_PATH/de_10000.dict \
        --special_token '<s>' '<e>' '<unk>' \
        --train_file_pattern $DATA_PATH/wmt16/train \
        --val_file_pattern $DATA_PATH/wmt16/val \
        --use_token_batch True \
        --batch_size 1024 \
        --sort_type pool \
        --pool_size 200000 \
        --shuffle False \
        --enable_ce True \
        --local False \
        --shuffle_batch False \
        --use_py_reader True \
        --use_mem_opt True \
        --fetch_steps 100 "$@" \
        dropout_seed 10 \
        learning_rate 2.0 \
        warmup_steps 8000 \
        beta2 0.997 \
        d_model 512 \
        d_inner_hid 2048 \
        n_head 8 \
        prepostprocess_dropout 0.1 \
        attention_dropout 0.1 \
        relu_dropout 0.1 \
        weight_sharing True \
        pass_num 2 \
        model_dir 'tmp_models' \
        ckpt_dir 'tmp_ckpts'
}
export PADDLE_PSERVERS="127.0.0.1:7160,127.0.0.1:7161"
export PADDLE_TRAINERS_NUM="2"
mkdir -p logs
run_ps_ce_card1() {
    # Parameter servers stay off the GPU.
    TRAINING_ROLE="PSERVER" \
    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7160" \
    FLAGS_fraction_of_gpu_memory_to_use=0.0 \
    train &> logs/ps0.log &

    TRAINING_ROLE="PSERVER" \
    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7161" \
    FLAGS_fraction_of_gpu_memory_to_use=0.0 \
    train &> logs/ps1.log &

    # NOTE: `train &> file | python _ce.py` sends nothing down the pipe,
    # because &> redirects both stdout and stderr to the file. Use tee so
    # the log is both saved and fed to _ce.py.
    TRAINING_ROLE="TRAINER" \
    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7162" \
    PADDLE_TRAINER_ID="0" \
    CUDA_VISIBLE_DEVICES="6" \
    train 2>&1 | tee logs/tr0.log | python _ce.py &

    TRAINING_ROLE="TRAINER" \
    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7163" \
    PADDLE_TRAINER_ID="1" \
    CUDA_VISIBLE_DEVICES="7" \
    train 2>&1 | tee logs/tr1.log | python _ce.py &
}
run_ps_ce_card1
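For context on the environment variables this script sets: in parameter-server mode, train.py is expected to read its role and endpoint settings from the environment and hand them to the distribute transpiler. The sketch below only illustrates that convention under the assumption that train.py follows the usual fluid pattern; it is not this commit's actual code:

# Illustrative only: how a fluid-style train.py typically consumes the
# environment set up by run_ps_ce_card1.
import os

training_role = os.getenv("TRAINING_ROLE", "TRAINER")    # PSERVER or TRAINER
pserver_endpoints = os.getenv("PADDLE_PSERVERS")         # "127.0.0.1:7160,127.0.0.1:7161"
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")  # this process's own endpoint
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

# A pserver builds and runs the server program for its own endpoint;
# a trainer builds the trainer program and connects to all pservers.
print(training_role, current_endpoint, trainer_id, trainer_count)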
#!/bin/bash

train() {
    DATA_PATH=./dataset/wmt16
    # The bare key-value pairs after the flags are passed through to
    # train.py as config overrides. No trailing '&' here: callers
    # background the call themselves and may pipe its output.
    python train.py \
        --src_vocab_fpath $DATA_PATH/en_10000.dict \
        --trg_vocab_fpath $DATA_PATH/de_10000.dict \
        --special_token '<s>' '<e>' '<unk>' \
        --train_file_pattern $DATA_PATH/wmt16/train \
        --val_file_pattern $DATA_PATH/wmt16/val \
        --use_token_batch True \
        --batch_size 1024 \
        --sort_type pool \
        --pool_size 200000 \
        --shuffle False \
        --enable_ce True \
        --local False \
        --shuffle_batch False \
        --use_py_reader True \
        --use_mem_opt True \
        --fetch_steps 100 "$@" \
        dropout_seed 10 \
        learning_rate 2.0 \
        warmup_steps 8000 \
        beta2 0.997 \
        d_model 512 \
        d_inner_hid 2048 \
        n_head 8 \
        prepostprocess_dropout 0.1 \
        attention_dropout 0.1 \
        relu_dropout 0.1 \
        weight_sharing True \
        pass_num 2 \
        model_dir 'tmp_models' \
        ckpt_dir 'tmp_ckpts'
}
export PADDLE_PSERVERS="127.0.0.1:7160,127.0.0.1:7161"
export PADDLE_TRAINERS_NUM="2"
mkdir -p logs
run_ps_ce_card4() {
    # Parameter servers stay off the GPU.
    TRAINING_ROLE="PSERVER" \
    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7160" \
    FLAGS_fraction_of_gpu_memory_to_use=0.0 \
    train &> logs/ps2.log &

    TRAINING_ROLE="PSERVER" \
    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7161" \
    FLAGS_fraction_of_gpu_memory_to_use=0.0 \
    train &> logs/ps3.log &

    # As in the card1 script, use tee rather than `&> file | ...` so the
    # trainer log both lands in the file and reaches _ce.py.
    TRAINING_ROLE="TRAINER" \
    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7162" \
    PADDLE_TRAINER_ID="0" \
    CUDA_VISIBLE_DEVICES="0,1,2,3" \
    train 2>&1 | tee logs/tr2.log | python _ce.py &

    TRAINING_ROLE="TRAINER" \
    PADDLE_CURRENT_ENDPOINT="127.0.0.1:7163" \
    PADDLE_TRAINER_ID="1" \
    CUDA_VISIBLE_DEVICES="4,5,6,7" \
    train 2>&1 | tee logs/tr3.log | python _ce.py &
}
run_ps_ce_card4
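The card1 and card4 scripts differ only in the log file names and the CUDA_VISIBLE_DEVICES lists. If one wanted to fold that duplication away, a launcher along the following lines would work; launch_trainer is a hypothetical helper, not part of this commit, and the flags shown are a subset of those in train() above:

# Hypothetical Python launcher mirroring what the shell scripts do for
# one trainer process; names and defaults here are illustrative.
import os
import subprocess

def launch_trainer(trainer_id, endpoint, devices, log_path):
    env = dict(
        os.environ,
        TRAINING_ROLE="TRAINER",
        PADDLE_CURRENT_ENDPOINT=endpoint,
        PADDLE_TRAINER_ID=str(trainer_id),
        CUDA_VISIBLE_DEVICES=devices,
    )
    log = open(log_path, "wb")
    # Remaining train.py flags elided; see train() above for the full set.
    return subprocess.Popen(
        ["python", "train.py", "--enable_ce", "True", "--local", "False"],
        env=env, stdout=log, stderr=subprocess.STDOUT)

# e.g. the card1 topology, one GPU per trainer:
# launch_trainer(0, "127.0.0.1:7162", "6", "logs/tr0.log")
# launch_trainer(1, "127.0.0.1:7163", "7", "logs/tr1.log")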