未验证 提交 babe466a 编写于 作者: Q Qinghe JING 提交者: GitHub

Merge pull request #88 from hysunflower/update_scripts

Update scripts
...@@ -50,7 +50,7 @@ epoch_id = 0 ...@@ -50,7 +50,7 @@ epoch_id = 0
while not trainer.stop(): while not trainer.stop():
if epoch_id > 15: if epoch_id > 15:
break break
print("{} epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id)) print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
train_step = 0 train_step = 0
for data in reader(): for data in reader():
trainer.run(feed=data, fetch=[]) trainer.run(feed=data, fetch=[])
......
#!/bin/bash
unset http_proxy unset http_proxy
unset https_proxy unset https_proxy
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9 ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
log_dir=${1:-$(pwd)} log_dir=${1:-"logs"}
mkdir -p ${log_dir} mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 & python fl_master.py > ${log_dir}/master.log 2>&1 &
......
...@@ -20,6 +20,7 @@ import paddle ...@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import logging import logging
import math import math
import time
logging.basicConfig( logging.basicConfig(
filename="test.log", filename="test.log",
...@@ -72,9 +73,9 @@ epoch_id = 0 ...@@ -72,9 +73,9 @@ epoch_id = 0
step = 0 step = 0
while not trainer.stop(): while not trainer.stop():
epoch_id += 1 epoch_id += 1
if epoch_id > 40: if epoch_id > 10:
break break
print("epoch %d start train" % (epoch_id)) print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
for step_id, data in enumerate(train_reader()): for step_id, data in enumerate(train_reader()):
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"]) acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1 step += 1
......
#!/bin/bash
unset http_proxy unset http_proxy
unset https_proxy unset https_proxy
python fl_master.py ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2 sleep 2
python -u fl_scheduler.py >scheduler.log & python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2 sleep 2
python -u fl_server.py >server0.log & for ((i=0;i<4;i++))
sleep 2 do
python -u fl_trainer.py 0 >trainer0.log & python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2 sleep 2
python -u fl_trainer.py 1 >trainer1.log & done
sleep 2
python -u fl_trainer.py 2 >trainer2.log &
sleep 2
python -u fl_trainer.py 3 >trainer3.log &
...@@ -21,6 +21,7 @@ import paddle ...@@ -21,6 +21,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import logging import logging
import math import math
import time
logging.basicConfig( logging.basicConfig(
filename="test.log", filename="test.log",
...@@ -60,7 +61,7 @@ def train_test(train_test_program, train_test_feed, train_test_reader): ...@@ -60,7 +61,7 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
epoch_id = 0 epoch_id = 0
step = 0 step = 0
epoch = 3000 epoch = 10
count_by_step = False count_by_step = False
if count_by_step: if count_by_step:
output_folder = "model_node%d" % trainer_id output_folder = "model_node%d" % trainer_id
...@@ -72,7 +73,7 @@ while not trainer.stop(): ...@@ -72,7 +73,7 @@ while not trainer.stop():
epoch_id += 1 epoch_id += 1
if epoch_id > epoch: if epoch_id > epoch:
break break
print("epoch %d start train" % (epoch_id)) print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
#train_data,test_data= data_generater(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step) #train_data,test_data= data_generater(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -97,7 +98,6 @@ while not trainer.stop(): ...@@ -97,7 +98,6 @@ while not trainer.stop():
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"]) acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1 step += 1
count += 1 count += 1
print(count)
if count % trainer._step == 0: if count % trainer._step == 0:
break break
# print("acc:%.3f" % (acc[0])) # print("acc:%.3f" % (acc[0]))
......
#!/bin/bash
unset http_proxy unset http_proxy
unset https_proxy unset https_proxy
#killall python ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
python fl_master.py
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2 sleep 2
python -u fl_scheduler.py >scheduler.log & python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 2 sleep 5
python -u fl_server.py >server0.log & python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2 sleep 2
for ((i=0;i<4;i++)) for ((i=0;i<4;i++))
do do
python -u fl_trainer.py $i >trainer$i.log & python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2 sleep 2
done done
...@@ -17,6 +17,10 @@ pip install paddle_fl ...@@ -17,6 +17,10 @@ pip install paddle_fl
#### How to save a program #### How to save a program
```sh
python program_saver.py
```
In program_saver.py, you can define a model and save the program into 'load_file'. In program_saver.py, you can define a model and save the program into 'load_file'.
```python ```python
......
...@@ -20,6 +20,7 @@ import paddle ...@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import logging import logging
import math import math
import time
logging.basicConfig( logging.basicConfig(
filename="test.log", filename="test.log",
...@@ -67,9 +68,9 @@ epoch_id = 0 ...@@ -67,9 +68,9 @@ epoch_id = 0
step = 0 step = 0
while not trainer.stop(): while not trainer.stop():
epoch_id += 1 epoch_id += 1
if epoch_id > 40: if epoch_id > 10:
break break
print("epoch %d start train" % (epoch_id)) print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
for step_id, data in enumerate(train_reader()): for step_id, data in enumerate(train_reader()):
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"]) acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1 step += 1
......
#!/bin/bash
unset http_proxy unset http_proxy
unset https_proxy unset https_proxy
python program_saver.py ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d load_file ]; then
python program_saver.py
fi
python fl_master.py log_dir=${1:-"logs"}
sleep 2 mkdir -p ${log_dir}
python -u fl_scheduler.py >scheduler.log &
sleep 2 python fl_master.py > ${log_dir}/master.log 2>&1 &
python -u fl_server.py >server0.log &
sleep 2
python -u fl_trainer.py 0 >trainer0.log &
sleep 2 sleep 2
python -u fl_trainer.py 1 > trainer1.log & python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2 sleep 2
for ((i=0;i<2;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
...@@ -20,6 +20,8 @@ import numpy as np ...@@ -20,6 +20,8 @@ import numpy as np
import sys import sys
import os import os
import logging import logging
import time
logging.basicConfig( logging.basicConfig(
filename="test.log", filename="test.log",
filemode="w", filemode="w",
...@@ -43,10 +45,9 @@ r = Gru4rec_Reader() ...@@ -43,10 +45,9 @@ r = Gru4rec_Reader()
train_reader = r.reader(train_file_dir, place, batch_size=125) train_reader = r.reader(train_file_dir, place, batch_size=125)
output_folder = "model_node4" output_folder = "model_node4"
step_i = 0 epoch_i = 0
while not trainer.stop(): while not trainer.stop():
step_i += 1 epoch_i += 1
print("batch %d start train" % (step_i))
train_step = 0 train_step = 0
for data in train_reader(): for data in train_reader():
#print(np.array(data['src_wordseq'])) #print(np.array(data['src_wordseq']))
...@@ -56,10 +57,10 @@ while not trainer.stop(): ...@@ -56,10 +57,10 @@ while not trainer.stop():
break break
avg_ppl = np.exp(ret_avg_cost[0]) avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl) newest_ppl = np.mean(avg_ppl)
print("ppl:%.3f" % (newest_ppl)) print("{} Epoch {} start train, train_step {}, ppl {}".format (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_i, train_step, newest_ppl))
save_dir = (output_folder + "/epoch_%d") % step_i save_dir = (output_folder + "/epoch_%d") % epoch_i
if trainer_id == 0: if trainer_id == 0:
print("start save") print("start save")
trainer.save_inference_program(save_dir) trainer.save_inference_program(save_dir)
if step_i >= 40: if epoch_i >= 5:
break break
#!/bin/bash
unset http_proxy unset http_proxy
unset https_proxy unset https_proxy
python fl_master.py ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d mid_data ];then
sh download.sh
fi
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2 sleep 2
python -u fl_scheduler.py >scheduler.log & python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
python -u fl_server.py >server0.log & sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2 sleep 2
python -u fl_trainer.py 0 >trainer0.log & for ((i=0;i<4;i++))
sleep 2 do
python -u fl_trainer.py 1 >trainer1.log & python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2 sleep 2
python -u fl_trainer.py 2 >trainer2.log & done
sleep 2
python -u fl_trainer.py 3 >trainer3.log &
...@@ -84,21 +84,16 @@ def train_test(train_test_program, train_test_feed, train_test_reader): ...@@ -84,21 +84,16 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
# for test # for test
while not trainer.stop(): while not trainer.stop():
epoch_id += 1 epoch_id += 1
print("epoch %d start train" % (epoch_id))
for data in train_reader(): for data in train_reader():
step_i += 1 step_i += 1
trainer.step_id = step_i trainer.step_id = step_i
accuracy, = trainer.run(feed=feeder.feed(data), accuracy, = trainer.run(feed=feeder.feed(data),
fetch=["accuracy_0.tmp_0"]) fetch=["accuracy_0.tmp_0"])
if step_i % 100 == 0: if step_i % 100 == 0:
print("Epoch: {0}, step: {1}, accuracy: {2}".format( print("{} Epoch {} start train, step: {}, accuracy: {}".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())),
epoch_id, step_i, accuracy[0])) epoch_id, step_i, accuracy[0]))
print(step_i)
avg_loss_val, acc_val = train_test( avg_loss_val, acc_val = train_test(
train_test_program=test_program, train_test_program=test_program,
train_test_reader=test_reader, train_test_reader=test_reader,
...@@ -106,7 +101,7 @@ while not trainer.stop(): ...@@ -106,7 +101,7 @@ while not trainer.stop():
print("Test with Epoch %d, avg_cost: %s, acc: %s" % print("Test with Epoch %d, avg_cost: %s, acc: %s" %
(epoch_id, avg_loss_val, acc_val)) (epoch_id, avg_loss_val, acc_val))
if epoch_id > 40: if epoch_id > 5:
break break
if epoch_id % 5 == 0: if epoch_id % 5 == 0:
trainer.save_inference_program(output_folder) trainer.save_inference_program(output_folder)
#!/bin/bash
unset http_proxy unset http_proxy
unset https_proxy unset https_proxy
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d log ];then log_dir=${1:-"logs"}
mkdir log mkdir -p ${log_dir}
fi
python fl_master.py python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2 sleep 2
python -u fl_server.py >log/server0.log & python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2 sleep 2
python -u fl_scheduler.py > log/scheduler.log & for ((i=0;i<2;i++))
sleep 2 do
python -u fl_trainer.py 0 >log/trainer0.log & python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2 sleep 2
python -u fl_trainer.py 1 >log/trainer1.log & done
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册