未验证 提交 babe466a 编写于 作者: Q Qinghe JING 提交者: GitHub

Merge pull request #88 from hysunflower/update_scripts

Update scripts
......@@ -50,7 +50,7 @@ epoch_id = 0
while not trainer.stop():
if epoch_id > 15:
break
print("{} epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
train_step = 0
for data in reader():
trainer.run(feed=data, fetch=[])
......
#!/bin/bash
unset http_proxy
unset https_proxy
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
log_dir=${1:-$(pwd)}
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
......
......@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid
import logging
import math
import time
logging.basicConfig(
filename="test.log",
......@@ -72,9 +73,9 @@ epoch_id = 0
step = 0
while not trainer.stop():
epoch_id += 1
if epoch_id > 40:
if epoch_id > 10:
break
print("epoch %d start train" % (epoch_id))
print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
for step_id, data in enumerate(train_reader()):
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1
......
#!/bin/bash
unset http_proxy
unset https_proxy
python fl_master.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_scheduler.py >scheduler.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
python -u fl_server.py >server0.log &
sleep 2
python -u fl_trainer.py 0 >trainer0.log &
sleep 2
python -u fl_trainer.py 1 >trainer1.log &
sleep 2
python -u fl_trainer.py 2 >trainer2.log &
sleep 2
python -u fl_trainer.py 3 >trainer3.log &
for ((i=0;i<4;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -21,6 +21,7 @@ import paddle
import paddle.fluid as fluid
import logging
import math
import time
logging.basicConfig(
filename="test.log",
......@@ -60,7 +61,7 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
epoch_id = 0
step = 0
epoch = 3000
epoch = 10
count_by_step = False
if count_by_step:
output_folder = "model_node%d" % trainer_id
......@@ -72,7 +73,7 @@ while not trainer.stop():
epoch_id += 1
if epoch_id > epoch:
break
print("epoch %d start train" % (epoch_id))
print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
#train_data,test_data= data_generater(trainer_id,inner_step=trainer._step,batch_size=64,count_by_step=count_by_step)
train_reader = paddle.batch(
paddle.reader.shuffle(
......@@ -97,7 +98,6 @@ while not trainer.stop():
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1
count += 1
print(count)
if count % trainer._step == 0:
break
# print("acc:%.3f" % (acc[0]))
......
#!/bin/bash
unset http_proxy
unset https_proxy
#killall python
python fl_master.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_scheduler.py >scheduler.log &
sleep 2
python -u fl_server.py >server0.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
for ((i=0;i<4;i++))
do
python -u fl_trainer.py $i >trainer$i.log &
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -17,6 +17,10 @@ pip install paddle_fl
#### How to save a program
```sh
python program_saver.py
```
In program_saver.py, you can define a model and save the program into 'load_file'.
```python
......
......@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid
import logging
import math
import time
logging.basicConfig(
filename="test.log",
......@@ -67,9 +68,9 @@ epoch_id = 0
step = 0
while not trainer.stop():
epoch_id += 1
if epoch_id > 40:
if epoch_id > 10:
break
print("epoch %d start train" % (epoch_id))
print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_id))
for step_id, data in enumerate(train_reader()):
acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
step += 1
......
#!/bin/bash
unset http_proxy
unset https_proxy
python program_saver.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d load_file ]; then
python program_saver.py
fi
python fl_master.py
sleep 2
python -u fl_scheduler.py >scheduler.log &
sleep 2
python -u fl_server.py >server0.log &
sleep 2
python -u fl_trainer.py 0 >trainer0.log &
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_trainer.py 1 > trainer1.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
for ((i=0;i<2;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -20,6 +20,8 @@ import numpy as np
import sys
import os
import logging
import time
logging.basicConfig(
filename="test.log",
filemode="w",
......@@ -43,10 +45,9 @@ r = Gru4rec_Reader()
train_reader = r.reader(train_file_dir, place, batch_size=125)
output_folder = "model_node4"
step_i = 0
epoch_i = 0
while not trainer.stop():
step_i += 1
print("batch %d start train" % (step_i))
epoch_i += 1
train_step = 0
for data in train_reader():
#print(np.array(data['src_wordseq']))
......@@ -56,10 +57,10 @@ while not trainer.stop():
break
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
print("ppl:%.3f" % (newest_ppl))
save_dir = (output_folder + "/epoch_%d") % step_i
print("{} Epoch {} start train, train_step {}, ppl {}".format (time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), epoch_i, train_step, newest_ppl))
save_dir = (output_folder + "/epoch_%d") % epoch_i
if trainer_id == 0:
print("start save")
trainer.save_inference_program(save_dir)
if step_i >= 40:
if epoch_i >= 5:
break
#!/bin/bash
unset http_proxy
unset https_proxy
python fl_master.py
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d mid_data ];then
sh download.sh
fi
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_scheduler.py >scheduler.log &
python -u fl_server.py >server0.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
python -u fl_trainer.py 0 >trainer0.log &
sleep 2
python -u fl_trainer.py 1 >trainer1.log &
sleep 2
python -u fl_trainer.py 2 >trainer2.log &
sleep 2
python -u fl_trainer.py 3 >trainer3.log &
for ((i=0;i<4;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
......@@ -84,21 +84,16 @@ def train_test(train_test_program, train_test_feed, train_test_reader):
# for test
while not trainer.stop():
epoch_id += 1
print("epoch %d start train" % (epoch_id))
for data in train_reader():
step_i += 1
trainer.step_id = step_i
accuracy, = trainer.run(feed=feeder.feed(data),
fetch=["accuracy_0.tmp_0"])
if step_i % 100 == 0:
print("Epoch: {0}, step: {1}, accuracy: {2}".format(
print("{} Epoch {} start train, step: {}, accuracy: {}".format(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())),
epoch_id, step_i, accuracy[0]))
print(step_i)
avg_loss_val, acc_val = train_test(
train_test_program=test_program,
train_test_reader=test_reader,
......@@ -106,7 +101,7 @@ while not trainer.stop():
print("Test with Epoch %d, avg_cost: %s, acc: %s" %
(epoch_id, avg_loss_val, acc_val))
if epoch_id > 40:
if epoch_id > 5:
break
if epoch_id % 5 == 0:
trainer.save_inference_program(output_folder)
#!/bin/bash
unset http_proxy
unset https_proxy
ps -ef | grep -E fl_ | grep -v grep | awk '{print $2}' | xargs kill -9
if [ ! -d log ];then
mkdir log
fi
log_dir=${1:-"logs"}
mkdir -p ${log_dir}
python fl_master.py
python fl_master.py > ${log_dir}/master.log 2>&1 &
sleep 2
python -u fl_server.py >log/server0.log &
python -u fl_scheduler.py > ${log_dir}/scheduler.log 2>&1 &
sleep 5
python -u fl_server.py > ${log_dir}/server0.log 2>&1 &
sleep 2
python -u fl_scheduler.py > log/scheduler.log &
sleep 2
python -u fl_trainer.py 0 >log/trainer0.log &
sleep 2
python -u fl_trainer.py 1 >log/trainer1.log &
for ((i=0;i<2;i++))
do
python -u fl_trainer.py $i > ${log_dir}/trainer$i.log 2>&1 &
sleep 2
done
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册