Commit e0076bb2 authored by zhengya01, committed by Feiyu Chan

add ce for DeepVoice3 (#3801)

* update .run_ce.sh

* chmod +x .run_ce.sh

* add ce

* add ce

* add ce

* add ce
Parent 2deb74ec
.run_ce.sh:

#!/bin/bash
# cap initial GPU memory allocation at 50% and free dead tensors eagerly
export FLAGS_fraction_of_gpu_memory_to_use=0.5
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fast_eager_deletion_mode=1
# CE job 1: train 10 epochs with the preset's default (single-frame) decoder setting
train_single_frame()
{
    python train.py \
        --data-root=data/ljspeech/ \
        --use-gpu \
        --preset=presets/deepvoice3_ljspeech.json \
        --hparams="nepochs=10"
}
# CE job 2: train 10 epochs with no downsampling and 4 output frames per decoder step
train_multi_frame()
{
    python train.py \
        --data-root=data/ljspeech/ \
        --use-gpu \
        --preset=presets/deepvoice3_ljspeech.json \
        --hparams="nepochs=10, downsample_step=1, outputs_per_step=4"
}
# run both jobs on a single GPU and pipe their stdout into the KPI parser
export CUDA_VISIBLE_DEVICES=0
train_single_frame | python _ce.py
sleep 20
train_multi_frame | python _ce.py
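For these pipes to record anything, train.py must print KPI records to stdout in the tab-separated form that _ce.py filters for; the train.py diff at the bottom of this commit adds exactly such prints. Illustrative lines from that stream (the numeric values here are made up):

kpis\teach_epoch_duration_frame1_card1\t52.63
kpis\ttrain_cost_frame1_card1\t0.3127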
_ce.py:

# This file is only used for the continuous evaluation (CE) test!
import os
import sys
# the CE framework exposes the kpi module under the directory named by the
# "ceroot" environment variable
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
from kpi import AccKpi
# one duration KPI and one cost KPI per job: "frameN" encodes outputs_per_step
# and "card1" the number of GPU cards; the second argument is assumed to be the
# tolerated fluctuation ratio of each KPI
each_epoch_duration_frame1_card1 = DurationKpi("each_epoch_duration_frame1_card1", 0.02, actived=True)
train_cost_frame1_card1 = CostKpi("train_cost_frame1_card1", 0.02, actived=True)
each_epoch_duration_frame4_card1 = DurationKpi("each_epoch_duration_frame4_card1", 0.05, actived=True)
train_cost_frame4_card1 = CostKpi("train_cost_frame4_card1", 0.02, actived=True)

tracking_kpis = [
    each_epoch_duration_frame1_card1,
    train_cost_frame1_card1,
    each_epoch_duration_frame4_card1,
    train_cost_frame4_card1,
]
def parse_log(log):
    '''
    This method should be implemented by model developers.
    Each KPI line in the log must have three tab-separated fields: the
    literal prefix "kpis", the KPI name, and its value, for example:
    "
    kpis\ttrain_cost\t1.0
    kpis\ttest_cost\t1.0
    kpis\ttrain_acc\t1.2
    "
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) == 3 and fs[0] == 'kpis':
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value
def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
    log = sys.stdin.read()
    log_to_ce(log)
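A minimal, self-contained sketch of the contract between the training log and this parser; parse_log is restated here only for illustration, and the sample values are invented:

def parse_log(log):
    # same filtering rule as _ce.py: keep only "kpis\t<name>\t<value>" lines
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        if len(fs) == 3 and fs[0] == 'kpis':
            yield fs[1], float(fs[2])

sample = "step 120, loss 0.41\nkpis\ttrain_cost_frame1_card1\t0.3127\n"
print(list(parse_log(sample)))  # [('train_cost_frame1_card1', 0.3127)]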
train.py:

@@ -17,6 +17,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import time
 from itertools import chain
 
 from paddle import fluid
@@ -31,6 +32,7 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
     assert fluid.framework.in_dygraph_mode(
     ), "this function must be run within dygraph guard"
 
+    n_trainers = dg.parallel.Env().nranks
     local_rank = dg.parallel.Env().local_rank
 
     # amount of shifting when compute losses
@@ -43,6 +45,9 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
     checkpoint_dir = os.path.join(args.output, "checkpoints")
     tensorboard_dir = os.path.join(args.output, "log")
 
+    ce_loss = 0
+    start_time = time.time()
+
     for epoch in range(hparams.nepochs):
         epoch_loss = 0.
         for step, inputs in tqdm(enumerate(loader())):
@@ -183,6 +188,7 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
             if (local_rank == 0 and global_step > 0 and
                     global_step % hparams.checkpoint_interval == 0):
                 save_states(global_step, writer, mel_outputs, linear_outputs,
                             alignments, mel, linear,
                             input_lengths.numpy(), checkpoint_dir)
@@ -239,4 +245,14 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
         if writer is not None and local_rank == 0:
             writer.add_scalar("average_loss_in_epoch", average_loss_in_epoch,
                               global_epoch)
+        ce_loss = average_loss_in_epoch
         global_epoch += 1
+
+    end_time = time.time()
+    epoch_time = (end_time - start_time) / global_epoch
+    print("kpis\teach_epoch_duration_frame%s_card%s\t%s" %
+          (hparams.outputs_per_step, n_trainers, epoch_time))
+    print("kpis\ttrain_cost_frame%s_card%s\t%f" %
+          (hparams.outputs_per_step, n_trainers, ce_loss))
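The two prints above generate the four KPI names that _ce.py tracks. A hedged sketch of the mapping, assuming outputs_per_step defaults to 1 in the single-frame job (the preset file is not shown in this commit) and n_trainers is 1 because only one GPU is visible:

# outputs_per_step: 1 for the single-frame job (assumed preset default),
# 4 for the multi-frame job; n_trainers == 1 with CUDA_VISIBLE_DEVICES=0
n_trainers = 1
for outputs_per_step in (1, 4):
    print("each_epoch_duration_frame%s_card%s" % (outputs_per_step, n_trainers))
    print("train_cost_frame%s_card%s" % (outputs_per_step, n_trainers))
# -> exactly the four KPI names defined in _ce.py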