diff --git a/PaddleSpeech/DeepVoice3/.run_ce.sh b/PaddleSpeech/DeepVoice3/.run_ce.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1b3b9d2810294a3c78b32b3ddde9f1bd52415268
--- /dev/null
+++ b/PaddleSpeech/DeepVoice3/.run_ce.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+export FLAGS_fraction_of_gpu_memory_to_use=0.5
+export FLAGS_eager_delete_tensor_gb=0.0
+export FLAGS_fast_eager_deletion_mode=1
+
+
+train_single_frame()
+{
+    python train.py \
+        --data-root=data/ljspeech/ \
+        --use-gpu \
+        --preset=presets/deepvoice3_ljspeech.json \
+        --hparams="nepochs=10"
+}
+
+
+train_multi_frame()
+{
+    python train.py \
+        --data-root=data/ljspeech/ \
+        --use-gpu \
+        --preset=presets/deepvoice3_ljspeech.json \
+        --hparams="nepochs=10, downsample_step=1, outputs_per_step=4"
+
+}
+export CUDA_VISIBLE_DEVICES=0
+train_single_frame | python _ce.py
+sleep 20
+train_multi_frame | python _ce.py
+
diff --git a/PaddleSpeech/DeepVoice3/_ce.py b/PaddleSpeech/DeepVoice3/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..137cab05828338b8093a307b660cc606ca0dd073
--- /dev/null
+++ b/PaddleSpeech/DeepVoice3/_ce.py
@@ -0,0 +1,62 @@
+# This file is only used for the continuous evaluation (CE) test.
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+from kpi import AccKpi
+
+each_epoch_duration_frame1_card1 = DurationKpi("each_epoch_duration_frame1_card1", 0.02, actived=True)
+train_cost_frame1_card1 = CostKpi("train_cost_frame1_card1", 0.02, actived=True)
+each_epoch_duration_frame4_card1 = DurationKpi("each_epoch_duration_frame4_card1", 0.05, actived=True)
+train_cost_frame4_card1 = CostKpi("train_cost_frame4_card1", 0.02, actived=True)
+
+
+tracking_kpis = [
+    each_epoch_duration_frame1_card1,
+    train_cost_frame1_card1,
+    each_epoch_duration_frame4_card1,
+    train_cost_frame4_card1,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    Suggested format:
+
+    each KPI line in the log should be tab-separated as "kpis\t<name>\t<value>", for example:
+
+    "
+    kpis\ttrain_cost\t1.0
+    kpis\ttest_cost\t1.0
+    kpis\ttrain_cost\t1.0
+    kpis\ttrain_cost\t1.0
+    kpis\ttrain_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/PaddleSpeech/DeepVoice3/train_model.py b/PaddleSpeech/DeepVoice3/train_model.py
index bf8870cdf25c1f58180abfee9fa7d9ce185429b6..eed7477a707bc67979a698b8ab941ee4f621bbe2 100644
--- a/PaddleSpeech/DeepVoice3/train_model.py
+++ b/PaddleSpeech/DeepVoice3/train_model.py
@@ -17,6 +17,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import time
 from itertools import chain
 
 from paddle import fluid
@@ -31,6 +32,7 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
     assert fluid.framework.in_dygraph_mode(
     ), "this function must be run within dygraph guard"
 
+    n_trainers = dg.parallel.Env().nranks
     local_rank = dg.parallel.Env().local_rank
 
     # amount of shifting when compute losses
@@ -43,6 +45,9 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
     checkpoint_dir = os.path.join(args.output, "checkpoints")
     tensorboard_dir = os.path.join(args.output, "log")
 
+    ce_loss = 0
+    start_time = time.time()
+
     for epoch in range(hparams.nepochs):
         epoch_loss = 0.
         for step, inputs in tqdm(enumerate(loader())):
@@ -183,6 +188,7 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
 
             if (local_rank == 0 and global_step > 0
                     and global_step % hparams.checkpoint_interval == 0):
+
                 save_states(global_step, writer, mel_outputs, linear_outputs,
                             alignments, mel, linear,
                             input_lengths.numpy(), checkpoint_dir)
@@ -239,4 +245,14 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
         if writer is not None and local_rank == 0:
             writer.add_scalar("average_loss_in_epoch", average_loss_in_epoch,
                               global_epoch)
+        ce_loss = average_loss_in_epoch
         global_epoch += 1
+
+    end_time = time.time()
+    epoch_time = (end_time - start_time) / global_epoch
+    print("kpis\teach_epoch_duration_frame%s_card%s\t%s" %
+          (hparams.outputs_per_step, n_trainers, epoch_time))
+    print("kpis\ttrain_cost_frame%s_card%s\t%f" %
+          (hparams.outputs_per_step, n_trainers, ce_loss))
+
+
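Note on the CE flow above: it is driven entirely by stdout. train_model.py prints KPI records as lines of the form "kpis\t<name>\t<value>", .run_ce.sh pipes the training output into _ce.py, and _ce.py matches the names against tracking_kpis before calling add_record() and persist(). A minimal sketch for sanity-checking _ce.py without a full training run; the ceroot path and the KPI value here are placeholders, and ceroot must point at a checkout of the CE framework that provides the kpi module:

    # Placeholder path; _ce.py appends $ceroot to sys.path to import `kpi`.
    export ceroot=/path/to/continuous_evaluation
    # Hand-feed one well-formed KPI line; _ce.py should echo the parsed
    # fields, then record and persist the value for train_cost_frame1_card1.
    printf 'kpis\ttrain_cost_frame1_card1\t0.5\n' | python _ce.py

A malformed line (fewer than three tab-separated fields, or a first field other than "kpis") is silently skipped by parse_log, which is why the two training functions can print ordinary progress logs on the same stream.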