Add continuous evaluation (CE) for DeepVoice3 (#3801)

* update .run_ce.sh * chmod +x .run_ce.sh * add ce * add ce * add ce * add ce

Add continuous evaluation (CE) for DeepVoice3 (#3801)
* update .run_ce.sh * chmod +x .run_ce.sh * add ce * add ce * add ce * add ce
e0076bb2 · zhengya01 · Feiyu Chan · 2deb74ec · e0076bb2 · e0076bb2
3 changed files
--- a/PaddleSpeech/DeepVoice3/.run_ce.sh
+++ b/PaddleSpeech/DeepVoice3/.run_ce.sh
+#!/bin/bash
+# PaddlePaddle runtime flags used for the CE runs: the GPU memory fraction
+# and the eager tensor deletion settings.
+export FLAGS_fraction_of_gpu_memory_to_use=0.5 \
+       FLAGS_eager_delete_tensor_gb=0.0 \
+       FLAGS_fast_eager_deletion_mode=1
+train_single_frame()
+{
+    python train.py \
+    --data-root=data/ljspeech/ \
+    --use-gpu \
+    --preset=presets/deepvoice3_ljspeech.json \
+    --hparams="nepochs=10"
+}
+train_multi_frame()
+{
+    python train.py \
+    --data-root=data/ljspeech/ \
+    --use-gpu \
+    --preset=presets/deepvoice3_ljspeech.json \
+    --hparams="nepochs=10, downsample_step=1, outputs_per_step=4"
+}
+export CUDA_VISIBLE_DEVICES=0
+train_single_frame | python _ce.py
+sleep 20
+train_multi_frame | python _ce.py
--- a/PaddleSpeech/DeepVoice3/_ce.py
+++ b/PaddleSpeech/DeepVoice3/_ce.py
+# this file is only used for continuous evaluation test!
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+from kpi import AccKpi
+each_epoch_duration_frame1_card1 = DurationKpi("each_epoch_duration_frame1_card1", 0.02, actived=True)
+train_cost_frame1_card1 = CostKpi("train_cost_frame1_card1", 0.02, actived=True)
+each_epoch_duration_frame4_card1 = DurationKpi("each_epoch_duration_frame4_card1", 0.05, actived=True)
+train_cost_frame4_card1 = CostKpi("train_cost_frame4_card1", 0.02, actived=True)
+tracking_kpis = [
+    each_epoch_duration_frame1_card1,
+    train_cost_frame1_card1,
+    each_epoch_duration_frame4_card1,
+    train_cost_frame4_card1,
+]
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+    The suggestion:
+    each line in the log should be key, value, for example:
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
--- a/PaddleSpeech/DeepVoice3/train_model.py
+++ b/PaddleSpeech/DeepVoice3/train_model.py
@@ -17,6 +17,7 @@ from __future__ import division
 from __future__ import print_function
 import os
+import time
 from itertools import chain
 from paddle import fluid
@@ -31,6 +32,7 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
    assert fluid.framework.in_dygraph_mode(
    ), "this function must be run within dygraph guard"
+    n_trainers = dg.parallel.Env().nranks
    local_rank = dg.parallel.Env().local_rank
    # amount of shifting when compute losses
@@ -43,6 +45,9 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
    checkpoint_dir = os.path.join(args.output, "checkpoints")
    tensorboard_dir = os.path.join(args.output, "log")
+    ce_loss = 0
+    start_time = time.time()
    for epoch in range(hparams.nepochs):
        epoch_loss = 0.
        for step, inputs in tqdm(enumerate(loader())):
@@ -183,6 +188,7 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
            if (local_rank == 0 and global_step > 0 and
                    global_step % hparams.checkpoint_interval == 0):
                save_states(global_step, writer, mel_outputs, linear_outputs,
                            alignments, mel, linear,
                            input_lengths.numpy(), checkpoint_dir)
@@ -239,4 +245,14 @@ def train_model(model, loader, criterion, optimizer, clipper, writer, args,
        if writer is not None and local_rank == 0:
            writer.add_scalar("average_loss_in_epoch", average_loss_in_epoch,
                              global_epoch)
+        ce_loss = average_loss_in_epoch
        global_epoch += 1
+    end_time = time.time()
+    epoch_time = (end_time - start_time) / global_epoch 
+    print("kpis\teach_epoch_duration_frame%s_card%s\t%s" %
+            (hparams.outputs_per_step, n_trainers, epoch_time))
+    print("kpis\ttrain_cost_frame%s_card%s\t%f" %
+            (hparams.outputs_per_step, n_trainers, ce_loss))