Merge with develop branch (#1056)

93756632 · Yibing Liu · GitHub · c19f7ac1 · 93756632 · 93756632
7 changed file
--- a/fluid/DeepASR/.gitignore
+++ b/fluid/DeepASR/.gitignore
+.idea
--- a/fluid/DeepASR/data_utils/util.py
+++ b/fluid/DeepASR/data_utils/util.py
@@ -25,16 +25,6 @@ def to_lodtensor(data, place):
    return res
-def lodtensor_to_ndarray(lod_tensor):
-    """conver lodtensor to ndarray
-    """
-    dims = lod_tensor._get_dims()
-    ret = np.zeros(shape=dims).astype('float32')
-    for i in xrange(np.product(dims)):
-        ret.ravel()[i] = lod_tensor.get_float_element(i)
-    return ret, lod_tensor.lod()
 def split_infer_result(infer_seq, lod):
    infer_batch = []
    for i in xrange(0, len(lod[0]) - 1):

--- a/fluid/DeepASR/decoder/.gitignore
+++ b/fluid/DeepASR/decoder/.gitignore
+ThreadPool
+build
+post_latgen_faster_mapped.so
+pybind11
--- a/fluid/DeepASR/examples/aishell/.gitignore
+++ b/fluid/DeepASR/examples/aishell/.gitignore
+aux.tar.gz
+aux
+data
+checkpoints
--- a/fluid/DeepASR/examples/aishell/train.sh
+++ b/fluid/DeepASR/examples/aishell/train.sh
@@ -7,7 +7,8 @@ python -u ../../train.py --train_feature_lst data/train_feature.lst \
                   --checkpoints checkpoints \
                   --frame_dim 80  \
                   --class_num 3040  \
+                   --print_per_batches 100 \
                   --infer_models '' \
-                   --batch_size 64 \
+                   --batch_size 16 \
                   --learning_rate 6.4e-5 \
                   --parallel
--- a/fluid/DeepASR/model_utils/model.py
+++ b/fluid/DeepASR/model_utils/model.py
@@ -5,14 +5,16 @@ from __future__ import print_function
 import paddle.fluid as fluid
-def stacked_lstmp_model(frame_dim,
+def stacked_lstmp_model(feature,
+                        label,
                        hidden_dim,
                        proj_dim,
                        stacked_num,
                        class_num,
                        parallel=False,
                        is_train=True):
-    """ The model for DeepASR. The main structure is composed of stacked 
+    """ 
+    The model for DeepASR. The main structure is composed of stacked 
    identical LSTMP (LSTM with recurrent projection) layers.
    When running in training and validation phase, the feeding dictionary
@@ -28,9 +30,6 @@ def stacked_lstmp_model(frame_dim,
        is_train(bool): Run in training phase or not, default `True`.
        class_dim(int): The number of output classes.
    """
-    # network configuration
-    def _net_conf(feature, label):
    conv1 = fluid.layers.conv2d(
        input=feature,
        num_filters=32,
@@ -73,35 +72,3 @@ def stacked_lstmp_model(frame_dim,
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return prediction, avg_cost, acc
-    # data feeder
-    feature = fluid.layers.data(
-        name="feature",
-        shape=[-1, 3, 11, frame_dim],
-        dtype="float32",
-        lod_level=1)
-    label = fluid.layers.data(
-        name="label", shape=[-1, 1], dtype="int64", lod_level=1)
-    if parallel:
-        # When the execution place is specified to CUDAPlace, the program will
-        # run on all $CUDA_VISIBLE_DEVICES GPUs. Otherwise the program will 
-        # run on all CPU devices.
-        places = fluid.layers.device.get_places()
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            feat_ = pd.read_input(feature)
-            label_ = pd.read_input(label)
-            prediction, avg_cost, acc = _net_conf(feat_, label_)
-            for out in [prediction, avg_cost, acc]:
-                pd.write_output(out)
-        # get mean loss and acc through every devices.
-        prediction, avg_cost, acc = pd()
-        prediction.stop_gradient = True
-        avg_cost = fluid.layers.mean(x=avg_cost)
-        acc = fluid.layers.mean(x=acc)
-    else:
-        prediction, avg_cost, acc = _net_conf(feature, label)
-    return prediction, avg_cost, acc
--- a/fluid/DeepASR/train.py
+++ b/fluid/DeepASR/train.py
@@ -14,7 +14,6 @@ import data_utils.augmentor.trans_add_delta as trans_add_delta
 import data_utils.augmentor.trans_splice as trans_splice
 import data_utils.augmentor.trans_delay as trans_delay
 import data_utils.async_data_reader as reader
-from data_utils.util import lodtensor_to_ndarray
 from model_utils.model import stacked_lstmp_model
@@ -24,7 +23,8 @@ def parse_args():
        '--batch_size',
        type=int,
        default=32,
-        help='The sequence number of a batch data. (default: %(default)d)')
+        help='The sequence number of a batch data. Batch size per GPU. (default: %(default)d)'
+    )
    parser.add_argument(
        '--minimum_batch_size',
        type=int,
@@ -147,18 +147,26 @@ def train(args):
    if args.infer_models != '' and not os.path.exists(args.infer_models):
        os.mkdir(args.infer_models)
+    train_program = fluid.Program()
+    train_startup = fluid.Program()
+    with fluid.program_guard(train_program, train_startup):
+        with fluid.unique_name.guard():
+            py_train_reader = fluid.layers.py_reader(
+                capacity=10,
+                shapes=([-1, 3, 11, args.frame_dim], [-1, 1]),
+                dtypes=['float32', 'int64'],
+                lod_levels=[1, 1],
+                name='train_reader')
+            feature, label = fluid.layers.read_file(py_train_reader)
            prediction, avg_cost, accuracy = stacked_lstmp_model(
-        frame_dim=args.frame_dim,
+                feature=feature,
+                label=label,
                hidden_dim=args.hidden_dim,
                proj_dim=args.proj_dim,
                stacked_num=args.stacked_num,
-        class_num=args.class_num,
+                class_num=args.class_num)
-        parallel=args.parallel)
+            # optimizer = fluid.optimizer.Momentum(learning_rate=args.learning_rate, momentum=0.9)
-    # program for test
-    test_program = fluid.default_main_program().clone()
-    #optimizer = fluid.optimizer.Momentum(learning_rate=args.learning_rate, momentum=0.9)
            optimizer = fluid.optimizer.Adam(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=args.learning_rate,
@@ -166,10 +174,45 @@ def train(args):
                    decay_rate=1 / 1.2,
                    staircase=True))
            optimizer.minimize(avg_cost)
+            fluid.memory_optimize(train_program)
+    test_program = fluid.Program()
+    test_startup = fluid.Program()
+    with fluid.program_guard(test_program, test_startup):
+        with fluid.unique_name.guard():
+            py_test_reader = fluid.layers.py_reader(
+                capacity=10,
+                shapes=([-1, 3, 11, args.frame_dim], [-1, 1]),
+                dtypes=['float32', 'int64'],
+                lod_levels=[1, 1],
+                name='test_reader')
+            feature, label = fluid.layers.read_file(py_test_reader)
+            prediction, avg_cost, accuracy = stacked_lstmp_model(
+                feature=feature,
+                label=label,
+                hidden_dim=args.hidden_dim,
+                proj_dim=args.proj_dim,
+                stacked_num=args.stacked_num,
+                class_num=args.class_num)
+    test_program = test_program.clone(for_test=True)
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
+    exe.run(train_startup)
+    exe.run(test_startup)
+    if args.parallel:
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.num_iteration_per_drop_scope = 10
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=(args.device == 'GPU'),
+            loss_name=avg_cost.name,
+            exec_strategy=exec_strategy,
+            main_program=train_program)
+        test_exe = fluid.ParallelExecutor(
+            use_cuda=(args.device == 'GPU'),
+            main_program=test_program,
+            exec_strategy=exec_strategy,
+            share_vars_from=train_exe)
    # resume training if initial model provided.
    if args.init_model_path is not None:
@@ -181,15 +224,24 @@ def train(args):
        trans_splice.TransSplice(5, 5), trans_delay.TransDelay(5)
    ]
-    feature_t = fluid.LoDTensor()
+    # bind train_reader
-    label_t = fluid.LoDTensor()
+    train_data_reader = reader.AsyncDataReader(
+        args.train_feature_lst,
+        args.train_label_lst,
+        -1,
+        split_sentence_threshold=1024)
-    # validation
+    train_data_reader.set_transformers(ltrans)
-    def test(exe):
-        # If test data not found, return invalid cost and accuracy
+    def train_data_provider():
-        if not (os.path.exists(args.val_feature_lst) and
+        for data in train_data_reader.batch_iterator(args.batch_size,
+                                                     args.minimum_batch_size):
+            yield batch_data_to_lod_tensors(args, data, fluid.CPUPlace())
+    py_train_reader.decorate_tensor_provider(train_data_provider)
+    if (os.path.exists(args.val_feature_lst) and
            os.path.exists(args.val_label_lst)):
-            return -1.0, -1.0
        # test data reader
        test_data_reader = reader.AsyncDataReader(
            args.val_feature_lst,
@@ -197,86 +249,101 @@ def train(args):
            -1,
            split_sentence_threshold=1024)
        test_data_reader.set_transformers(ltrans)
-        test_costs, test_accs = [], []
-        for batch_id, batch_data in enumerate(
-                test_data_reader.batch_iterator(args.batch_size,
-                                                args.minimum_batch_size)):
-            # load_data
-            (features, labels, lod, _) = batch_data
-            features = np.reshape(features, (-1, 11, 3, args.frame_dim))
-            features = np.transpose(features, (0, 2, 1, 3))
-            feature_t.set(features, place)
-            feature_t.set_lod([lod])
-            label_t.set(labels, place)
-            label_t.set_lod([lod])
-            cost, acc = exe.run(test_program,
+        def test_data_provider():
-                                feed={"feature": feature_t,
+            for data in test_data_reader.batch_iterator(
-                                      "label": label_t},
+                    args.batch_size, args.minimum_batch_size):
+                yield batch_data_to_lod_tensors(args, data, fluid.CPUPlace())
+        py_test_reader.decorate_tensor_provider(test_data_provider)
+    # validation
+    def test(exe):
+        # If test data not found, return invalid cost and accuracy
+        if not (os.path.exists(args.val_feature_lst) and
+                os.path.exists(args.val_label_lst)):
+            return -1.0, -1.0
+        batch_id = 0
+        test_costs = []
+        test_accs = []
+        while True:
+            if batch_id == 0:
+                py_test_reader.start()
+            try:
+                if args.parallel:
+                    cost, acc = exe.run(
+                        fetch_list=[avg_cost.name, accuracy.name],
+                        return_numpy=False)
+                else:
+                    cost, acc = exe.run(program=test_program,
                                        fetch_list=[avg_cost, accuracy],
                                        return_numpy=False)
-            test_costs.append(lodtensor_to_ndarray(cost)[0])
+                sys.stdout.write('.')
-            test_accs.append(lodtensor_to_ndarray(acc)[0])
+                sys.stdout.flush()
+                test_costs.append(np.array(cost)[0])
+                test_accs.append(np.array(acc)[0])
+                batch_id += 1
+            except fluid.core.EOFException:
+                py_test_reader.reset()
+                break
        return np.mean(test_costs), np.mean(test_accs)
-    # train data reader
-    train_data_reader = reader.AsyncDataReader(
-        args.train_feature_lst,
-        args.train_label_lst,
-        -1,
-        split_sentence_threshold=1024)
-    train_data_reader.set_transformers(ltrans)
    # train
    for pass_id in xrange(args.pass_num):
        pass_start_time = time.time()
-        for batch_id, batch_data in enumerate(
+        batch_id = 0
-                train_data_reader.batch_iterator(args.batch_size,
+        while True:
-                                                 args.minimum_batch_size)):
+            if batch_id == 0:
-            # load_data
+                py_train_reader.start()
-            (features, labels, lod, name_lst) = batch_data
-            features = np.reshape(features, (-1, 11, 3, args.frame_dim))
-            features = np.transpose(features, (0, 2, 1, 3))
-            feature_t.set(features, place)
-            feature_t.set_lod([lod])
-            label_t.set(labels, place)
-            label_t.set_lod([lod])
            to_print = batch_id > 0 and (batch_id % args.print_per_batches == 0)
-            outs = exe.run(fluid.default_main_program(),
+            try:
-                           feed={"feature": feature_t,
+                if args.parallel:
-                                 "label": label_t},
+                    outs = train_exe.run(
-                           fetch_list=[avg_cost, accuracy] if to_print else [],
+                        fetch_list=[avg_cost.name, accuracy.name]
+                        if to_print else [],
+                        return_numpy=False)
+                else:
+                    outs = exe.run(program=train_program,
+                                   fetch_list=[avg_cost, accuracy]
+                                   if to_print else [],
                                   return_numpy=False)
+            except fluid.core.EOFException:
+                py_train_reader.reset()
+                break
            if to_print:
+                if args.parallel:
                    print("\nBatch %d, train cost: %f, train acc: %f" %
-                      (batch_id, lodtensor_to_ndarray(outs[0])[0],
+                          (batch_id, np.mean(outs[0]), np.mean(outs[1])))
-                       lodtensor_to_ndarray(outs[1])[0]))
+                else:
+                    print("\nBatch %d, train cost: %f, train acc: %f" % (
+                        batch_id, np.array(outs[0])[0], np.array(outs[1])[0]))
                # save the latest checkpoint
                if args.checkpoints != '':
                    model_path = os.path.join(args.checkpoints,
                                              "deep_asr.latest.checkpoint")
-                    fluid.io.save_persistables(exe, model_path)
+                    fluid.io.save_persistables(exe, model_path, train_program)
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
+            batch_id += 1
        # run test
-        val_cost, val_acc = test(exe)
+        val_cost, val_acc = test(test_exe if args.parallel else exe)
        # save checkpoint per pass
        if args.checkpoints != '':
            model_path = os.path.join(
                args.checkpoints,
                "deep_asr.pass_" + str(pass_id) + ".checkpoint")
-            fluid.io.save_persistables(exe, model_path)
+            fluid.io.save_persistables(exe, model_path, train_program)
        # save inference model
        if args.infer_models != '':
            model_path = os.path.join(
                args.infer_models,
                "deep_asr.pass_" + str(pass_id) + ".infer.model")
            fluid.io.save_inference_model(model_path, ["feature"],
-                                          [prediction], exe)
+                                          [prediction], exe, train_program)
        # cal pass time
        pass_end_time = time.time()
        time_consumed = pass_end_time - pass_start_time
@@ -285,6 +352,19 @@ def train(args):
              (pass_id, time_consumed, val_cost, val_acc))
+def batch_data_to_lod_tensors(args, batch_data, place):
+    features, labels, lod, name_lst = batch_data
+    features = np.reshape(features, (-1, 11, 3, args.frame_dim))
+    features = np.transpose(features, (0, 2, 1, 3))
+    feature_t = fluid.LoDTensor()
+    label_t = fluid.LoDTensor()
+    feature_t.set(features, place)
+    feature_t.set_lod([lod])
+    label_t.set(labels, place)
+    label_t.set_lod([lod])
+    return feature_t, label_t
 if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)