# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import argparse
import logging
import paddle.fluid as fluid
import paddle
import utils
import numpy as np
from nets import SequenceSemanticRetrieval

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def parse_args():
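    """Parse command-line arguments for SSR training."""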
    parser = argparse.ArgumentParser("sequence semantic retrieval")
    parser.add_argument(
        "--train_dir", type=str, default='train_data', help="Training file")
    parser.add_argument(
        "--base_lr", type=float, default=0.01, help="learning rate")
    parser.add_argument(
        '--vocab_path', type=str, default='vocab.txt', help='vocab file')
    parser.add_argument(
        "--epochs", type=int, default=10, help="Number of epochs")
    parser.add_argument(
        '--parallel', type=int, default=0, help='whether to train in parallel on multiple devices')
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether to use GPU')
    parser.add_argument(
        '--print_batch', type=int, default=10, help='log every N batches')
    parser.add_argument(
        '--model_dir', type=str, default='model_output', help='model dir')
    parser.add_argument(
        "--hidden_size", type=int, default=128, help="hidden size")
    parser.add_argument(
        "--batch_size", type=int, default=50, help="number of batch")
    parser.add_argument(
        "--embedding_dim", type=int, default=128, help="embedding dim")
    parser.add_argument(
        '--num_devices', type=int, default=1, help='Number of GPU devices')
    parser.add_argument(
        '--step_num', type=int, default=1000, help='Number of steps')
    parser.add_argument(
        '--enable_ce',
        action='store_true',
        help='If set, run the task with continuous evaluation logs.')
    return parser.parse_args()


def get_cards(args):
    return args.num_devices


def train(args):
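    """Build the SSR network and run the training loop."""
    # Fix random seeds under continuous evaluation (CE) so runs are reproducible.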
    if args.enable_ce:
        SEED = 102
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
    use_cuda = bool(args.use_cuda)
    parallel = bool(args.parallel)
    print("use_cuda:", use_cuda, "parallel:", parallel)
    train_reader, vocab_size = utils.construct_train_data(
        args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
                                    args.hidden_size)
    # Build the training program: input layers, positive-pair cosine similarity, loss and accuracy
    train_input_data, cos_pos, avg_cost, acc = ssr.train()

    # Optimizer to minimize the loss
    optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    optimizer.minimize(avg_cost)

    data_list = [var.name for var in train_input_data]
    feeder = fluid.DataFeeder(feed_list=data_list, place=place)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
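    # Use a ParallelExecutor for multi-device training; otherwise fall back to
    # the single-place Executor created above.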
    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe

    total_time = 0.0
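    # Per-batch accuracy, recorded for the CE KPIs printed after training.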
    ce_info = []
    for pass_id in range(args.epochs):
        epoch_idx = pass_id + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        i = 0
        for batch_id, data in enumerate(train_reader()):
            i += 1
            loss_val, correct_val = train_exe.run(
                feed=feeder.feed(data), fetch_list=[avg_cost.name, acc.name])
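            # 'acc' appears to hold the count of correct predictions per batch,
            # hence the division by batch size below.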
            ce_info.append(float(np.mean(correct_val)) / args.batch_size)
            if i % args.print_batch == 0:
                logger.info(
                    "Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
                    format(pass_id, batch_id,
                           np.mean(loss_val),
                           float(np.mean(correct_val)) / args.batch_size))
            if args.enable_ce and i > args.step_num:
                break
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
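        # Persist all persistable parameters at the end of every epoch.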
        save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
        fluid.io.save_params(executor=exe, dirname=save_dir)
        print("model saved in %s" % save_dir)

    # Emit KPI lines only when continuous evaluation (CE) is enabled
    if args.enable_ce:
        ce_acc = 0
        try:
            ce_acc = ce_info[-2]
        except IndexError:
            print("ce info error")
        epoch_idx = args.epochs
        device = get_device(args)
        if args.use_cuda:
            gpu_num = device[1]
            print("kpis\teach_pass_duration_gpu%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
        else:
            cpu_num = device[1]
            threads_num = device[2]
            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, ce_acc))


def get_device(args):
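    """Return ("gpu", num_gpus) or ("cpu", num_cpus, num_threads) based on environment variables."""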
    if args.use_cuda:
        # The default must be a string: os.environ.get may return the default
        # unchanged, and it is split on ',' below.
        gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "1")
        gpu_num = len(gpus.split(','))
        return "gpu", gpu_num
    else:
        threads_num = os.environ.get('NUM_THREADS', 1)
        cpu_num = os.environ.get('CPU_NUM', 1)
        return "cpu", int(cpu_num), int(threads_num)


def main():
    args = parse_args()
    train(args)


if __name__ == "__main__":
    utils.check_version()
    main()