# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
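"""Training entry point for the Sequence Semantic Retrieval (SSR) model.

Example usage (a sketch; flags mirror the argparse defaults below):
    python train.py --train_dir train_data --vocab_path vocab.txt \
        --use_cuda 0 --epochs 10
"""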
import os
import sys
import time
import argparse
import logging
import paddle.fluid as fluid
import paddle
import utils
import numpy as np
from nets import SequenceSemanticRetrieval

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser("sequence semantic retrieval")
    parser.add_argument(
        "--train_dir", type=str, default='train_data', help="Training file")
    parser.add_argument(
        "--base_lr", type=float, default=0.01, help="learning rate")
    parser.add_argument(
        '--vocab_path', type=str, default='vocab.txt', help='vocab file')
    parser.add_argument(
        "--epochs", type=int, default=10, help="Number of epochs")
    parser.add_argument(
        '--parallel', type=int, default=0, help='whether to use ParallelExecutor')
    parser.add_argument(
        '--use_cuda', type=int, default=0, help='whether to use GPU')
    parser.add_argument(
        '--print_batch', type=int, default=10, help='log metrics every N batches')
    parser.add_argument(
        '--model_dir', type=str, default='model_output', help='model dir')
    parser.add_argument(
        "--hidden_size", type=int, default=128, help="hidden size")
    parser.add_argument(
        "--batch_size", type=int, default=50, help="number of batch")
    parser.add_argument(
        "--embedding_dim", type=int, default=128, help="embedding dim")
    parser.add_argument(
        '--num_devices', type=int, default=1, help='Number of GPU devices')
    parser.add_argument(
        '--step_num', type=int, default=1000, help='Number of steps')
    parser.add_argument(
        '--enable_ce',
        action='store_true',
        help='If set, run the task with continuous evaluation logs.')
    return parser.parse_args()


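# Device count used to scale the global batch size in train().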
def get_cards(args):
    return args.num_devices


def train(args):
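    # Fix random seeds so continuous-evaluation (CE) runs are reproducible.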
    if args.enable_ce:
        SEED = 102
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
    use_cuda = bool(args.use_cuda)
    parallel = bool(args.parallel)
    print("use_cuda:", use_cuda, "parallel:", parallel)
    # Global batch size = per-device batch size * number of devices.
    train_reader, vocab_size = utils.construct_train_data(
        args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
                                    args.hidden_size)
    # Build the train program; returns input vars, positive score, loss, acc
    train_input_data, cos_pos, avg_cost, acc = ssr.train()

    # Optimizer to minimize the loss
    optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    optimizer.minimize(avg_cost)

    data_list = [var.name for var in train_input_data]
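    # Map each tuple produced by the reader onto the program's input slots.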
    feeder = fluid.DataFeeder(feed_list=data_list, place=place)
    exe = fluid.Executor(place)
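    # Run the startup program once; optionally wrap the main program in a
    # ParallelExecutor for multi-device training.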
    exe.run(fluid.default_startup_program())
    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe

    total_time = 0.0
    ce_info = []
    for pass_id in range(args.epochs):
        epoch_idx = pass_id + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        i = 0
        for batch_id, data in enumerate(train_reader()):
            i += 1
            loss_val, correct_val = train_exe.run(
                feed=feeder.feed(data), fetch_list=[avg_cost.name, acc.name])
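            # Record per-batch accuracy (correct count / batch size) for CE.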
            ce_info.append(float(np.mean(correct_val)) / args.batch_size)
            if i % args.print_batch == 0:
                logger.info(
                    "Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
                    format(pass_id, batch_id,
                           np.mean(loss_val),
                           float(np.mean(correct_val)) / args.batch_size))
            if args.enable_ce and i > args.step_num:
                break
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
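        # Save a full copy of the model parameters after every epoch.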
        save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
        fluid.io.save_params(executor=exe, dirname=save_dir)
        print("model saved in %s" % save_dir)

    # only for ce
    if args.enable_ce:
        ce_acc = 0
        try:
            ce_acc = ce_info[-2]
        except IndexError:
            print("ce info error")
        epoch_idx = args.epochs
        device = get_device(args)
        if args.use_cuda:
            gpu_num = device[1]
            print("kpis\teach_pass_duration_gpu%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
        else:
            cpu_num = device[1]
            threads_num = device[2]
            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, ce_acc))

def get_device(args):
    # Device layout comes from environment variables: CUDA_VISIBLE_DEVICES
    # for GPUs, CPU_NUM / NUM_THREADS for CPU training.
    if args.use_cuda:
        gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
        gpu_num = len(gpus.split(','))
        return "gpu", gpu_num
    else:
        threads_num = os.environ.get("NUM_THREADS", 1)
        cpu_num = os.environ.get("CPU_NUM", 1)
        return "cpu", int(cpu_num), int(threads_num)

def main():
    args = parse_args()
    train(args)


if __name__ == "__main__":
    main()