train.py 7.5 KB
Newer Older
Q
Qiao Longfei 已提交
1 2 3 4
from __future__ import print_function
import argparse
import logging
import os
5
import time
Z
zhangwenhui03 已提交
6 7
import math
import random
8
import numpy as np
Q
Qiao Longfei 已提交
9 10
import paddle
import paddle.fluid as fluid
J
JiabinYang 已提交
11
import six
Q
Qiao Longfei 已提交
12
import reader
Z
zhangwenhui03 已提交
13
from net import skip_gram_word2vec
Q
Qiao Longfei 已提交
14 15 16 17 18 19 20

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def parse_args():
J
JiabinYang 已提交
21 22
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Word2vec example")
Q
Qiao Longfei 已提交
23
    parser.add_argument(
Z
zhangwenhui03 已提交
24
        '--train_data_dir',
Q
Qiao Longfei 已提交
25
        type=str,
Z
zhangwenhui03 已提交
26
        default='./data/text',
J
JiabinYang 已提交
27
        help="The path of taining dataset")
Z
zhangwenhui03 已提交
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
    parser.add_argument(
        '--base_lr',
        type=float,
        default=0.01,
        help="The number of learing rate (default: 0.01)")
    parser.add_argument(
        '--save_step',
        type=int,
        default=500000,
        help="The number of step to save (default: 500000)")
    parser.add_argument(
        '--print_batch',
        type=int,
        default=10,
        help="The number of print_batch (default: 10)")
Q
Qiao Longfei 已提交
43 44 45
    parser.add_argument(
        '--dict_path',
        type=str,
46
        default='./data/1-billion_dict',
Q
Qiao Longfei 已提交
47
        help="The path of data dict")
Q
Qiao Longfei 已提交
48 49 50
    parser.add_argument(
        '--batch_size',
        type=int,
Z
zhangwenhui03 已提交
51 52
        default=500,
        help="The size of mini-batch (default:500)")
Q
Qiao Longfei 已提交
53 54 55 56 57 58 59 60 61 62
    parser.add_argument(
        '--num_passes',
        type=int,
        default=10,
        help="The number of passes to train (default: 10)")
    parser.add_argument(
        '--model_output_dir',
        type=str,
        default='models',
        help='The path for model to store (default: models)')
Z
zhangwenhui03 已提交
63
    parser.add_argument('--nce_num', type=int, default=5, help='nce_num')
Q
Qiao Longfei 已提交
64 65 66 67 68
    parser.add_argument(
        '--embedding_size',
        type=int,
        default=64,
        help='sparse feature hashing space for index processing')
T
tangwei12 已提交
69 70
    parser.add_argument(
        '--is_sparse',
T
tangwei12 已提交
71 72
        action='store_true',
        required=False,
T
tangwei12 已提交
73 74
        default=False,
        help='embedding and nce will use sparse or not, (default: False)')
75 76 77 78 79 80
    parser.add_argument(
        '--with_speed',
        action='store_true',
        required=False,
        default=False,
        help='print speed or not , (default: False)')
Q
Qiao Longfei 已提交
81 82 83
    return parser.parse_args()


Z
zhangwenhui03 已提交
84
def convert_python_to_tensor(weight, batch_size, sample_reader):
J
JiabinYang 已提交
85
    def __reader__():
Z
zhangwenhui03 已提交
86 87
        cs = np.array(weight).cumsum()
        result = [[], []]
J
JiabinYang 已提交
88 89 90 91 92 93 94 95 96 97 98 99 100 101
        for sample in sample_reader():
            for i, fea in enumerate(sample):
                result[i].append(fea)
            if len(result[0]) == batch_size:
                tensor_result = []
                for tensor in result:
                    t = fluid.Tensor()
                    dat = np.array(tensor, dtype='int64')
                    if len(dat.shape) > 2:
                        dat = dat.reshape((dat.shape[0], dat.shape[2]))
                    elif len(dat.shape) == 1:
                        dat = dat.reshape((-1, 1))
                    t.set(dat, fluid.CPUPlace())
                    tensor_result.append(t)
Z
zhangwenhui03 已提交
102 103 104 105 106 107 108
                tt = fluid.Tensor()
                neg_array = cs.searchsorted(np.random.sample(args.nce_num))
                neg_array = np.tile(neg_array, batch_size)
                tt.set(
                    neg_array.reshape((batch_size, args.nce_num)),
                    fluid.CPUPlace())
                tensor_result.append(tt)
J
JiabinYang 已提交
109
                yield tensor_result
Z
zhangwenhui03 已提交
110
                result = [[], []]
J
JiabinYang 已提交
111 112 113 114

    return __reader__


Z
zhangwenhui03 已提交
115 116
def train_loop(args, train_program, reader, py_reader, loss, trainer_id,
               weight):
Q
Qiao Longfei 已提交
117

J
JiabinYang 已提交
118
    py_reader.decorate_tensor_provider(
Z
zhangwenhui03 已提交
119
        convert_python_to_tensor(weight, args.batch_size, reader.train()))
120 121

    place = fluid.CPUPlace()
Q
Qiao Longfei 已提交
122 123
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
124 125

    exec_strategy = fluid.ExecutionStrategy()
J
JiabinYang 已提交
126
    exec_strategy.use_experimental_executor = True
T
tangwei12 已提交
127

J
JiabinYang 已提交
128 129
    print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
    exec_strategy.num_threads = int(os.getenv("CPU_NUM"))
T
tangwei12 已提交
130

131
    build_strategy = fluid.BuildStrategy()
J
JiabinYang 已提交
132 133
    if int(os.getenv("CPU_NUM")) > 1:
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
134 135 136 137 138 139 140 141

    train_exe = fluid.ParallelExecutor(
        use_cuda=False,
        loss_name=loss.name,
        main_program=train_program,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

Q
Qiao Longfei 已提交
142
    for pass_id in range(args.num_passes):
143
        py_reader.start()
J
JiabinYang 已提交
144 145
        time.sleep(10)
        epoch_start = time.time()
146
        batch_id = 0
147
        start = time.time()
148 149
        try:
            while True:
150

151 152 153
                loss_val = train_exe.run(fetch_list=[loss.name])
                loss_val = np.mean(loss_val)

Z
zhangwenhui03 已提交
154
                if batch_id % args.print_batch == 0:
J
JiabinYang 已提交
155 156 157
                    logger.info(
                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
                        format(pass_id, batch_id,
J
JiabinYang 已提交
158
                               loss_val.mean(), py_reader.queue.size()))
159
                if args.with_speed:
Z
zhangwenhui03 已提交
160
                    if batch_id % 500 == 0 and batch_id != 0:
161 162
                        elapsed = (time.time() - start)
                        start = time.time()
163 164 165 166
                        samples = 1001 * args.batch_size * int(
                            os.getenv("CPU_NUM"))
                        logger.info("Time used: {}, Samples/Sec: {}".format(
                            elapsed, samples / elapsed))
Z
zhangwenhui03 已提交
167 168 169 170 171 172 173

                if batch_id % args.save_step == 0 and batch_id != 0:
                    model_dir = args.model_output_dir + '/pass-' + str(
                        pass_id) + ('/batch-' + str(batch_id))
                    if trainer_id == 0:
                        fluid.io.save_params(executor=exe, dirname=model_dir)
                        print("model saved in %s" % model_dir)
174 175 176 177 178
                batch_id += 1

        except fluid.core.EOFException:
            py_reader.reset()
            epoch_end = time.time()
179
            logger.info("Epoch: {0}, Train total expend: {1} ".format(
J
JiabinYang 已提交
180
                pass_id, epoch_end - epoch_start))
181 182
            model_dir = args.model_output_dir + '/pass-' + str(pass_id)
            if trainer_id == 0:
Z
zhangwenhui03 已提交
183 184
                fluid.io.save_params(executor=exe, dirname=model_dir)
                print("model saved in %s" % model_dir)
Q
Qiao Longfei 已提交
185 186


187 188 189 190 191
def GetFileList(data_path):
    return os.listdir(data_path)


def train(args):
Q
Qiao Longfei 已提交
192 193 194 195

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

Z
zhangwenhui03 已提交
196 197 198
    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)
199 200

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
Z
zhangwenhui03 已提交
201 202
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()
Z
zhangwenhui03 已提交
203

204
    loss, py_reader = skip_gram_word2vec(
J
JiabinYang 已提交
205 206
        word2vec_reader.dict_size,
        args.embedding_size,
Z
zhangwenhui03 已提交
207 208
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)
J
JiabinYang 已提交
209

Z
zhangwenhui03 已提交
210 211 212 213 214 215
    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))
216

Q
Qiao Longfei 已提交
217 218
    optimizer.minimize(loss)

219
    # do local training 
Z
zhangwenhui03 已提交
220 221 222
    logger.info("run local training")
    main_program = fluid.default_main_program()
    train_loop(args, main_program, word2vec_reader, py_reader, loss, 0,
Z
zhangwenhui03 已提交
223
               id_frequencys_pow)
J
JiabinYang 已提交
224 225


Q
Qiao Longfei 已提交
226
if __name__ == '__main__':
F
frankwhzhang 已提交
227
    utils.check_version()
228 229
    args = parse_args()
    train(args)