train.py 7.9 KB
Newer Older
Q
Qiao Longfei 已提交
1 2 3 4
from __future__ import print_function
import argparse
import logging
import os
5
import time
Z
zhangwenhui03 已提交
6 7
import math
import random
8
import numpy as np
Q
Qiao Longfei 已提交
9 10
import paddle
import paddle.fluid as fluid
J
JiabinYang 已提交
11
import six
Q
Qiao Longfei 已提交
12
import reader
Z
zhangwenhui03 已提交
13
from net import skip_gram_word2vec
Q
Qiao Longfei 已提交
14

Z
zhang wenhui 已提交
15 16 17 18 19 20
import utils
import sys
if six.PY2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

Q
Qiao Longfei 已提交
21 22 23 24 25 26
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def parse_args():
J
JiabinYang 已提交
27 28
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Word2vec example")
Q
Qiao Longfei 已提交
29
    parser.add_argument(
Z
zhangwenhui03 已提交
30
        '--train_data_dir',
Q
Qiao Longfei 已提交
31
        type=str,
Z
zhangwenhui03 已提交
32
        default='./data/text',
J
JiabinYang 已提交
33
        help="The path of taining dataset")
Z
zhangwenhui03 已提交
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
    parser.add_argument(
        '--base_lr',
        type=float,
        default=0.01,
        help="The number of learing rate (default: 0.01)")
    parser.add_argument(
        '--save_step',
        type=int,
        default=500000,
        help="The number of step to save (default: 500000)")
    parser.add_argument(
        '--print_batch',
        type=int,
        default=10,
        help="The number of print_batch (default: 10)")
Q
Qiao Longfei 已提交
49 50 51
    parser.add_argument(
        '--dict_path',
        type=str,
52
        default='./data/1-billion_dict',
Q
Qiao Longfei 已提交
53
        help="The path of data dict")
Q
Qiao Longfei 已提交
54 55 56
    parser.add_argument(
        '--batch_size',
        type=int,
Z
zhangwenhui03 已提交
57 58
        default=500,
        help="The size of mini-batch (default:500)")
Q
Qiao Longfei 已提交
59 60 61 62 63 64 65 66 67 68
    parser.add_argument(
        '--num_passes',
        type=int,
        default=10,
        help="The number of passes to train (default: 10)")
    parser.add_argument(
        '--model_output_dir',
        type=str,
        default='models',
        help='The path for model to store (default: models)')
Z
zhangwenhui03 已提交
69
    parser.add_argument('--nce_num', type=int, default=5, help='nce_num')
Q
Qiao Longfei 已提交
70 71 72 73 74
    parser.add_argument(
        '--embedding_size',
        type=int,
        default=64,
        help='sparse feature hashing space for index processing')
T
tangwei12 已提交
75 76
    parser.add_argument(
        '--is_sparse',
T
tangwei12 已提交
77 78
        action='store_true',
        required=False,
T
tangwei12 已提交
79 80
        default=False,
        help='embedding and nce will use sparse or not, (default: False)')
81 82 83 84 85 86
    parser.add_argument(
        '--with_speed',
        action='store_true',
        required=False,
        default=False,
        help='print speed or not , (default: False)')
M
mmglove 已提交
87 88 89
    parser.add_argument(
        '--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')

Q
Qiao Longfei 已提交
90 91 92
    return parser.parse_args()


Z
zhangwenhui03 已提交
93
def convert_python_to_tensor(weight, batch_size, sample_reader):
J
JiabinYang 已提交
94
    def __reader__():
Z
zhangwenhui03 已提交
95 96
        cs = np.array(weight).cumsum()
        result = [[], []]
J
JiabinYang 已提交
97 98 99 100 101 102 103 104 105 106 107 108 109 110
        for sample in sample_reader():
            for i, fea in enumerate(sample):
                result[i].append(fea)
            if len(result[0]) == batch_size:
                tensor_result = []
                for tensor in result:
                    t = fluid.Tensor()
                    dat = np.array(tensor, dtype='int64')
                    if len(dat.shape) > 2:
                        dat = dat.reshape((dat.shape[0], dat.shape[2]))
                    elif len(dat.shape) == 1:
                        dat = dat.reshape((-1, 1))
                    t.set(dat, fluid.CPUPlace())
                    tensor_result.append(t)
Z
zhangwenhui03 已提交
111 112 113 114 115 116 117
                tt = fluid.Tensor()
                neg_array = cs.searchsorted(np.random.sample(args.nce_num))
                neg_array = np.tile(neg_array, batch_size)
                tt.set(
                    neg_array.reshape((batch_size, args.nce_num)),
                    fluid.CPUPlace())
                tensor_result.append(tt)
J
JiabinYang 已提交
118
                yield tensor_result
Z
zhangwenhui03 已提交
119
                result = [[], []]
J
JiabinYang 已提交
120 121 122 123

    return __reader__


Z
zhangwenhui03 已提交
124 125
def train_loop(args, train_program, reader, py_reader, loss, trainer_id,
               weight):
Q
Qiao Longfei 已提交
126

J
JiabinYang 已提交
127
    py_reader.decorate_tensor_provider(
Z
zhangwenhui03 已提交
128
        convert_python_to_tensor(weight, args.batch_size, reader.train()))
129 130

    place = fluid.CPUPlace()
Q
Qiao Longfei 已提交
131 132
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
133 134

    exec_strategy = fluid.ExecutionStrategy()
J
JiabinYang 已提交
135
    exec_strategy.use_experimental_executor = True
T
tangwei12 已提交
136

J
JiabinYang 已提交
137 138
    print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
    exec_strategy.num_threads = int(os.getenv("CPU_NUM"))
T
tangwei12 已提交
139

140
    build_strategy = fluid.BuildStrategy()
J
JiabinYang 已提交
141 142
    if int(os.getenv("CPU_NUM")) > 1:
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
143 144 145 146 147 148 149 150

    train_exe = fluid.ParallelExecutor(
        use_cuda=False,
        loss_name=loss.name,
        main_program=train_program,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

Q
Qiao Longfei 已提交
151
    for pass_id in range(args.num_passes):
152
        py_reader.start()
J
JiabinYang 已提交
153 154
        time.sleep(10)
        epoch_start = time.time()
155
        batch_id = 0
156
        start = time.time()
157 158
        try:
            while True:
159

160 161 162
                loss_val = train_exe.run(fetch_list=[loss.name])
                loss_val = np.mean(loss_val)

Z
zhangwenhui03 已提交
163
                if batch_id % args.print_batch == 0:
J
JiabinYang 已提交
164 165 166
                    logger.info(
                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
                        format(pass_id, batch_id,
J
JiabinYang 已提交
167
                               loss_val.mean(), py_reader.queue.size()))
168
                if args.with_speed:
Z
zhangwenhui03 已提交
169
                    if batch_id % 500 == 0 and batch_id != 0:
170 171
                        elapsed = (time.time() - start)
                        start = time.time()
172 173 174 175
                        samples = 1001 * args.batch_size * int(
                            os.getenv("CPU_NUM"))
                        logger.info("Time used: {}, Samples/Sec: {}".format(
                            elapsed, samples / elapsed))
Z
zhangwenhui03 已提交
176 177 178 179 180 181 182

                if batch_id % args.save_step == 0 and batch_id != 0:
                    model_dir = args.model_output_dir + '/pass-' + str(
                        pass_id) + ('/batch-' + str(batch_id))
                    if trainer_id == 0:
                        fluid.io.save_params(executor=exe, dirname=model_dir)
                        print("model saved in %s" % model_dir)
183 184 185 186 187
                batch_id += 1

        except fluid.core.EOFException:
            py_reader.reset()
            epoch_end = time.time()
188
            logger.info("Epoch: {0}, Train total expend: {1} ".format(
J
JiabinYang 已提交
189
                pass_id, epoch_end - epoch_start))
190 191
            model_dir = args.model_output_dir + '/pass-' + str(pass_id)
            if trainer_id == 0:
Z
zhangwenhui03 已提交
192 193
                fluid.io.save_params(executor=exe, dirname=model_dir)
                print("model saved in %s" % model_dir)
Q
Qiao Longfei 已提交
194 195


196 197 198 199 200
def GetFileList(data_path):
    return os.listdir(data_path)


def train(args):
M
mmglove 已提交
201 202 203 204 205
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED
Q
Qiao Longfei 已提交
206 207 208 209

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

Z
zhangwenhui03 已提交
210 211 212
    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)
213 214

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
Z
zhangwenhui03 已提交
215 216
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()
Z
zhangwenhui03 已提交
217

218
    loss, py_reader = skip_gram_word2vec(
J
JiabinYang 已提交
219 220
        word2vec_reader.dict_size,
        args.embedding_size,
Z
zhangwenhui03 已提交
221 222
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)
J
JiabinYang 已提交
223

Z
zhangwenhui03 已提交
224 225 226 227 228 229
    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))
230

Q
Qiao Longfei 已提交
231 232
    optimizer.minimize(loss)

233
    # do local training 
Z
zhangwenhui03 已提交
234 235 236
    logger.info("run local training")
    main_program = fluid.default_main_program()
    train_loop(args, main_program, word2vec_reader, py_reader, loss, 0,
Z
zhangwenhui03 已提交
237
               id_frequencys_pow)
J
JiabinYang 已提交
238 239


Q
Qiao Longfei 已提交
240
if __name__ == '__main__':
Z
zhang wenhui 已提交
241
    utils.check_version()
242 243
    args = parse_args()
    train(args)