#Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os
import sys
import time
import argparse
import unittest
import contextlib
import numpy as np

import paddle.fluid as fluid

import utils, metric, configs
import models

Y
Yibing Liu 已提交
30
from pretrained_word2vec import Glove840B_300D
31 32 33

# Command-line interface of the training script.
parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument(
    '--model_name',
    type=str,
    default='cdssmNet',
    help="Which model to train")
parser.add_argument(
    '--config',
    type=str,
    default='cdssm_base',
    help="The global config setting")
parser.add_argument(
    '--enable_ce',
    action='store_true',
    help='If set, run the task with continuous evaluation logs.')
# No default on purpose: when the flag is omitted the value is None and
# main() keeps the epoch count that comes from the selected config.
parser.add_argument('--epoch_num', type=int, help='Number of epoch')

# Directory in which main() looks for the GloVe embedding file.
DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')

Y
Yibing Liu 已提交
49 50 51

def evaluate(epoch_id, exe, inference_program, dev_reader, test_reader,
             fetch_list, feeder, metric_type):
    """
    Evaluate the model on the dev and test datasets and print the results.

    Args:
        epoch_id: Epoch index shown in the log lines.
        exe: Executor whose `run` method executes `inference_program`.
        inference_program: Program executed for every batch.
        dev_reader: Callable returning an iterable of dev batches.
        test_reader: Callable returning an iterable of test batches.
        fetch_list: Variables to fetch; `exe.run` must return
            (cost, acc, prediction) in that order.
        feeder: Object whose `feed(batch)` builds the feed for `exe.run`.
        metric_type: Iterable of metric names; the supported names are
            'accuracy' and 'accuracy_with_threshold'.

    Raises:
        SystemExit: With status 1 if `metric_type` contains an unknown name.
        ValueError: If a reader yields no samples.
    """
    metric_names = list(metric_type)

    # Fail fast (and with a non-zero status) on a bad metric name instead of
    # discovering it only after a full inference pass.
    for metric_name in metric_names:
        if metric_name not in ('accuracy_with_threshold', 'accuracy'):
            print("Unknown metric type: ", metric_name)
            sys.exit(1)

    def infer(reader):
        """
        Run inference over `reader`; return (average cost, metric results).
        """
        total_cost = 0.0
        total_count = 0
        preds, labels = [], []
        for data in reader():
            avg_cost, _, batch_prediction = exe.run(
                inference_program,
                feed=feeder.feed(data),
                fetch_list=fetch_list,
                return_numpy=True)
            batch_size = len(data)
            # Reduce the fetched cost to a plain float so that the running
            # sum and the "%f" log below never operate on an ndarray.
            batch_cost = float(np.asarray(avg_cost).reshape(-1)[0])
            total_cost += batch_cost * batch_size
            total_count += batch_size
            preds.append(batch_prediction)
            # the label is the last field of every sample
            labels.append(np.asarray([x[-1] for x in data], dtype=np.int64))

        if total_count == 0:
            raise ValueError("evaluation reader yielded no samples")

        y_pred = np.concatenate(preds)
        y_label = np.concatenate(labels)

        metric_res = []
        for metric_name in metric_names:
            if metric_name == 'accuracy_with_threshold':
                metric_res.append((metric_name, metric.accuracy_with_threshold(
                    y_pred, y_label, threshold=0.3)))
            else:
                metric_res.append(
                    (metric_name, metric.accuracy(y_pred, y_label)))
        return total_cost / total_count, metric_res

    def report(split_name, reader):
        """
        Evaluate one split and print its cost and metrics on a single line.
        """
        split_cost, metric_res = infer(reader)
        print("[%s] epoch_id: %d, %s_cost: %f, " % (time.asctime(
            time.localtime(time.time())), epoch_id, split_name, split_cost) +
              ', '.join(
                  [str(name) + ": " + str(value)
                   for name, value in metric_res]))

    report("dev", dev_reader)
    report("test", test_reader)
    print("")


Y
Yibing Liu 已提交
101 102 103
def train_and_evaluate(train_reader, dev_reader, test_reader, network,
                       optimizer, global_config, pretrained_word_embedding,
                       use_cuda, parallel):
    """
    Build the network, train it and evaluate it on the dev/test datasets.

    Evaluation runs once before training (logged with epoch_id -1) and once
    after every epoch; an inference model is saved after every epoch.

    Args:
        train_reader: Callable returning an iterable of training batches.
        dev_reader: Dev-set reader, passed through to evaluate().
        test_reader: Test-set reader, passed through to evaluate().
        network: Callable that builds the model. It is called with
            (q1, q2, label) when `global_config.use_lod_tensor` is set and
            with (q1, q2, mask1, mask2, label) otherwise, and must return
            (cost, acc, prediction).
        optimizer: Object whose `minimize(cost)` adds the training ops.
        global_config: Config object; this function reads `use_lod_tensor`,
            `metric_type`, `epoch_num` and `save_dirname` from it.
        pretrained_word_embedding: Value written into the parameter named
            "emb.w" after initialization, or None to skip that step.
        use_cuda: Run on `fluid.CUDAPlace(0)` if true, else on the CPU.
        parallel: Must be false; parallel training is not supported and a
            true value terminates the process with status 1.
    """

    # define the net
    if global_config.use_lod_tensor:
        # automatic add batch dim
        q1 = fluid.layers.data(
            name="question1", shape=[1], dtype="int64", lod_level=1)
        q2 = fluid.layers.data(
            name="question2", shape=[1], dtype="int64", lod_level=1)
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        cost, acc, prediction = network(q1, q2, label)
    else:
        # shape: [batch_size, max_seq_len_in_batch, 1]
        q1 = fluid.layers.data(
            name="question1", shape=[-1, -1, 1], dtype="int64")
        q2 = fluid.layers.data(
            name="question2", shape=[-1, -1, 1], dtype="int64")
        # shape: [batch_size, max_seq_len_in_batch]
        mask1 = fluid.layers.data(name="mask1", shape=[-1, -1], dtype="float32")
        mask2 = fluid.layers.data(name="mask2", shape=[-1, -1], dtype="float32")
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        cost, acc, prediction = network(q1, q2, mask1, mask2, label)

    if parallel:
        # TODO: Parallel Training
        print("Parallel Training is not supported for now.")
        sys.exit(1)

    if use_cuda:
        print("Using GPU")
        place = fluid.CUDAPlace(0)
    else:
        print("Using CPU")
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    if global_config.use_lod_tensor:
        feeder = fluid.DataFeeder(feed_list=[q1, q2, label], place=place)
    else:
        feeder = fluid.DataFeeder(
            feed_list=[q1, q2, mask1, mask2, label], place=place)

    # only for ce: fix the random seeds so that CE runs are comparable
    args = parser.parse_args()
    if args.enable_ce:
        SEED = 102
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED

    # logging param info
    for param in fluid.default_main_program().global_block().all_parameters():
        print("param name: %s; param shape: %s" % (param.name, param.shape))

    # define inference_program
    # The clone must be taken before optimizer.minimize() so that the
    # evaluation program does not contain the backward/optimizer ops.
    inference_program = fluid.default_main_program().clone(for_test=True)

    optimizer.minimize(cost)

    exe.run(fluid.default_startup_program())

    # load emb from a numpy array
    if pretrained_word_embedding is not None:
        print("loading pretrained word embedding to param")
        embedding_name = "emb.w"
        embedding_param = fluid.global_scope().find_var(
            embedding_name).get_tensor()
        embedding_param.set(pretrained_word_embedding, place)

    # baseline evaluation before any training step (epoch_id -1)
    evaluate(
        -1,
        exe,
        inference_program,
        dev_reader,
        test_reader,
        fetch_list=[cost, acc, prediction],
        feeder=feeder,
        metric_type=global_config.metric_type)

    # start training
    total_time = 0.0
    print("[%s] Start Training" % time.asctime(time.localtime(time.time())))
    for epoch_id in range(global_config.epoch_num):

        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        batch_id = 0
        epoch_begin_time = time.time()
        for data in train_reader():
            avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
                                              feed=feeder.feed(data),
                                              fetch_list=[cost, acc])
            data_size = len(data)
            # Take the scalars once; they feed both the running sums and the
            # "%f" log line (formatting the whole ndarray is deprecated).
            batch_cost = avg_cost_np[0]
            batch_acc = avg_acc_np[0]
            total_acc += data_size * batch_acc
            total_cost += data_size * batch_cost
            data_count += data_size
            if batch_id % 100 == 0:
                print("[%s] epoch_id: %d, batch_id: %d, cost: %f, acc: %f" %
                      (time.asctime(time.localtime(time.time())), epoch_id,
                       batch_id, batch_cost, batch_acc))
            batch_id += 1
        # sample-weighted averages over the whole epoch
        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        epoch_end_time = time.time()
        epoch_time = epoch_end_time - epoch_begin_time
        total_time += epoch_time

        print("")
        print(
            "[%s] epoch_id: %d, train_avg_cost: %f, train_avg_acc: %f, epoch_time_cost: %f"
            % (time.asctime(time.localtime(time.time())), epoch_id, avg_cost,
               avg_acc, epoch_time))

        # only for ce
        if epoch_id == global_config.epoch_num - 1 and args.enable_ce:
            #Note: The following logs are special for CE monitoring.
            #Other situations do not need to care about these logs.
            gpu_num = get_cards(args)
            print("kpis\teach_pass_duration_card%s\t%s" % \
                  (gpu_num, total_time / (global_config.epoch_num)))
            print("kpis\ttrain_avg_cost_card%s\t%s" % (gpu_num, avg_cost))
            print("kpis\ttrain_avg_acc_card%s\t%s" % (gpu_num, avg_acc))

        epoch_model = os.path.join(global_config.save_dirname,
                                   "epoch" + str(epoch_id))
        # NOTE(review): the feed names omit mask1/mask2, which the padded
        # (non-LoD) branch above feeds to the network -- confirm that the
        # exported model is usable when use_lod_tensor is False.
        fluid.io.save_inference_model(
            epoch_model, ["question1", "question2", "label"], acc, exe)

        evaluate(
            epoch_id,
            exe,
            inference_program,
            dev_reader,
            test_reader,
            fetch_list=[cost, acc, prediction],
            feeder=feeder,
            metric_type=global_config.metric_type)

241 242 243 244 245 246 247 248

def main():
    """
    Parse arguments, prepare the data and the pretrained embedding, then
    run training and evaluation.
    """
    args = parser.parse_args()
    # --config names a factory in the configs module
    global_config = configs.__dict__[args.config]()

    # an epoch count given on the command line overrides the config value
    if args.epoch_num is not None:
        global_config.epoch_num = args.epoch_num

    print("net_name: ", args.model_name)
    # --model_name names a factory in the models module
    net = models.__dict__[args.model_name](global_config)

    # get word_dict
    word_dict = utils.getDict(data_type="quora_question_pairs")

    # get reader
    train_reader, dev_reader, test_reader = utils.prepare_data(
        "quora_question_pairs",
        word_dict=word_dict,
        batch_size=global_config.batch_size,
        buf_size=800000,
        duplicate_data=global_config.duplicate_data,
        use_pad=(not global_config.use_lod_tensor))

    # load pretrained_word_embedding
    if global_config.use_pretrained_word_embedding:
        word2vec = Glove840B_300D(
            filepath=os.path.join(DATA_DIR, "glove.840B.300d.txt"),
            keys=set(word_dict.keys()))
        pretrained_word_embedding = utils.get_pretrained_word_embedding(
            word2vec=word2vec, word2id=word_dict, config=global_config)
        print("pretrained_word_embedding to be load:",
              pretrained_word_embedding)
    else:
        pretrained_word_embedding = None

    # define optimizer
    optimizer = utils.getOptimizer(global_config)

    # use cuda or not: unless the config decides, use the GPU exactly when
    # CUDA_VISIBLE_DEVICES is set to a non-empty value
    if not global_config.has_member('use_cuda'):
        global_config.use_cuda = bool(os.environ.get('CUDA_VISIBLE_DEVICES'))

    global_config.list_config()

    train_and_evaluate(
        train_reader,
        dev_reader,
        test_reader,
        net,
        optimizer,
        global_config,
        pretrained_word_embedding,
        use_cuda=global_config.use_cuda,
        parallel=False)
302

R
add ce  
root 已提交
303 304 305 306 307 308 309 310 311 312

def get_cards(args):
    """
    Return the device count used to label the CE KPI log lines.

    Args:
        args: Parsed arguments; `enable_ce` is read, and `num_devices` is
            read when present.

    Returns:
        With `enable_ce` set, the number of comma-separated entries in
        CUDA_VISIBLE_DEVICES (1 when the variable is unset or empty).
        Otherwise `args.num_devices`, or 1 if that attribute is missing.
    """
    if args.enable_ce:
        # An unset variable is treated like an empty one instead of crashing
        # on None.split(); both give a single entry, i.e. one device.
        cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
        return len(cards.split(","))
    # The argument parser of this script declares no --num_devices option,
    # so the attribute may be absent.
    return getattr(args, 'num_devices', 1)


313 314
# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()