train_and_evaluate.py

#Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os
import sys
import time
import argparse
import unittest
import contextlib
import numpy as np

import paddle.fluid as fluid

import utils, metric, configs
import models

from pretrained_word2vec import Glove840B_300D 

# Command-line options understood by this script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--model_name',
    type=str,
    default='cdssmNet',
    help="Which model to train")
parser.add_argument(
    '--config',
    type=str,
    default='cdssm_base',
    help="The global config setting")

# Dataset directory, located under the current user's home directory.
DATA_DIR = os.path.join(
    os.path.expanduser('~'),
    '.cache/paddle/dataset')

def evaluate(epoch_id, exe, inference_program, dev_reader, test_reader, fetch_list, feeder, metric_type):
    """
    evaluate on test/dev dataset and print the cost and metrics of each

    Args:
        epoch_id: Epoch number; only used in the printed report lines.
        exe: Executor. ``exe.run(inference_program, feed=..., fetch_list=...,
            return_numpy=True)`` must return three values, unpacked here as
            (cost, acc, prediction).
        inference_program: Program handed to ``exe.run``.
        dev_reader: Callable returning an iterable of dev batches.
        test_reader: Callable returning an iterable of test batches.
        fetch_list: Passed through unchanged to ``exe.run``.
        feeder: Object whose ``feed(batch)`` result is used as the feed.
        metric_type: Re-iterable collection of metric names. Supported names
            are 'accuracy_with_threshold' and 'accuracy'.

    Raises:
        SystemExit: With status 1 when ``metric_type`` holds an unknown name.
        ValueError: When a reader yields no data.
    """
    supported_metrics = ('accuracy_with_threshold', 'accuracy')

    # Reject unknown metric names before spending time on inference, and exit
    # with a non-zero status so the failure is visible to the calling shell.
    for metric_name in metric_type:
        if metric_name not in supported_metrics:
            print("Unknown metric type: ", metric_name)
            sys.exit(1)

    def infer(reader):
        """
        do inference over one reader, return (average cost, metric results)
        """
        total_cost = 0.0
        total_count = 0
        preds, labels = [], []
        for data in reader():
            batch_cost, _, batch_prediction = exe.run(inference_program,
                          feed=feeder.feed(data),
                          fetch_list=fetch_list,
                          return_numpy=True)
            batch_size = len(data)
            # The fetched cost is presumably a one-element ndarray; reduce it
            # to a Python float so the running total stays a plain scalar
            # that "%f" can format without an array-to-scalar conversion.
            total_cost += float(np.mean(batch_cost)) * batch_size
            total_count += batch_size
            preds.append(batch_prediction)
            # the label is the last field of every sample in the batch
            labels.append(np.asarray([x[-1] for x in data], dtype=np.int64))

        if total_count == 0:
            raise ValueError("evaluation reader yielded no data")

        y_pred = np.concatenate(preds)
        y_label = np.concatenate(labels)

        metric_res = []
        for metric_name in metric_type:
            if metric_name == 'accuracy_with_threshold':
                value = metric.accuracy_with_threshold(y_pred, y_label, threshold=0.3)
            else:
                # names were validated above, so this is 'accuracy'
                value = metric.accuracy(y_pred, y_label)
            metric_res.append((metric_name, value))
        return total_cost / total_count, metric_res

    def report(split_name, reader):
        """
        run inference on one split and print a single report line
        """
        split_cost, metric_res = infer(reader)
        print("[%s] epoch_id: %d, %s_cost: %f, " % (
                  time.asctime(time.localtime(time.time())),
                  epoch_id,
                  split_name,
                  split_cost)
              + ', '.join([str(name) + ": " + str(value)
                           for name, value in metric_res]))

    report("dev", dev_reader)
    report("test", test_reader)
    print("")


def train_and_evaluate(train_reader,
          test_reader,
          dev_reader,
          network,
          optimizer,
          global_config,
          pretrained_word_embedding,
          use_cuda,
          parallel):
    """
    train network, evaluating on dev/test before training and after each epoch

    NOTE: positionally, test_reader comes BEFORE dev_reader.

    Args:
        train_reader: Callable returning an iterable of training batches.
        test_reader: Callable returning an iterable of test batches.
        dev_reader: Callable returning an iterable of dev batches.
        network: Callable that builds the net. Called as
            ``network(q1, q2, label)`` when ``global_config.use_lod_tensor``
            is true, otherwise ``network(q1, q2, mask1, mask2, label)``;
            must return ``(cost, acc, prediction)``.
        optimizer: Object whose ``minimize(cost)`` is called once.
        global_config: Config object; this function reads ``use_lod_tensor``,
            ``metric_type``, ``epoch_num`` and ``save_dirname``.
        pretrained_word_embedding: ``None``, or a value that is written into
            the parameter named "emb.w" after the startup program has run.
        use_cuda: Run on ``fluid.CUDAPlace(0)`` if true, else on CPU.
        parallel: Not supported; a true value prints a message and exits
            with status 1.

    Raises:
        ValueError: If train_reader yields no data during an epoch.
    """
    if parallel:
        # TODO: Parallel Training
        print("Parallel Training is not supported for now.")
        sys.exit(1)

    # define the net
    if global_config.use_lod_tensor:
        # automatic add batch dim
        q1 = fluid.layers.data(
            name="question1", shape=[1], dtype="int64", lod_level=1)
        q2 = fluid.layers.data(
            name="question2", shape=[1], dtype="int64", lod_level=1)
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        feed_vars = [q1, q2, label]
        cost, acc, prediction = network(q1, q2, label)
    else:
        # shape: [batch_size, max_seq_len_in_batch, 1]
        q1 = fluid.layers.data(
            name="question1", shape=[-1, -1, 1], dtype="int64")
        q2 = fluid.layers.data(
            name="question2", shape=[-1, -1, 1], dtype="int64")
        # shape: [batch_size, max_seq_len_in_batch]
        mask1 = fluid.layers.data(name="mask1", shape=[-1, -1], dtype="float32")
        mask2 = fluid.layers.data(name="mask2", shape=[-1, -1], dtype="float32")
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        feed_vars = [q1, q2, mask1, mask2, label]
        cost, acc, prediction = network(q1, q2, mask1, mask2, label)

    if use_cuda:
        print("Using GPU")
        place = fluid.CUDAPlace(0)
    else:
        print("Using CPU")
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=feed_vars, place=place)

    # logging param info
    for param in fluid.default_main_program().global_block().all_parameters():
        print("param name: %s; param shape: %s" % (param.name, param.shape))

    # define inference_program; it is cloned BEFORE optimizer.minimize() is
    # called, so it reflects the main program as it stands prior to that call
    inference_program = fluid.default_main_program().clone(for_test=True)

    optimizer.minimize(cost)

    exe.run(fluid.default_startup_program())

    # load emb from a numpy array (after startup, which would otherwise
    # run later and presumably re-initialize the parameter)
    if pretrained_word_embedding is not None:
        print("loading pretrained word embedding to param")
        embedding_name = "emb.w"
        embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
        embedding_param.set(pretrained_word_embedding, place)

    # baseline evaluation before any training step, reported as epoch -1
    evaluate(-1,
             exe,
             inference_program,
             dev_reader,
             test_reader,
             fetch_list=[cost, acc, prediction],
             feeder=feeder,
             metric_type=global_config.metric_type)

    # start training
    print("[%s] Start Training" % time.asctime(time.localtime(time.time())))
    for epoch_id in range(global_config.epoch_num):
        data_count, total_acc, total_cost = 0, 0.0, 0.0
        for batch_id, data in enumerate(train_reader()):
            cost_np, acc_np = exe.run(fluid.default_main_program(),
                                      feed=feeder.feed(data),
                                      fetch_list=[cost, acc])
            # Fetched values are presumably one-element ndarrays; reduce them
            # to Python floats so totals and "%f" formatting use scalars.
            batch_cost = float(np.mean(cost_np))
            batch_acc = float(np.mean(acc_np))
            data_size = len(data)
            # weight by batch size so the epoch average is per sample
            total_acc += data_size * batch_acc
            total_cost += data_size * batch_cost
            data_count += data_size
            if batch_id % 100 == 0:
                print("[%s] epoch_id: %d, batch_id: %d, cost: %f, acc: %f" % (
                    time.asctime(time.localtime(time.time())),
                    epoch_id,
                    batch_id,
                    batch_cost,
                    batch_acc))

        if data_count == 0:
            raise ValueError(
                "train_reader yielded no data in epoch %d" % epoch_id)

        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count

        print("")
        print("[%s] epoch_id: %d, train_avg_cost: %f, train_avg_acc: %f" % (
            time.asctime(time.localtime(time.time())), epoch_id, avg_cost, avg_acc))

        # one saved model directory per epoch: <save_dirname>/epoch<N>
        epoch_model = os.path.join(global_config.save_dirname,
                                   "epoch" + str(epoch_id))
        # NOTE(review): the feed names below omit "mask1"/"mask2", which are
        # fed on the padded (non-LoD) path - confirm the saved model is
        # usable in that configuration.
        fluid.io.save_inference_model(epoch_model, ["question1", "question2", "label"], acc, exe)

        evaluate(epoch_id,
                 exe,
                 inference_program,
                 dev_reader,
                 test_reader,
                 fetch_list=[cost, acc, prediction],
                 feeder=feeder,
                 metric_type=global_config.metric_type)

def main():
    """
    This function will parse arguments, prepare data and the pretrained
    embedding, then run training with evaluation.
    """
    args = parser.parse_args()
    # --config names a callable in the configs module that builds the config
    global_config = configs.__dict__[args.config]()

    print("net_name: ", args.model_name)
    # --model_name names a callable in the models module, called with the config
    net = models.__dict__[args.model_name](global_config)

    # get word_dict
    word_dict = utils.getDict(data_type="quora_question_pairs")

    # get reader
    train_reader, dev_reader, test_reader = utils.prepare_data(
        "quora_question_pairs",
        word_dict=word_dict,
        batch_size=global_config.batch_size,
        buf_size=800000,
        duplicate_data=global_config.duplicate_data,
        use_pad=(not global_config.use_lod_tensor))

    # load pretrained_word_embedding
    pretrained_word_embedding = None
    if global_config.use_pretrained_word_embedding:
        word2vec = Glove840B_300D(filepath=os.path.join(DATA_DIR, "glove.840B.300d.txt"),
                                  keys=set(word_dict.keys()))
        pretrained_word_embedding = utils.get_pretrained_word_embedding(
                                        word2vec=word2vec,
                                        word2id=word_dict,
                                        config=global_config)
        print("pretrained_word_embedding to be load:", pretrained_word_embedding)

    # define optimizer
    optimizer = utils.getOptimizer(global_config)

    # use cuda or not: unless the config sets it, follow the environment
    if not global_config.has_member('use_cuda'):
        global_config.use_cuda = 'CUDA_VISIBLE_DEVICES' in os.environ

    global_config.list_config()

    # Pass everything by keyword. train_and_evaluate declares test_reader
    # BEFORE dev_reader, so passing the readers positionally in
    # (train, dev, test) order swaps the dev and test sets and mislabels
    # every reported dev/test figure.
    train_and_evaluate(
        train_reader=train_reader,
        dev_reader=dev_reader,
        test_reader=test_reader,
        network=net,
        optimizer=optimizer,
        global_config=global_config,
        pretrained_word_embedding=pretrained_word_embedding,
        use_cuda=global_config.use_cuda,
        parallel=False)

# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()