# Train a question-pair model on the "quora_question_pairs" data with
# PaddlePaddle Fluid and evaluate it on the dev/test splits after every epoch.
from __future__ import print_function

import os
import sys
import time
import argparse
import unittest
import contextlib

import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle

import utils
import metric
import configs
import models
from pretrained_word2vec import Glove840B_300D

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--model_name', type=str, default='cdssm', help="Which model to train")
parser.add_argument(
    '--config',
    type=str,
    default='cdssm.cdssm_base',
    help="The global config setting")

DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')


def _timestamp():
    """Return the current local time formatted by `time.asctime`."""
    return time.asctime(time.localtime(time.time()))


def evaluate(epoch_id, exe, inference_program, dev_reader, test_reader,
             fetch_list, feeder, metric_type):
    """
    Evaluate on the dev and test datasets and print one summary line each.

    Args:
        epoch_id: epoch number shown in the log lines.
        exe: executor used to run `inference_program`.
        inference_program: program executed for every batch.
        dev_reader: callable returning an iterable of dev batches.
        test_reader: callable returning an iterable of test batches.
        fetch_list: variables to fetch; the run must yield exactly three
            results, unpacked as (cost, accuracy, prediction).
        feeder: object whose `feed(batch)` builds the feed for `exe.run`.
        metric_type: iterable of metric names; 'accuracy' and
            'accuracy_with_threshold' are recognised.

    Exits the process with status 1 when an unknown metric name is given.
    """

    def infer(reader):
        """Run every batch of `reader`; return (mean cost, metric list)."""
        total_cost = 0.0
        total_count = 0
        preds, labels = [], []
        for data in reader():
            avg_cost, avg_acc, batch_prediction = exe.run(
                inference_program,
                feed=feeder.feed(data),
                fetch_list=fetch_list,
                return_numpy=True)
            batch_size = len(data)
            # Weight by batch size (avg_cost is presumably a per-batch mean)
            # so that batches of different sizes contribute proportionally.
            total_cost += avg_cost * batch_size
            total_count += batch_size
            preds.append(batch_prediction)
            # The label is taken from the last field of each sample.
            labels.append(np.asarray([x[-1] for x in data], dtype=np.int64))
        y_pred = np.concatenate(preds)
        y_label = np.concatenate(labels)

        metric_res = []
        for metric_name in metric_type:
            if metric_name == 'accuracy_with_threshold':
                metric_res.append(
                    (metric_name, metric.accuracy_with_threshold(
                        y_pred, y_label, threshold=0.3)))
            elif metric_name == 'accuracy':
                metric_res.append(
                    (metric_name, metric.accuracy(y_pred, y_label)))
            else:
                print("Unknown metric type: ", metric_name)
                # Use sys.exit with a non-zero status: the bare exit() helper
                # reports success and is not guaranteed to exist.
                sys.exit(1)
        return total_cost / (total_count * 1.0), metric_res

    def report(split_name, cost_value, metric_res):
        """Print the summary line for one data split."""
        print("[%s] epoch_id: %d, %s_cost: %f, " % (
            _timestamp(), epoch_id, split_name, cost_value) + ', '.join(
                [str(x[0]) + ": " + str(x[1]) for x in metric_res]))

    dev_cost, dev_metric_res = infer(dev_reader)
    report("dev", dev_cost, dev_metric_res)

    test_cost, test_metric_res = infer(test_reader)
    report("test", test_cost, test_metric_res)
    print("")


def train_and_evaluate(train_reader, test_reader, dev_reader, network,
                       optimizer, global_config, pretrained_word_embedding,
                       use_cuda, parallel):
    """
    Build the network, train it and evaluate on dev/test after every epoch.

    Note the positional order: `test_reader` comes BEFORE `dev_reader`.
    Prefer keyword arguments when calling this function.

    Args:
        train_reader: callable returning an iterable of training batches.
        test_reader: callable returning an iterable of test batches.
        dev_reader: callable returning an iterable of dev batches.
        network: callable building the model; must return
            (cost, acc, prediction).
        optimizer: object whose `minimize(cost)` adds the training ops.
        global_config: config object; reads `use_lod_tensor`, `epoch_num`,
            `save_dirname` and `metric_type`.
        pretrained_word_embedding: value written into the "emb.w" parameter,
            or None to keep the initialised embedding.
        use_cuda: run on CUDA device 0 when true, otherwise on CPU.
        parallel: parallel training is unsupported; a true value exits the
            process with status 1.
    """
    # define the net
    if global_config.use_lod_tensor:
        # automatic add batch dim
        q1 = fluid.layers.data(
            name="question1", shape=[1], dtype="int64", lod_level=1)
        q2 = fluid.layers.data(
            name="question2", shape=[1], dtype="int64", lod_level=1)
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        cost, acc, prediction = network(q1, q2, label)
        feed_vars = [q1, q2, label]
    else:
        # shape: [batch_size, max_seq_len_in_batch, 1]
        q1 = fluid.layers.data(
            name="question1", shape=[-1, -1, 1], dtype="int64")
        q2 = fluid.layers.data(
            name="question2", shape=[-1, -1, 1], dtype="int64")
        # shape: [batch_size, max_seq_len_in_batch]
        mask1 = fluid.layers.data(
            name="mask1", shape=[-1, -1], dtype="float32")
        mask2 = fluid.layers.data(
            name="mask2", shape=[-1, -1], dtype="float32")
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        cost, acc, prediction = network(q1, q2, mask1, mask2, label)
        feed_vars = [q1, q2, mask1, mask2, label]

    if parallel:
        # TODO: Parallel Training
        print("Parallel Training is not supported for now.")
        sys.exit(1)

    optimizer.minimize(cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=feed_vars, place=place)

    # logging param info
    for param in fluid.default_main_program().global_block().all_parameters():
        print("param name: %s; param shape: %s" % (param.name, param.shape))

    # define inference_program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program([cost, acc])

    exe.run(fluid.default_startup_program())

    # load emb from a numpy array
    if pretrained_word_embedding is not None:
        print("loading pretrained word embedding to param")
        embedding_name = "emb.w"
        embedding_param = fluid.global_scope().find_var(
            embedding_name).get_tensor()
        embedding_param.set(pretrained_word_embedding, place)

    fetch_list = [cost, acc, prediction]

    # Baseline evaluation before any training step; logged as epoch -1.
    evaluate(
        -1,
        exe,
        inference_program,
        dev_reader,
        test_reader,
        fetch_list=fetch_list,
        feeder=feeder,
        metric_type=global_config.metric_type)

    # start training
    print("[%s] Start Training" % _timestamp())
    # range() instead of xrange() so the loop runs on Python 2 and Python 3.
    for epoch_id in range(global_config.epoch_num):
        data_count, total_acc, total_cost = 0, 0.0, 0.0
        batch_id = 0
        for data in train_reader():
            avg_cost_np, avg_acc_np = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[cost, acc])
            data_size = len(data)
            total_acc += data_size * avg_acc_np
            total_cost += data_size * avg_cost_np
            data_count += data_size
            if batch_id % 100 == 0:
                print("[%s] epoch_id: %d, batch_id: %d, cost: %f, acc: %f" %
                      (_timestamp(), epoch_id, batch_id, avg_cost_np,
                       avg_acc_np))
            batch_id += 1

        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count

        print("")
        print("[%s] epoch_id: %d, train_avg_cost: %f, train_avg_acc: %f" %
              (_timestamp(), epoch_id, avg_cost, avg_acc))

        epoch_model = os.path.join(global_config.save_dirname,
                                   "epoch" + str(epoch_id))
        # NOTE(review): the feed names below omit "mask1"/"mask2" even when
        # use_lod_tensor is false and the network takes masks -- confirm the
        # saved model is loadable in that configuration.
        fluid.io.save_inference_model(
            epoch_model, ["question1", "question2", "label"], acc, exe)

        evaluate(
            epoch_id,
            exe,
            inference_program,
            dev_reader,
            test_reader,
            fetch_list=fetch_list,
            feeder=feeder,
            metric_type=global_config.metric_type)


def main():
    """
    Parse arguments, prepare the data and the pretrained embedding, then
    train and evaluate the selected model.
    """
    args = parser.parse_args()
    global_config = configs.__dict__[args.config]()

    print("net_name: ", args.model_name)
    net = models.__dict__[args.model_name](global_config)

    global_config.list_config()

    # get word_dict
    word_dict = utils.getDict(data_type="quora_question_pairs")

    # get reader
    train_reader, dev_reader, test_reader = utils.prepare_data(
        "quora_question_pairs",
        word_dict=word_dict,
        batch_size=global_config.batch_size,
        buf_size=800000,
        duplicate_data=global_config.duplicate_data,
        use_pad=(not global_config.use_lod_tensor))

    # load pretrained_word_embedding
    if global_config.use_pretrained_word_embedding:
        word2vec = Glove840B_300D(
            filepath=os.path.join(DATA_DIR, "glove.840B.300d.txt"))
        pretrained_word_embedding = utils.get_pretrained_word_embedding(
            word2vec=word2vec, word2id=word_dict, config=global_config)
        print("pretrained_word_embedding to be load:",
              pretrained_word_embedding)
    else:
        pretrained_word_embedding = None

    # define optimizer
    optimizer = utils.getOptimizer(global_config)

    # Pass the readers by keyword: train_and_evaluate declares test_reader
    # before dev_reader, so a positional (train, dev, test) call would swap
    # the two evaluation sets and mislabel every dev/test log line.
    train_and_evaluate(
        train_reader=train_reader,
        test_reader=test_reader,
        dev_reader=dev_reader,
        network=net,
        optimizer=optimizer,
        global_config=global_config,
        pretrained_word_embedding=pretrained_word_embedding,
        use_cuda=True,
        parallel=False)


if __name__ == "__main__":
    main()