From 2d3928289b7ad845b274c3d0f90d1a64fd38a9ce Mon Sep 17 00:00:00 2001
From: Yao Cheng
Date: Fri, 27 Apr 2018 08:58:39 +0000
Subject: [PATCH] add cloud

---
 .../clouds/scdb_parallel_executor.py | 509 ++++++++++++++++++
 .../clouds/scdb_single_card.py       | 494 +++++++++++++++++
 2 files changed, 1003 insertions(+)
 create mode 100644 fluid/text_classification/clouds/scdb_parallel_executor.py
 create mode 100644 fluid/text_classification/clouds/scdb_single_card.py

diff --git a/fluid/text_classification/clouds/scdb_parallel_executor.py b/fluid/text_classification/clouds/scdb_parallel_executor.py
new file mode 100644
index 00000000..5b34bb75
--- /dev/null
+++ b/fluid/text_classification/clouds/scdb_parallel_executor.py
@@ -0,0 +1,509 @@
+import unittest
+import contextlib
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import numpy as np
+import sys
+import time
+import os
+import json
+import random
+
+def to_lodtensor(data, place):
+    """
+    convert a batch of sequences to LoDTensor
+    """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def load_vocab(filename):
+    """
+    load imdb vocabulary
+    """
+    vocab = {}
+    with open(filename) as f:
+        wid = 0
+        for line in f:
+            vocab[line.strip()] = wid
+            wid += 1
+    vocab["<unk>"] = len(vocab)
+    return vocab
+
+
+def data2tensor(data, place):
+    """
+    convert data into a train feed dict (words and label)
+    """
+    input_seq = to_lodtensor(map(lambda x:x[0], data), place)
+    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+    y_data = y_data.reshape([-1, 1])
+    return {"words": input_seq, "label": y_data}
+
+def data2pred(data, place):
+    """
+    convert data into a predict feed dict (words only)
+    """
+    input_seq = to_lodtensor(map(lambda x:x[0], data), place)
+    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+    y_data = y_data.reshape([-1, 1])
+    return {"words": input_seq}
+
+def load_dict(vocab):
+    """
+    Load dict from vocab
+    """
+    word_dict = dict()
+    with open(vocab, "r") as fin:
+        for line in fin:
+            cols = line.strip("\r\n").decode("gb18030").split("\t")
+            word_dict[cols[0]] = int(cols[1])
+    return word_dict
+
+
+def save_dict(word_dict, vocab):
+    """
+    Save dict into file
+    """
+    with open(vocab, "w") as fout:
+        for k, v in word_dict.iteritems():
+            outstr = ("%s\t%s\n" % (k, v)).encode("gb18030")
+            fout.write(outstr)
+
+def build_dict(fname):
+    """
+    build word dict using trainset
+    """
+    word_dict = dict()
+    with open(fname, "r") as fin:
+        for line in fin:
+            try:
+                words = line.strip("\r\n").decode("gb18030").split("\t")[1].split(" ")
+            except:
+                sys.stderr.write("[warning] build_dict: decode error\n")
+                continue
+            for w in words:
+                if w not in word_dict:
+                    word_dict[w] = len(word_dict)
+    return word_dict
+
+
+def scdb_word_dict(vocab="scdb_data/train_set/train.vocab", train_file="scdb_data/train_set/corpus.train.seg"):
+    """
+    get word_dict
+    """
+    if not os.path.exists(vocab):
+        w_dict = build_dict(train_file)
+        save_dict(w_dict, vocab)
+    else:
+        w_dict = load_dict(vocab)
+    w_dict["<unk>"] = len(w_dict)
+    return w_dict
+
+
+def data_reader(fname, word_dict, is_dir=False):
+    """
+    Convert word sequence into slot
+    """
+    unk_id = len(word_dict)
+    all_data = []
+    filelist = []
+    if is_dir:
+        filelist = [fname + os.sep + f for f in os.listdir(fname)]
+    else:
+        filelist = [fname]
+
+    for each_name in filelist:
+        with open(each_name, "r") as fin:
+            for line in fin:
+                try:
+                    cols = 
line.strip("\r\n").decode("gb18030").split("\t") + except: + sys.stderr.write("warning: ignore decode error\n") + continue + + label = int(cols[0]) + wids = [word_dict[x] if x in word_dict else unk_id for x in cols[1].split(" ")] + all_data.append((wids, label)) + + random.shuffle(all_data) + + def reader(): + for doc, label in all_data: + yield doc, label + + return reader + + +def scdb_train_data(train_dir="scdb_data/train_set/corpus.train.seg", w_dict=None): + """ + create train data + """ + return data_reader(train_dir, w_dict, True) + + +def scdb_test_data(test_file, w_dict): + """ + test_set=["car", "lbs", "spot", "weibo", + "baby", "toutiao", "3c", "movie", "haogan"] + """ + return data_reader(test_file, w_dict) + + +def bow_net(data, label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + bow net + """ + emb = fluid.layers.embedding(input=data, + size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool( + input=emb, + pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, + size=hid_dim, act = "tanh") + fc_2 = fluid.layers.fc(input=fc_1, + size=hid_dim2, act = "tanh") + prediction = fluid.layers.fc(input=[fc_2], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def cnn_net(data, label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + win_size=3): + """ + conv net + """ + emb = fluid.layers.embedding(input=data, + size=[dict_dim, emb_dim]) + + conv_3 = fluid.nets.sequence_conv_pool(input=emb, + num_filters=hid_dim, + filter_size=win_size, + act="tanh", + pool_type="max") + + fc_1 = fluid.layers.fc(input=[conv_3], + size=hid_dim2) + + prediction = fluid.layers.fc(input=[fc_1], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def lstm_net(data, label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + lstm net + """ + emb = fluid.layers.embedding(input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, + size=hid_dim * 4, + act='tanh') + + lstm_h, c = fluid.layers.dynamic_lstm(input=fc0, + size=hid_dim * 4, + is_reverse=False) + + lstm_max = fluid.layers.sequence_pool(input=lstm_h, + pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + + fc1 = fluid.layers.fc(input=lstm_max_tanh, + size=hid_dim2, + act='tanh') + + prediction = fluid.layers.fc(input=fc1, + size=class_dim, + act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def bilstm_net(data, label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + lstm net + """ + emb = fluid.layers.embedding(input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, + size=hid_dim * 4, + act='tanh') + + rfc0 = fluid.layers.fc(input=emb, + size=hid_dim * 4, + act='tanh') + + lstm_h, c = fluid.layers.dynamic_lstm(input=fc0, + size=hid_dim * 4, + 
is_reverse=False) + + rlstm_h, c = fluid.layers.dynamic_lstm(input=rfc0, + size=hid_dim * 4, + is_reverse=True) + + lstm_last = fluid.layers.sequence_last_step(input=lstm_h) + rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h) + + lstm_last_tanh = fluid.layers.tanh(lstm_last) + rlstm_last_tanh = fluid.layers.tanh(rlstm_last) + + lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1) + + fc1 = fluid.layers.fc(input=lstm_concat, + size=hid_dim2, + act='tanh') + + prediction = fluid.layers.fc(input=fc1, + size=class_dim, + act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def gru_net(data, label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + gru net + """ + emb = fluid.layers.embedding(input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, + size=hid_dim * 3) + + gru_h = fluid.layers.dynamic_gru(input=fc0, + size=hid_dim, + is_reverse=False) + + gru_max = fluid.layers.sequence_pool(input=gru_h, + pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + + fc1 = fluid.layers.fc(input=gru_max_tanh, + size=hid_dim2, + act='tanh') + + prediction = fluid.layers.fc(input=fc1, + size=class_dim, + act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + +def infer(test_reader, + use_cuda, + model_path=None): + """ + inference function + """ + if model_path is None: + print(str(model_path) + " cannot be found") + return + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + + class2_list, class3_list = [], [] + for each_test_reader in test_reader: + class2_acc, class3_acc = 0.0, 0.0 + total_count, neu_count = 0, 0 + + for data in each_test_reader(): + pred = exe.run(inference_program, + feed = data2pred(data, place), + fetch_list=fetch_targets, + return_numpy=True) + + for i, val in enumerate(data): + pos_score = pred[0][i, 1] + true_label = val[1] + if true_label == 2.0 and pos_score > 0.5: + class2_acc += 1 + if true_label == 0.0 and pos_score < 0.5: + class2_acc += 1 + + if true_label == 2.0 and pos_score > 0.55: + class3_acc += 1 + if true_label == 1.0 and pos_score > 0.45 and pos_score <= 0.55: + class3_acc += 1 + if true_label == 0.0 and pos_score <= 0.45: + class3_acc += 1 + + if true_label == 1.0: + neu_count += 1 + + total_count += len(data) + + class2_acc = class2_acc / (total_count - neu_count) + class3_acc = class3_acc / total_count + class2_list.append(class2_acc) + class3_list.append(class3_acc) + + class2_acc = sum(class2_list) / len(class2_list) + class3_acc = sum(class3_list) / len(class3_list) + print("[test info] model_path: %s, class2_acc: %f, class3_acc: %f" % (model_path, class2_acc, class3_acc)) + + +def start_train(train_reader, + test_reader, + word_dict, + network, + use_cuda, + parallel, + save_dirname, + lr=0.2, + batch_size=128, + pass_num=30): + """ + train network + """ + data = fluid.layers.data( + name="words", + shape=[1], + dtype="int64", + lod_level=1) + + label = fluid.layers.data( + 
name="label", + shape=[1], + dtype="int64") + + cost, acc, pred = network( + data, label, len(word_dict) + 1) + + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr) + sgd_optimizer.minimize(cost) + + place = fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + + start_exe = fluid.Executor(place) + start_exe.run(fluid.default_startup_program()) + + exe = fluid.ParallelExecutor(use_cuda, loss_name=cost.name) + for pass_id in xrange(pass_num): + total_acc, total_cost, total_count, avg_cost, avg_acc = 0.0, 0.0, 0.0, 0.0, 0.0 + for data in train_reader(): + cost_val, acc_val = exe.run(feed=feeder.feed(data), + fetch_list=[cost.name, acc.name]) + cost_val_list, acc_val_list = np.array(cost_val), np.array(acc_val) + total_cost += cost_val_list.sum() * len(data) + total_acc += acc_val_list.sum() * len(data) + total_count += len(data) + + avg_cost = total_cost / total_count + avg_acc = total_acc / total_count + print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost)) + + gpu_place = fluid.CUDAPlace(0) + save_exe = fluid.Executor(gpu_place) + epoch_model = save_dirname + "/" + "epoch" + str(pass_id) + fluid.io.save_inference_model( + epoch_model, + ["words"], + pred, save_exe) + infer(test_reader, False, epoch_model) + + +def train_net(vocab="./thirdparty/train.vocab", + train_dir="./train", + test_list=["car", "spot", "weibo", "lbs"]): + """ + w_dict = scdb_word_dict(vocab=vocab) + test_files = [ "./thirdparty" + os.sep + f for f in test_list] + + train_reader = paddle.batch( + scdb_train_data(train_dir, w_dict), + batch_size = 256) + + test_reader = [paddle.batch(scdb_test_data(test_file, w_dict), batch_size = 50) \ + for test_file in test_files] + """ + w_dict = paddle.dataset.imdb.word_dict() + print("dict ready") + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(w_dict), buf_size=50000), + batch_size=128) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.test(w_dict), buf_size=50000), + batch_size=128) + test_reader = [test_reader] + start_train(train_reader, test_reader, w_dict, bilstm_net, use_cuda=True, + parallel=False, save_dirname="scdb_bilstm_model", lr=0.05, + pass_num=10, batch_size=256) + + +if __name__ == "__main__": + train_net() diff --git a/fluid/text_classification/clouds/scdb_single_card.py b/fluid/text_classification/clouds/scdb_single_card.py new file mode 100644 index 00000000..1956f8e3 --- /dev/null +++ b/fluid/text_classification/clouds/scdb_single_card.py @@ -0,0 +1,494 @@ +import unittest +import contextlib +import paddle.fluid as fluid +import paddle.v2 as paddle +import numpy as np +import sys +import time +import os +import json +import random + +def to_lodtensor(data, place): + """ + convert to LODtensor + """ + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def load_vocab(filename): + """ + load imdb vocabulary + """ + vocab = {} + with open(filename) as f: + wid = 0 + for line in f: + vocab[line.strip()] = wid + wid += 1 + vocab[""] = len(vocab) + return vocab + + +def data2tensor(data, place): + """ + data2tensor + """ + input_seq = to_lodtensor(map(lambda x:x[0], data), place) + y_data = np.array(map(lambda x: x[1], 
data)).astype("int64")
+    y_data = y_data.reshape([-1, 1])
+    return {"words": input_seq, "label": y_data}
+
+def data2pred(data, place):
+    """
+    convert data into a predict feed dict (words only)
+    """
+    input_seq = to_lodtensor(map(lambda x:x[0], data), place)
+    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+    y_data = y_data.reshape([-1, 1])
+    return {"words": input_seq}
+
+def load_dict(vocab):
+    """
+    Load dict from vocab
+    """
+    word_dict = dict()
+    with open(vocab, "r") as fin:
+        for line in fin:
+            cols = line.strip("\r\n").decode("gb18030").split("\t")
+            word_dict[cols[0]] = int(cols[1])
+    return word_dict
+
+
+def save_dict(word_dict, vocab):
+    """
+    Save dict into file
+    """
+    with open(vocab, "w") as fout:
+        for k, v in word_dict.iteritems():
+            outstr = ("%s\t%s\n" % (k, v)).encode("gb18030")
+            fout.write(outstr)
+
+def build_dict(fname):
+    """
+    build word dict using trainset
+    """
+    word_dict = dict()
+    with open(fname, "r") as fin:
+        for line in fin:
+            try:
+                words = line.strip("\r\n").decode("gb18030").split("\t")[1].split(" ")
+            except:
+                sys.stderr.write("[warning] build_dict: decode error\n")
+                continue
+            for w in words:
+                if w not in word_dict:
+                    word_dict[w] = len(word_dict)
+    return word_dict
+
+
+def scdb_word_dict(vocab="scdb_data/train_set/train.vocab", train_file="scdb_data/train_set/corpus.train.seg"):
+    """
+    get word_dict
+    """
+    if not os.path.exists(vocab):
+        w_dict = build_dict(train_file)
+        save_dict(w_dict, vocab)
+    else:
+        w_dict = load_dict(vocab)
+    w_dict["<unk>"] = len(w_dict)
+    return w_dict
+
+
+def data_reader(fname, word_dict, is_dir=False):
+    """
+    Convert word sequence into slot
+    """
+    unk_id = len(word_dict)
+    all_data = []
+    filelist = []
+    if is_dir:
+        filelist = [fname + os.sep + f for f in os.listdir(fname)]
+    else:
+        filelist = [fname]
+
+    for each_name in filelist:
+        with open(each_name, "r") as fin:
+            for line in fin:
+                try:
+                    cols = line.strip("\r\n").decode("gb18030").split("\t")
+                except:
+                    sys.stderr.write("warning: ignore decode error\n")
+                    continue
+
+                label = int(cols[0])
+                wids = [word_dict[x] if x in word_dict else unk_id for x in cols[1].split(" ")]
+                all_data.append((wids, label))
+
+    random.shuffle(all_data)
+
+    def reader():
+        for doc, label in all_data:
+            yield doc, label
+
+    return reader
+
+
+def scdb_train_data(train_dir="scdb_data/train_set/corpus.train.seg", w_dict=None):
+    """
+    create train data
+    """
+    return data_reader(train_dir, w_dict, True)
+
+
+def scdb_test_data(test_file, w_dict):
+    """
+    test_set=["car", "lbs", "spot", "weibo",
+    "baby", "toutiao", "3c", "movie", "haogan"]
+    """
+    return data_reader(test_file, w_dict)
+
+
+def bow_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    bow net
+    """
+    emb = fluid.layers.embedding(input=data,
+                                 size=[dict_dim, emb_dim])
+    bow = fluid.layers.sequence_pool(
+        input=emb,
+        pool_type='sum')
+    bow_tanh = fluid.layers.tanh(bow)
+    fc_1 = fluid.layers.fc(input=bow_tanh,
+                           size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1,
+                           size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(input=[fc_2],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+
+    return avg_cost, acc, prediction
+
+
+def cnn_net(data, label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            win_size=3):
+    """
+    conv net
+    """
+    emb = fluid.layers.embedding(input=data,
+                                 size=[dict_dim, emb_dim])
+
+    conv_3 = fluid.nets.sequence_conv_pool(input=emb,
+                                           
num_filters=hid_dim, + filter_size=win_size, + act="tanh", + pool_type="max") + + fc_1 = fluid.layers.fc(input=[conv_3], + size=hid_dim2) + + prediction = fluid.layers.fc(input=[fc_1], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def lstm_net(data, label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + lstm net + """ + emb = fluid.layers.embedding(input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, + size=hid_dim * 4, + act='tanh') + + lstm_h, c = fluid.layers.dynamic_lstm(input=fc0, + size=hid_dim * 4, + is_reverse=False) + + lstm_max = fluid.layers.sequence_pool(input=lstm_h, + pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + + fc1 = fluid.layers.fc(input=lstm_max_tanh, + size=hid_dim2, + act='tanh') + + prediction = fluid.layers.fc(input=fc1, + size=class_dim, + act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def bilstm_net(data, label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + lstm net + """ + emb = fluid.layers.embedding(input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, + size=hid_dim * 4, + act='tanh') + + rfc0 = fluid.layers.fc(input=emb, + size=hid_dim * 4, + act='tanh') + + lstm_h, c = fluid.layers.dynamic_lstm(input=fc0, + size=hid_dim * 4, + is_reverse=False) + + rlstm_h, c = fluid.layers.dynamic_lstm(input=rfc0, + size=hid_dim * 4, + is_reverse=True) + + lstm_last = fluid.layers.sequence_last_step(input=lstm_h) + rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h) + + lstm_last_tanh = fluid.layers.tanh(lstm_last) + rlstm_last_tanh = fluid.layers.tanh(rlstm_last) + + lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1) + + fc1 = fluid.layers.fc(input=lstm_concat, + size=hid_dim2, + act='tanh') + + prediction = fluid.layers.fc(input=fc1, + size=class_dim, + act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def gru_net(data, label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + """ + gru net + """ + emb = fluid.layers.embedding(input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + + fc0 = fluid.layers.fc(input=emb, + size=hid_dim * 3) + + gru_h = fluid.layers.dynamic_gru(input=fc0, + size=hid_dim, + is_reverse=False) + + gru_max = fluid.layers.sequence_pool(input=gru_h, + pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + + fc1 = fluid.layers.fc(input=gru_max_tanh, + size=hid_dim2, + act='tanh') + + prediction = fluid.layers.fc(input=fc1, + size=class_dim, + act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return avg_cost, acc, prediction + + +def infer(test_reader, + use_cuda, + model_path=None): + """ + 
inference function + """ + if model_path is None: + print(str(model_path) + " cannot be found") + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + + class2_list, class3_list = [], [] + for each_test_reader in test_reader: + class2_acc, class3_acc = 0.0, 0.0 + total_count, neu_count = 0, 0 + + for data in each_test_reader(): + pred = exe.run(inference_program, + feed = data2pred(data, place), + fetch_list=fetch_targets, + return_numpy=True) + + for i, val in enumerate(data): + pos_score = pred[0][i, 1] + true_label = val[1] + if true_label == 2.0 and pos_score > 0.5: + class2_acc += 1 + if true_label == 0.0 and pos_score < 0.5: + class2_acc += 1 + + if true_label == 2.0 and pos_score > 0.55: + class3_acc += 1 + if true_label == 1.0 and pos_score > 0.45 and pos_score <= 0.55: + class3_acc += 1 + if true_label == 0.0 and pos_score <= 0.45: + class3_acc += 1 + + if true_label == 1.0: + neu_count += 1 + + total_count += len(data) + + class2_acc = class2_acc / (total_count - neu_count) + class3_acc = class3_acc / total_count + class2_list.append(class2_acc) + class3_list.append(class3_acc) + + class2_acc = sum(class2_list) / len(class2_list) + class3_acc = sum(class3_list) / len(class3_list) + print("[test info] model_path: %s, class2_acc: %f, class3_acc: %f" % (model_path, class2_acc, class3_acc)) + + +def start_train(train_reader, + test_reader, + word_dict, + network, + use_cuda, + parallel, + save_dirname, + lr=0.2, + batch_size=128, + pass_num=30): + """ + train network + """ + data = fluid.layers.data( + name="words", + shape=[1], + dtype="int64", + lod_level=1) + + label = fluid.layers.data( + name="label", + shape=[1], + dtype="int64") + + cost, acc, pred = network( + data, label, len(word_dict) + 1) + + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr) + sgd_optimizer.minimize(cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + + exe.run(fluid.default_startup_program()) + for pass_id in xrange(pass_num): + data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0 + for data in train_reader(): + avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[cost, acc]) + data_size = len(data) + total_acc += data_size * avg_acc_np + total_cost += data_size * avg_cost_np + data_count += data_size + + avg_cost = total_cost / data_count + avg_acc = total_acc / data_count + print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost)) + + epoch_model = save_dirname + "/" + "epoch" + str(pass_id) + fluid.io.save_inference_model( + epoch_model, + ["words"], + pred, exe) + infer(test_reader, False, epoch_model) + + +def train_net(vocab="./thirdparty/train.vocab", + train_dir="./train", + test_list=["car", "spot", "weibo", "lbs"]): + w_dict = scdb_word_dict(vocab=vocab) + test_files = [ "./thirdparty" + os.sep + f for f in test_list] + + train_reader = paddle.batch( + scdb_train_data(train_dir, w_dict), + batch_size = 256) + + test_reader = [paddle.batch(scdb_test_data(test_file, w_dict), batch_size = 50) \ + for test_file in test_files] + + start_train(train_reader, test_reader, w_dict, bow_net, use_cuda=False, + parallel=False, 
save_dirname="scdb_bow_model", lr=0.002, + pass_num=10, batch_size=256) + + +if __name__ == "__main__": + train_net() -- GitLab