From 01a4c4741e0faf55d964808bdbe83ea46a371c4e Mon Sep 17 00:00:00 2001
From: lilong12
Date: Fri, 17 Jul 2020 14:05:02 +0800
Subject: [PATCH] bug fix (#70)

---
 demo/data_loader.py |  74 +++++++++++++++++++++++++++
 demo/demo_noncv.py  | 121 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+)
 create mode 100644 demo/data_loader.py
 create mode 100644 demo/demo_noncv.py

diff --git a/demo/data_loader.py b/demo/data_loader.py
new file mode 100644
index 0000000..1a8ce2e
--- /dev/null
+++ b/demo/data_loader.py
@@ -0,0 +1,74 @@
+import numpy as np
+import os
+
+word_title_num = 50
+word_cont_num = 1024
+word_att_num = 10
+CLASS_NUM = 1284213
+
+def pad_and_trunk(_list, fix_sz=-1):
+    """Truncate _list to fix_sz elements, padding with '0' tokens if short."""
+    if len(_list) > 0 and _list[0] == '':
+        _list = []
+    _list = _list[:fix_sz]
+    if len(_list) < fix_sz:
+        pad = ['0' for i in range(fix_sz - len(_list))]
+        _list.extend(pad)
+    return _list
+
+def generate_reader(url2fea, topic2fea, _path, class_num=CLASS_NUM):
+
+    def reader():
+        print('file open.')
+        # Shard samples across trainers: each trainer keeps every
+        # trainer_count-th sample, offset by its own trainer_id.
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        if os.getenv("PADDLE_TRAINER_ENDPOINTS"):
+            trainer_count = len(os.getenv("PADDLE_TRAINER_ENDPOINTS").split(","))
+        else:
+            trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
+        f = open(_path)
+        sample_index = 0
+        for line in f:
+            line = line.strip('\n')
+            if len(line) == 0:
+                continue
+
+            # Each line is tab-separated: url, title ids, content ids, label.
+            part = line.split('\t')
+
+            url = part[0]
+            title_ids = part[1]
+            content_ids = part[2]
+            label = int(part[3])
+
+            if sample_index % trainer_count != trainer_id:
+                sample_index += 1
+                continue
+            sample_index += 1
+
+            title_ids = pad_and_trunk(title_ids.split(','), word_title_num)
+            content_ids = pad_and_trunk(content_ids.split(','), word_cont_num)
+
+            title_input_x_train = np.asarray(title_ids, dtype='int64').reshape((len(title_ids), 1))
+            content_input_x_train = np.asarray(content_ids, dtype='int64').reshape((len(content_ids), 1))
+
+            label = np.array([label])
+            yield title_input_x_train, content_input_x_train, label
+
+        f.close()
+        print('file close.')
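+
+    # reader yields one (title_ids, content_ids, label) tuple per sample,
+    # with shapes (word_title_num, 1), (word_cont_num, 1) and (1,).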
+    return reader
+
+if __name__ == '__main__':
+
+    #load_validation(url2fea, topic2fea, './data_makeup/merge_att_data/format_sample_v1/test.sample.shuffle')
+
+    '''
+    for (x1, x2, x3, y) in generate_batch_from_file(url2fea, topic2fea, \
+        './data_makeup/merge_att_data/format_sample_v1/train.sample.shuffle', 50):
+        print(x1[0], x2[0], x3[0], y[0])
+        break
+    '''
+
+    # generate_reader returns the reader function itself; call it to iterate
+    # over the three fields each sample yields.
+    for x1, x2, x3 in generate_reader(None, None, './data_makeup/merge_att_data/format_sample_v4/test.10w.sample.shuffle')():
+        print(x1, x2, x3)
+        break
diff --git a/demo/demo_noncv.py b/demo/demo_noncv.py
new file mode 100644
index 0000000..750e300
--- /dev/null
+++ b/demo/demo_noncv.py
@@ -0,0 +1,121 @@
+import os
+from plsc import Entry
+from plsc.models import BaseModel
+import paddle.fluid as fluid
+
+CLASS_NUM = 1284213
+
+from data_loader import generate_reader
+
+class UserModel(BaseModel):
+    def __init__(self, emb_dim=512):
+        self.emb_dim = emb_dim
+
+    def build_network(self,
+                      input,
+                      is_train=True):
+        title_ids = input.title_ids
+        content_ids = input.content_ids
+        label = input.label
+        vob_size = 1841178 + 1
+        # Embedding layer; title and content share one 'word_embedding' table.
+        # Current shape is [-1, seq_length, emb_dim].
+        word_title_sequence_input = fluid.layers.embedding(
+            input=title_ids, size=[vob_size, 128], is_sparse=False,
+            param_attr=fluid.ParamAttr(name='word_embedding'))
+        word_cont_sequence_input = fluid.layers.embedding(
+            input=content_ids, size=[vob_size, 128], is_sparse=False,
+            param_attr=fluid.ParamAttr(name='word_embedding'))
+
+        # Current shape is [-1, emb_dim, seq_length].
+        word_title_sequence_input = fluid.layers.transpose(word_title_sequence_input, perm=[0, 2, 1], name='title_transpose')
+        word_cont_sequence_input = fluid.layers.transpose(word_cont_sequence_input, perm=[0, 2, 1], name='cont_transpose')
+
+        # Current shape is [-1, emb_dim, 1, seq_length], i.e. NCHW format.
+        _shape = word_title_sequence_input.shape
+        word_title_sequence_input = fluid.layers.reshape(x=word_title_sequence_input,
+            shape=[_shape[0], _shape[1], 1, _shape[2]], inplace=True, name='title_reshape')
+        _shape = word_cont_sequence_input.shape
+        word_cont_sequence_input = fluid.layers.reshape(x=word_cont_sequence_input,
+            shape=[_shape[0], _shape[1], 1, _shape[2]], inplace=True, name='cont_reshape')
+
+        # Window-3 convolutions over the sequence axis, then max pooling.
+        word_title_win_3 = fluid.layers.conv2d(input=word_title_sequence_input, num_filters=128,
+            filter_size=(1, 3), stride=(1, 1), padding=(0, 1), act='relu',
+            name='word_title_win_3_conv')
+
+        word_title_x = fluid.layers.pool2d(input=word_title_win_3, pool_size=(1, 4),
+            pool_type='max', pool_stride=(1, 4),
+            name='word_title_win_3_pool')
+
+        word_cont_win_3 = fluid.layers.conv2d(input=word_cont_sequence_input, num_filters=128,
+            filter_size=(1, 3), stride=(1, 1), padding=(0, 1), act='relu',
+            name='word_cont_win_3_conv')
+
+        word_cont_x = fluid.layers.pool2d(input=word_cont_win_3, pool_size=(1, 20),
+            pool_type='max', pool_stride=(1, 20),
+            name='word_cont_win_3_pool')
+
+        print('word_title_x.shape:', word_title_x.shape)
+        print('word_cont_x.shape:', word_cont_x.shape)
+        x_concat = fluid.layers.concat(input=[word_title_x, word_cont_x], axis=3, name='feature_concat')
+        x_flatten = fluid.layers.flatten(x=x_concat, axis=1, name='feature_flatten')
+        x_fc = fluid.layers.fc(input=x_flatten, size=self.emb_dim, act="relu", name='final_fc')
+        return x_fc
+
+
+def train(url2fea_path, topic2fea_path, train_path, val_path, model_save_dir):
+    ins = Entry()
+    ins.set_with_test(False)
+    ins.set_train_epochs(20)
+
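+    # Entry is PLSC's high-level trainer: the calls below declare the input
+    # layout, model, optimizer and data reader, then train() launches the job.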
+    # Input sizes must match the padding lengths used in data_loader.py.
+    word_title_num = 50
+    word_cont_num = 1024
+    batch_size = int(os.getenv("BATCH_SIZE", "64"))
+
+    input_info = [{'name': 'title_ids',
+                   'shape': [-1, word_title_num, 1],
+                   'dtype': 'int64'},
+                  {'name': 'content_ids',
+                   'shape': [-1, word_cont_num, 1],
+                   'dtype': 'int64'},
+                  {'name': 'label',
+                   'shape': [-1, 1],
+                   'dtype': 'int64'}
+                  ]
+    ins.set_input_info(input_info)
+    ins.set_class_num(CLASS_NUM)
+
+    emb_dim = int(os.getenv("EMB_DIM", "512"))
+    model = UserModel(emb_dim=emb_dim)
+    ins.set_model(model)
+    ins.set_train_batch_size(batch_size)
+
+    optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
+    ins.set_optimizer(optimizer)
+
+    train_reader = generate_reader(None, None, train_path)
+    ins.train_reader = train_reader
+
+    ins.set_model_save_dir(model_save_dir)
+    ins.set_loss_type('dist_softmax')
+    ins.train()
+
+
+if __name__ == "__main__":
+    data = './package/'
+    url2fea_path = data + 'click_search_all.url_title_cont.seg.lower.id'
+    topic2fea_path = data + 'click_search_all.att.seg.id'
+    train_path = data + 'train.sample.shuffle.label_expand'
+    val_path = data + 'test.10w.sample.shuffle.label_expand'
+    model_save_dir = data + 'saved_models'
+
+    train(url2fea_path, topic2fea_path, train_path, val_path, model_save_dir)
--
GitLab