Revert "bug fix (#70)"

This reverts commit 01a4c474.

Revert "bug fix (#70)"
This reverts commit 01a4c474.
6a1eade6 · lilong12 · GitHub · 01a4c474 · 01a4c474 · 01a4c474
隐藏空白更改
内联并排

Showing with 0 addition and 195 deletion

demo/data_loader.py demo/data_loader.py +0 -74

demo/demo_noncv.py demo/demo_noncv.py +0 -121

未找到文件。
--- a/demo/data_loader.py
+++ b/demo/data_loader.py
-import numpy as np
-import sys
-import os
-word_title_num = 50
-word_cont_num = 1024
-word_att_num = 10
-CLASS_NUM = 1284213
-def pad_and_trunk(_list, fix_sz = -1):
-    if len(_list) > 0 and _list[0] == '':
-        _list = []
-    _list = _list[:fix_sz]
-    if len(_list) < fix_sz:
-        pad = ['0' for i in range(fix_sz - len(_list))]
-        _list.extend(pad)
-    return _list
-def generate_reader(url2fea, topic2fea, _path, class_num=CLASS_NUM):
-    def reader():
-        print 'file open.'
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-        if os.getenv("PADDLE_TRAINER_ENDPOINTS"):
-            trainer_count = len(os.getenv("PADDLE_TRAINER_ENDPOINTS").split(","))
-        else:
-            trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
-        f = open(_path)
-        sample_index = 0
-        for line in f:
-            line = line.strip('\n')
-            if len(line) == 0:
-                continue
-            part = line.split('\t')
-            url  = part[0]
-            title_ids = part[1]
-            content_ids = part[2]
-            label = int(part[3])
-            if sample_index % trainer_count != trainer_id:
-                sample_index += 1
-                continue
-            sample_index += 1
-            title_ids = pad_and_trunk(title_ids.split(','), word_title_num)
-            content_ids = pad_and_trunk(content_ids.split(','), word_cont_num)
-            title_input_x_train = np.asarray(title_ids, dtype='int64').reshape( (len(title_ids), 1) )
-            content_input_x_train = np.asarray(content_ids, dtype='int64').reshape( (len(content_ids), 1) )
-            label = np.array([label])
-            yield title_input_x_train, content_input_x_train, label
-        f.close()
-        print 'file close.'
-    return reader
-if __name__ == '__main__':
-    #load_validation(url2fea, topic2fea, './data_makeup/merge_att_data/format_sample_v1/test.sample.shuffle')
-    '''
-    for (x1, x2, x3, y) in generate_batch_from_file(url2fea, topic2fea, \
-                    './data_makeup/merge_att_data/format_sample_v1/train.sample.shuffle', 50):
-        print x1[0], x2[0], x3[0], y[0]
-        break
-    '''
-    for x1, x2, x3, x4 in generate_reader(None, None, './data_makeup/merge_att_data/format_sample_v4/test.10w.sample.shuffle').reader():
-        print x1, x2, x3, x4
-        break
--- a/demo/demo_noncv.py
+++ b/demo/demo_noncv.py
-import os
-import sys
-from plsc import Entry
-from plsc.models import BaseModel
-import paddle
-import paddle.fluid as fluid
-from utils import LogUtil
-import numpy as np
-CLASS_NUM = 1284213
-from data_loader import generate_reader
-class UserModel(BaseModel):
-    def __init__(self, emb_dim=512):
-        self.emb_dim = emb_dim
-    def build_network(self,
-                      input,
-                      is_train=True):
-        title_ids = input.title_ids
-        content_ids = input.content_ids
-        label = input.label
-        vob_size = 1841178 + 1
-        #embedding layer
-        #current shape is [-1, seq_length, emb_dim]
-        word_title_sequence_input = fluid.layers.embedding(
-            input=title_ids, size=[vob_size, 128], is_sparse=False,
-            param_attr=fluid.ParamAttr(name='word_embedding'))
-        word_cont_sequence_input = fluid.layers.embedding(
-            input=content_ids, size=[vob_size, 128], is_sparse=False,
-            param_attr=fluid.ParamAttr(name='word_embedding'))
-        #current shape is [-1, emb_dim, seq_length]
-        word_title_sequence_input = fluid.layers.transpose(word_title_sequence_input, perm=[0, 2, 1], name='title_transpose')
-        word_cont_sequence_input = fluid.layers.transpose(word_cont_sequence_input, perm=[0, 2, 1], name='cont_transpose')
-        #current shape is [-1, emb_dim, 1, seq_length], which is NCHW format
-        _shape = word_title_sequence_input.shape
-        word_title_sequence_input = fluid.layers.reshape(x=word_title_sequence_input,
-                    shape=[_shape[0], _shape[1], 1, _shape[2]], inplace=True, name='title_reshape')
-        _shape = word_cont_sequence_input.shape
-        word_cont_sequence_input = fluid.layers.reshape(x=word_cont_sequence_input,
-                    shape=[_shape[0], _shape[1], 1, _shape[2]], inplace=True, name='cont_reshape')
-        word_title_win_3 = fluid.layers.conv2d(input=word_title_sequence_input, num_filters=128,
-                                            filter_size=(1,3), stride=(1,1), padding=(0,1), act='relu',
-                                            name='word_title_win_3_conv')
-        word_title_x = fluid.layers.pool2d(input=word_title_win_3, pool_size=(1,4),
-                                           pool_type='max', pool_stride=(1,4),
-                                           name='word_title_win_3_pool')
-        word_cont_win_3 = fluid.layers.conv2d(input=word_cont_sequence_input, num_filters=128,
-                                            filter_size=(1,3), stride=(1,1), padding=(0,1), act='relu',
-                                            name='word_cont_win_3_conv')
-        word_cont_x = fluid.layers.pool2d(input=word_cont_win_3, pool_size=(1,20),
-                                           pool_type='max', pool_stride=(1,20), 
-                                           name='word_cont_win_3_pool')
-        print('word_title_x.shape:', word_title_x.shape)
-        print('word_cont_x.shape:', word_cont_x.shape)
-        x_concat = fluid.layers.concat(input=[word_title_x, word_cont_x], axis=3, name='feature_concat')
-        x_flatten = fluid.layers.flatten(x=x_concat, axis=1, name='feature_flatten')
-        x_fc = fluid.layers.fc(input=x_flatten, size=self.emb_dim, act="relu", name='final_fc')
-        return x_fc
-def train(url2fea_path, topic2fea_path, train_path, val_path, model_save_dir):
-    ins = Entry()
-    ins.set_with_test(False)
-    ins.set_train_epochs(20)
-    #load id features
-    word_title_num = 50
-    word_cont_num = 1024
-    batch_size = int(os.getenv("BATCH_SIZE", "64"))
-    input_info = [{'name': 'title_ids',
-                   'shape': [-1, word_title_num, 1],
-                   'dtype': 'int64'},
-                  {'name': 'content_ids',
-                   'shape': [-1, word_cont_num, 1],
-                   'dtype': 'int64'},
-                  {'name': 'label',
-                   'shape': [-1, 1],
-                   'dtype': 'int64'}
-                 ]
-    ins.set_input_info(input_info)
-    ins.set_class_num(CLASS_NUM)
-    emb_dim = int(os.getenv("EMB_DIM", "512"))
-    model = UserModel(emb_dim=emb_dim)
-    ins.set_model(model)
-    ins.set_train_batch_size(batch_size)
-    sgd_optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
-    ins.set_optimizer(sgd_optimizer)
-    train_reader = generate_reader(None, None, train_path)
-    ins.train_reader = train_reader
-    ins.set_train_epochs(20)
-    ins.set_model_save_dir("./saved_model")
-    ins.set_loss_type('dist_softmax')
-    ins.train()
-if __name__ == "__main__":
-    data = './package/'
-    url2fea_path = data + 'click_search_all.url_title_cont.seg.lower.id'
-    topic2fea_path = data + 'click_search_all.att.seg.id'
-    train_path = data +'train.sample.shuffle.label_expand'
-    val_path = data +'test.10w.sample.shuffle.label_expand'
-    model_save_dir = data + 'saved_models'
-    train(url2fea_path, topic2fea_path, train_path, val_path, model_save_dir)