scdb_parallel_executor.py
import unittest
import contextlib
import paddle
import paddle.fluid as fluid
import numpy as np
import six
import sys
import time
import os
import json
import random


def to_lodtensor(data, place):
    """
    convert to LODtensor
    """
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


def load_vocab(filename):
    """
    load imdb vocabulary
    """
    vocab = {}
    with open(filename) as f:
        wid = 0
        for line in f:
            vocab[line.strip()] = wid
            wid += 1
    vocab["<unk>"] = len(vocab)
    return vocab


def data2tensor(data, place):
    """
    convert a batch of (word ids, label) pairs into a feed dict
    """
    input_seq = to_lodtensor([x[0] for x in data], place)
    y_data = np.array([x[1] for x in data]).astype("int64")
    y_data = y_data.reshape([-1, 1])
    return {"words": input_seq, "label": y_data}


def data2pred(data, place):
    """
    convert a batch into a prediction feed (words only)
    """
    input_seq = to_lodtensor([x[0] for x in data], place)
    y_data = np.array([x[1] for x in data]).astype("int64")
    y_data = y_data.reshape([-1, 1])
    return {"words": input_seq}


def load_dict(vocab):
    """
    Load dict from vocab
    """
    word_dict = dict()
    with open(vocab, "r") as fin:
        for line in fin:
            cols = line.strip("\r\n").decode("gb18030").split("\t")
            word_dict[cols[0]] = int(cols[1])
    return word_dict


def save_dict(word_dict, vocab):
    """
    Save dict into file
    """
    with open(vocab, "w") as fout:
        for k, v in six.iteritems(word_dict):
            outstr = ("%s\t%s\n" % (k, v)).encode("gb18030")
            fout.write(outstr)


def build_dict(fname):
    """
    build word dict using trainset
    """
    word_dict = dict()
    with open(fname, "r") as fin:
        for line in fin:
            try:
                words = line.strip("\r\n").decode("gb18030").split("\t")[
                    1].split(" ")
            except Exception:
                sys.stderr.write("[warning] build_dict: decode error\n")
                continue
            for w in words:
                if w not in word_dict:
                    word_dict[w] = len(word_dict)
    return word_dict


def scdb_word_dict(vocab="scdb_data/train_set/train.vocab",
                   train_file="scdb_data/train_set/corpus.train.seg"):
    """
    get word_dict
    """
    # train_file is assumed to point at the segmented training corpus (same
    # default as scdb_train_data); it is only read when no vocab file exists.
    if not os.path.exists(vocab):
        w_dict = build_dict(train_file)
        save_dict(w_dict, vocab)
    else:
        w_dict = load_dict(vocab)
    w_dict["<unk>"] = len(w_dict)
    return w_dict


def data_reader(fname, word_dict, is_dir=False):
    """
    Convert word sequence into slot
    """
    unk_id = len(word_dict)
    all_data = []
    filelist = []
    if is_dir:
        filelist = [fname + os.sep + f for f in os.listdir(fname)]
    else:
        filelist = [fname]

    for each_name in filelist:
        with open(each_name, "r") as fin:
            for line in fin:
                try:
                    cols = line.strip("\r\n").decode("gb18030").split("\t")
                except Exception:
                    sys.stderr.write("warning: ignore decode error\n")
                    continue

                label = int(cols[0])
                wids = [
                    word_dict[x] if x in word_dict else unk_id
                    for x in cols[1].split(" ")
                ]
                all_data.append((wids, label))

    random.shuffle(all_data)

    def reader():
        for doc, label in all_data:
            yield doc, label

    return reader


def scdb_train_data(train_dir="scdb_data/train_set/corpus.train.seg",
                    w_dict=None):
    """
    create train data
    """
    return data_reader(train_dir, w_dict, True)


def scdb_test_data(test_file, w_dict):
    """
    test_set=["car", "lbs", "spot", "weibo",
            "baby", "toutiao", "3c", "movie", "haogan"]
    """
    return data_reader(test_file, w_dict)


def bow_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2):
    """
    bow net
    """
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)

    return avg_cost, acc, prediction


def cnn_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            win_size=3):
    """
    conv net
    """
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])

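    # sequence_conv_pool applies a sequence convolution with window size
    # win_size followed by max pooling over the time dimension.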
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max")

    fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)

    prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)

    return avg_cost, acc, prediction


def lstm_net(data,
             label,
             dict_dim,
             emb_dim=128,
             hid_dim=128,
             hid_dim2=96,
             class_dim=2,
             emb_lr=30.0):
    """
    lstm net
    """
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))

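    # dynamic_lstm requires its input width to be 4 * hidden size (one slice
    # per gate), so the embedding is first projected to hid_dim * 4.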
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)

    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)

    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
    lstm_max_tanh = fluid.layers.tanh(lstm_max)

    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')

    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')

    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)

    return avg_cost, acc, prediction


def bilstm_net(data,
               label,
               dict_dim,
               emb_dim=128,
               hid_dim=128,
               hid_dim2=96,
               class_dim=2,
               emb_lr=30.0):
    """
    bi-lstm net
    """
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))

    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)

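    # A second projection feeds the reverse-direction LSTM; the last-step
    # states of both directions are concatenated below into one feature.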
    rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)

    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)

    rlstm_h, c = fluid.layers.dynamic_lstm(
        input=rfc0, size=hid_dim * 4, is_reverse=True)

    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)

    lstm_last_tanh = fluid.layers.tanh(lstm_last)
    rlstm_last_tanh = fluid.layers.tanh(rlstm_last)

    lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1)

    fc1 = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')

    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')

    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)

    return avg_cost, acc, prediction


def gru_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            emb_lr=30.0):
    """
    gru net
    """
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))

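    # dynamic_gru expects an input width of 3 * hidden size (update gate,
    # reset gate and candidate), so the embedding is projected to hid_dim * 3.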
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)

    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)

    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
    gru_max_tanh = fluid.layers.tanh(gru_max)

    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')

    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')

    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)

    return avg_cost, acc, prediction


def infer(test_reader, use_cuda, model_path=None):
    """
    inference function
    """
    if model_path is None:
        print(str(model_path) + " cannot be found")
        return

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(model_path, exe)

        class2_list, class3_list = [], []
        for each_test_reader in test_reader:
            class2_acc, class3_acc = 0.0, 0.0
            total_count, neu_count = 0, 0

            for data in each_test_reader():
                pred = exe.run(inference_program,
                               feed=data2pred(data, place),
                               fetch_list=fetch_targets,
                               return_numpy=True)

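                # class2: binary accuracy over positive (label 2) vs negative
                # (label 0) docs with a 0.5 threshold on the positive score;
                # class3: three-way accuracy that also credits neutral docs
                # (label 1) when the positive score falls in (0.45, 0.55].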
                for i, val in enumerate(data):
                    pos_score = pred[0][i, 1]
                    true_label = val[1]
                    if true_label == 2.0 and pos_score > 0.5:
                        class2_acc += 1
                    if true_label == 0.0 and pos_score < 0.5:
                        class2_acc += 1

                    if true_label == 2.0 and pos_score > 0.55:
                        class3_acc += 1
                    if true_label == 1.0 and 0.45 < pos_score <= 0.55:
                        class3_acc += 1
                    if true_label == 0.0 and pos_score <= 0.45:
                        class3_acc += 1

                    if true_label == 1.0:
                        neu_count += 1

                total_count += len(data)

            class2_acc = class2_acc / (total_count - neu_count)
            class3_acc = class3_acc / total_count
            class2_list.append(class2_acc)
            class3_list.append(class3_acc)

        class2_acc = sum(class2_list) / len(class2_list)
        class3_acc = sum(class3_list) / len(class3_list)
        print("[test info] model_path: %s, class2_acc: %f, class3_acc: %f" %
              (model_path, class2_acc, class3_acc))


def start_train(train_reader,
                test_reader,
                word_dict,
                network,
                use_cuda,
                parallel,
                save_dirname,
                lr=0.2,
                batch_size=128,
                pass_num=30):
    """
    train network
    """
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    cost, acc, pred = network(data, label, len(word_dict) + 1)

    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    optimizer.minimize(cost)

    place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)

    start_exe = fluid.Executor(place)
    start_exe.run(fluid.default_startup_program())

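    # ParallelExecutor runs the program on all available devices; each step
    # returns one cost/acc value per device, which are accumulated below.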
    exe = fluid.ParallelExecutor(use_cuda, loss_name=cost.name)
    for pass_id in six.moves.xrange(pass_num):
        total_acc, total_cost, total_count, avg_cost, avg_acc = 0.0, 0.0, 0.0, 0.0, 0.0
        for data in train_reader():
            cost_val, acc_val = exe.run(feed=feeder.feed(data),
                                        fetch_list=[cost.name, acc.name])
            cost_val_list, acc_val_list = np.array(cost_val), np.array(acc_val)
            total_cost += cost_val_list.sum() * len(data)
            total_acc += acc_val_list.sum() * len(data)
            total_count += len(data)

        avg_cost = total_cost / total_count
        avg_acc = total_acc / total_count
        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
              (pass_id, avg_acc, avg_cost))

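        # Save an inference-only model after each pass and evaluate it on the
        # test readers right away.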
        gpu_place = fluid.CUDAPlace(0)
        save_exe = fluid.Executor(gpu_place)
        epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
        fluid.io.save_inference_model(epoch_model, ["words"], pred, save_exe)
        infer(test_reader, False, epoch_model)


def train_net(vocab="./thirdparty/train.vocab",
              train_dir="./train",
              test_list=["car", "spot", "weibo", "lbs"]):
    """
    w_dict = scdb_word_dict(vocab=vocab)
    test_files = [ "./thirdparty" + os.sep + f for f in test_list]

    train_reader = paddle.batch(
                        scdb_train_data(train_dir, w_dict),
                        batch_size = 256)

    test_reader = [paddle.batch(scdb_test_data(test_file, w_dict), batch_size = 50) \
            for test_file in test_files]
    """
    w_dict = paddle.dataset.imdb.word_dict()
    print("dict ready")
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(w_dict), buf_size=50000),
        batch_size=128)

    test_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.test(w_dict), buf_size=50000),
        batch_size=128)
    test_reader = [test_reader]
    start_train(
        train_reader,
        test_reader,
        w_dict,
        bilstm_net,
        use_cuda=True,
        parallel=False,
        save_dirname="scdb_bilstm_model",
        lr=0.05,
        pass_num=10,
        batch_size=256)


if __name__ == "__main__":
    train_net()