#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import random
import unittest

import numpy as np
from simnet_dygraph_model import BOW, HingeLoss

import paddle
from paddle import fluid

# Seed Python's RNG so the fake samples built below are reproducible,
# which the dygraph-vs-static loss comparison in TestSimnet relies on.
SEED = 102
random.seed(SEED)


def create_conf_dict():
    """Return the SimNet configuration used by this test.

    Returns:
        dict: keys ``task_mode`` ("pairwise"), ``net`` (embedding/BOW/hidden
        dims) and ``loss`` (hinge-loss margin). ``train()`` later adds
        ``dict_size`` and ``seq_len`` in place.
    """
    # NOTE: scraped line-number artifacts that broke the original syntax
    # have been removed; the configuration values are unchanged.
    conf_dict = {}
    conf_dict["task_mode"] = "pairwise"
    conf_dict["net"] = {"emb_dim": 128, "bow_dim": 128, "hidden_dim": 128}
    conf_dict["loss"] = {"margin": 0.1}
    return conf_dict


def parse_args():
    """Build and parse the test's command-line options.

    Parses an empty argv (``[]``) so the unittest runner's own flags are
    never consumed and every option keeps its default.

    Returns:
        argparse.Namespace: batch_size (32), seq_len (32), epoch (1),
        fake_sample_size (128).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Total examples' number in batch for training.",
    )
    parser.add_argument(
        "--seq_len", type=int, default=32, help="The length of each sentence."
    )
    parser.add_argument(
        "--epoch", type=int, default=1, help="The number of training epoch."
    )
    parser.add_argument(
        "--fake_sample_size",
        type=int,
        default=128,
        help="The number of samples of fake data.",
    )
    # Explicit empty argv: defaults only, ignore sys.argv.
    args = parser.parse_args([])
    return args


# Parsed once at import time; parse_args([]) always yields the defaults.
args = parse_args()


def fake_vocabulary():
    """Build a toy vocabulary: ``"<unk>"`` maps to 0, ``'a'``..``'z'`` to 1..26."""
    vocab = {"<unk>": 0}
    for index, letter in enumerate("abcdefghijklmnopqrstuvwxyz", start=1):
        vocab[letter] = index
    return vocab


# Module-level vocabulary: "<unk>" plus the 26 lowercase letters (27 entries).
vocab = fake_vocabulary()


class FakeReaderProcessor:
    """Generates deterministic fake pairwise samples for the SimNet test.

    Each sample is an int64 array of shape ``(3, seq_len)`` holding
    ``[query, pos_title, neg_title]`` where ``pos_title`` is a copy of the
    query and ``neg_title`` is its element-wise complement (``26 - q``).
    """

    def __init__(self, args, vocab):
        """Pre-build ``args.fake_sample_size`` samples of length ``args.seq_len``.

        Args:
            args: namespace providing ``seq_len`` and ``fake_sample_size``.
            vocab: token->id mapping; stored but not consulted when sampling.
        """
        self.vocab = vocab
        self.seq_len = args.seq_len
        self.sample_size = args.fake_sample_size
        self.data_samples = []
        for _ in range(self.sample_size):
            # Token ids are drawn from [0, 26], matching the fake vocabulary.
            query = [random.randint(0, 26) for _ in range(self.seq_len)]
            pos_title = query[:]
            neg_title = [26 - q for q in query]
            self.data_samples.append(
                np.array([query, pos_title, neg_title]).astype(np.int64)
            )

    def get_reader(self, mode, epoch=0):
        """Return a generator function over the pre-built samples.

        Args:
            mode: only ``"train"`` yields data; any other mode yields nothing.
            epoch: accepted for API compatibility; unused.
        """

        def reader_with_pairwise():
            if mode == "train":
                for i in range(self.sample_size):
                    yield self.data_samples[i]

        return reader_with_pairwise


# Shared reader instance feeding the fake training samples to train().
simnet_process = FakeReaderProcessor(args, vocab)


def train(conf_dict, to_static):
    """Run one training pass over the fake data and collect per-step losses.

    Args:
        conf_dict (dict): SimNet config from ``create_conf_dict``; mutated in
            place to add ``dict_size`` and ``seq_len``.
        to_static (bool): whether to enable dygraph-to-static translation.

    Returns:
        list: one ``np.mean`` loss value per training step, used by the test
        to compare dygraph and to_static runs.
    """
    # NOTE: scraped line-number/commit artifacts that broke the original
    # syntax have been removed; the training logic is unchanged.
    paddle.jit.enable_to_static(to_static)

    # Pick GPU when the install supports it, otherwise CPU.
    if fluid.is_compiled_with_cuda():
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    with fluid.dygraph.guard(place):
        # Seed both the global generator and the program-level generator so
        # the dygraph and to_static runs see identical randomness.
        paddle.seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)

        conf_dict['dict_size'] = len(vocab)
        conf_dict['seq_len'] = args.seq_len

        net = BOW(conf_dict)
        loss = HingeLoss(conf_dict)
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=0.001,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08,
            parameter_list=net.parameters(),
        )

        # Kept from the original; never updated or read below.
        metric = fluid.metrics.Auc(name="auc")

        global_step = 0
        losses = []

        train_loader = fluid.io.DataLoader.from_generator(
            capacity=16, return_list=True, iterable=True, use_double_buffer=True
        )
        get_train_examples = simnet_process.get_reader(
            "train", epoch=args.epoch
        )
        train_loader.set_sample_list_generator(
            paddle.batch(get_train_examples, batch_size=args.batch_size), place
        )

        for left, pos_right, neg_right in train_loader():
            # Flatten each (3, seq_len) sample slice into a column of ids.
            left = paddle.reshape(left, shape=[-1, 1])
            pos_right = paddle.reshape(pos_right, shape=[-1, 1])
            neg_right = paddle.reshape(neg_right, shape=[-1, 1])
            net.train()
            global_step += 1
            left_feat, pos_score = net(left, pos_right)
            pred = pos_score  # kept from the original; unused below
            _, neg_score = net(left, neg_right)
            avg_cost = loss.compute(pos_score, neg_score)
            losses.append(np.mean(avg_cost.numpy()))
            avg_cost.backward()
            optimizer.minimize(avg_cost)
            net.clear_gradients()
    return losses


class TestSimnet(unittest.TestCase):
    """Verifies dygraph and to_static training produce matching losses."""

    def test_dygraph_static_same_loss(self):
        # cuDNN must be deterministic for the two runs to be comparable.
        if fluid.is_compiled_with_cuda():
            fluid.set_flags({"FLAGS_cudnn_deterministic": True})
        conf_dict = create_conf_dict()
        dygraph_loss = train(conf_dict, to_static=False)
        static_loss = train(conf_dict, to_static=True)

        self.assertEqual(len(dygraph_loss), len(static_loss))
        for dy_loss, st_loss in zip(dygraph_loss, static_loss):
            self.assertAlmostEqual(dy_loss, st_loss)


# Standard unittest entry point (scraper artifact line removed).
if __name__ == '__main__':
    unittest.main()