#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import random
import unittest

import numpy as np
from dygraph_to_static_util import test_and_compare_with_new_ir
from simnet_dygraph_model import BOW, HingeLoss

import paddle
from paddle import fluid

# Fixed seed: applied to Python's ``random`` module here at import time and
# passed to the Paddle seeding calls inside train().
SEED = 102
random.seed(SEED)


def create_conf_dict():
    """Build the configuration dictionary used by the test.

    Returns:
        dict: A freshly created dict with three entries:
            ``"task_mode"`` (the string ``"pairwise"``), ``"net"`` (embedding,
            bag-of-words and hidden dimensions, all 128) and ``"loss"``
            (the hinge-loss margin, 0.1).  A new dict is returned on every
            call, so callers may mutate it freely.
    """
    return {
        "task_mode": "pairwise",
        "net": {"emb_dim": 128, "bow_dim": 128, "hidden_dim": 128},
        "loss": {"margin": 0.1},
    }


def parse_args():
    """Return an ``argparse.Namespace`` holding the test's default settings.

    The parser is fed an empty argument list, so the result always contains
    the declared defaults and the process command line is never consulted.

    Returns:
        argparse.Namespace: with integer attributes ``batch_size`` (32),
        ``seq_len`` (32), ``epoch`` (1) and ``fake_sample_size`` (128).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Total examples' number in batch for training.",
    )
    parser.add_argument(
        "--seq_len", type=int, default=32, help="The length of each sentence."
    )
    parser.add_argument(
        "--epoch", type=int, default=1, help="The number of training epoch."
    )
    parser.add_argument(
        "--fake_sample_size",
        type=int,
        default=128,
        help="The number of samples of fake data.",
    )
    # Parse an explicit empty list instead of sys.argv.
    return parser.parse_args([])


# Module-level settings read by the dataset construction and by train().
args = parse_args()


def fake_vocabulary():
    """Create a toy vocabulary for the fake dataset.

    Returns:
        dict: ``"<unk>"`` mapped to 0, followed by the lowercase letters
        ``'a'`` through ``'z'`` mapped to 1 through 26, in that insertion
        order.
    """
    first_code = ord('a')
    letters = (chr(first_code + offset) for offset in range(26))
    table = {"<unk>": 0}
    table.update(
        (letter, index) for index, letter in enumerate(letters, start=1)
    )
    return table


# Shared fake vocabulary; train() uses its size as the model's ``dict_size``.
vocab = fake_vocabulary()


class FakeReaderProcessor(paddle.io.Dataset):
    """In-memory dataset of synthetic (query, pos_title, neg_title) triples.

    ``args.fake_sample_size`` samples are generated up front with the
    ``random`` module.  At most ``length`` of them are then copied into the
    lists served by ``__getitem__`` / ``__len__``.

    Args:
        args: Namespace providing ``seq_len`` and ``fake_sample_size``;
            ``epoch`` is read as well when the attribute exists.
        vocab: Vocabulary mapping.  It is stored on the instance but is not
            consulted while generating samples.
        length: Upper bound on the number of samples exposed by indexing.
    """

    def __init__(self, args, vocab, length):
        self.vocab = vocab
        self.seq_len = args.seq_len
        self.sample_size = args.fake_sample_size
        # Keep the epoch on the instance so _init_data does not depend on a
        # module-level ``args`` object.
        self.epoch = getattr(args, "epoch", 0)
        self.data_samples = []
        for _ in range(self.sample_size):
            # Token ids are drawn uniformly from the closed range [0, 26].
            query = [random.randint(0, 26) for _ in range(self.seq_len)]
            pos_title = query[:]
            # The negative title mirrors every id around the range (26 - id).
            neg_title = [26 - q for q in query]
            # Each sample is one int64 array whose rows are
            # query / pos_title / neg_title.
            self.data_samples.append(
                np.array([query, pos_title, neg_title]).astype(np.int64)
            )
        self.query = []
        self.pos_title = []
        self.neg_title = []
        self._init_data(length)

    def get_reader(self, mode, epoch=0):
        """Return a zero-argument generator function over the raw samples.

        The generator yields every pre-built sample once when ``mode`` is
        ``"train"`` and yields nothing for any other mode.  ``epoch`` is
        accepted but not used.
        """

        def reader_with_pairwise():
            if mode == "train":
                yield from self.data_samples

        return reader_with_pairwise

    def _init_data(self, length):
        """Copy at most ``length`` training samples into the indexable lists."""
        reader = self.get_reader("train", epoch=self.epoch)()
        for i, sample in enumerate(reader):
            if i >= length:
                break
            self.query.append(sample[0])
            self.pos_title.append(sample[1])
            self.neg_title.append(sample[2])

    def __getitem__(self, idx):
        """Return the ``(query, pos_title, neg_title)`` triple at ``idx``."""
        return self.query[idx], self.pos_title[idx], self.neg_title[idx]

    def __len__(self):
        """Return the number of samples exposed through indexing."""
        return len(self.query)


# Dataset instance shared by every train() call.  The third argument caps the
# number of samples it exposes at batch_size * (epoch + 1).
simnet_process = FakeReaderProcessor(
    args, vocab, args.batch_size * (args.epoch + 1)
)


def train(conf_dict, to_static):
    """Train the BOW model on the fake dataset and return per-batch losses.

    Args:
        conf_dict: Model / loss configuration passed to ``BOW`` and
            ``HingeLoss``.  The ``'dict_size'`` and ``'seq_len'`` entries are
            written into it before the model is built, so the caller's dict
            is modified in place.
        to_static: Value forwarded to ``paddle.jit.enable_to_static``.

    Returns:
        list: One entry per processed batch, each the NumPy mean of that
        batch's loss tensor.
    """
    paddle.jit.enable_to_static(to_static)

    # Get device
    if fluid.is_compiled_with_cuda():
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    with fluid.dygraph.guard(place):
        # Re-seed on every call so the dygraph and to_static runs start from
        # the same seed value.
        paddle.seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)

        conf_dict['dict_size'] = len(vocab)
        conf_dict['seq_len'] = args.seq_len

        net = BOW(conf_dict)
        loss = HingeLoss(conf_dict)
        optimizer = paddle.optimizer.Adam(
            learning_rate=0.001,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08,
            parameters=net.parameters(),
        )

        losses = []

        train_loader = paddle.io.DataLoader(
            simnet_process, batch_size=args.batch_size, places=[place]
        )

        # Switching to training mode is loop-invariant, so do it once.
        net.train()
        for left, pos_right, neg_right in train_loader():
            # Flatten every batch to a single column before the forward pass.
            left = paddle.reshape(left, shape=[-1, 1])
            pos_right = paddle.reshape(pos_right, shape=[-1, 1])
            neg_right = paddle.reshape(neg_right, shape=[-1, 1])

            # Only the second output of the network (the score) is used.
            _, pos_score = net(left, pos_right)
            _, neg_score = net(left, neg_right)
            avg_cost = loss.compute(pos_score, neg_score)
            losses.append(np.mean(avg_cost.numpy()))

            avg_cost.backward()
            optimizer.minimize(avg_cost)
            net.clear_gradients()
    return losses


class TestSimnet(unittest.TestCase):
    """Compares the training losses of the dygraph and to_static runs."""

    @test_and_compare_with_new_ir(True)
    def test_dygraph_static_same_loss(self):
        """Train once per mode and require matching per-step losses."""
        if fluid.is_compiled_with_cuda():
            # NOTE(review): presumably forces deterministic cuDNN kernels so
            # the two runs are comparable on GPU -- confirm against the
            # Paddle flag documentation.
            fluid.set_flags({"FLAGS_cudnn_deterministic": True})
        conf_dict = create_conf_dict()
        dygraph_loss = train(conf_dict, to_static=False)
        static_loss = train(conf_dict, to_static=True)

        self.assertEqual(len(dygraph_loss), len(static_loss))
        for step, (dygraph_value, static_value) in enumerate(
            zip(dygraph_loss, static_loss)
        ):
            self.assertAlmostEqual(
                dygraph_value,
                static_value,
                msg=f"loss mismatch at step {step}",
            )


# Run the test suite when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()