#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
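
# This test trains a small PTB-style LSTM language model twice: once with
# dynamic-to-static (@to_static) translation enabled and once in pure dygraph
# mode, then checks that both runs produce identical losses and final states.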

import logging
import time
import unittest

import numpy as np

import paddle

PRINT_STEP = 20
SEED = 2020


class SimpleLSTMRNN(paddle.nn.Layer):
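    """A multi-layer LSTM unrolled by hand from matmul/split/sigmoid/tanh ops."""
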
    def __init__(
        self, hidden_size, num_steps, num_layers=2, init_scale=0.1, dropout=None
    ):
        super().__init__()
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._init_scale = init_scale
        self._dropout = dropout
        self._num_steps = num_steps
        self.cell_array = []
        self.hidden_array = []

        self.weight_1_arr = []
        self.weight_2_arr = []
        self.bias_arr = []
        self.mask_array = []

        for i in range(self._num_layers):
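            # One fused gate weight [2 * hidden, 4 * hidden] and bias [4 * hidden] per layer.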
            weight_1 = self.create_parameter(
                attr=paddle.ParamAttr(
                    initializer=paddle.nn.initializer.Uniform(
                        low=-self._init_scale, high=self._init_scale
                    )
                ),
                shape=[self._hidden_size * 2, self._hidden_size * 4],
                dtype="float32",
                default_initializer=paddle.nn.initializer.Uniform(
                    low=-self._init_scale, high=self._init_scale
                ),
            )
            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
            bias_1 = self.create_parameter(
                attr=paddle.ParamAttr(
                    initializer=paddle.nn.initializer.Uniform(
                        low=-self._init_scale, high=self._init_scale
                    )
                ),
                shape=[self._hidden_size * 4],
                dtype="float32",
                default_initializer=paddle.nn.initializer.Constant(0.0),
            )
            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))

    def forward(self, input_embedding, init_hidden=None, init_cell=None):
        cell_array = []
        hidden_array = []

        for i in range(self._num_layers):
            hidden_array.append(init_hidden[i])
            cell_array.append(init_cell[i])

        res = []
        for index in range(self._num_steps):
            step_input = input_embedding[:, index, :]
            for k in range(self._num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = self.weight_1_arr[k]
                bias = self.bias_arr[k]

                nn = paddle.concat(x=[step_input, pre_hidden], axis=1)
                gate_input = paddle.matmul(x=nn, y=weight_1)

                gate_input = paddle.add(x=gate_input, y=bias)
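                # Split the fused projection into input (i), cell candidate (j),
                # forget (f) and output (o) gates.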
                i, j, f, o = paddle.split(
                    x=gate_input, num_or_sections=4, axis=-1
                )
                c = pre_cell * paddle.nn.functional.sigmoid(
                    f
                ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
                m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
                hidden_array[k] = m
                cell_array[k] = c
                step_input = m

                if self._dropout is not None and self._dropout > 0.0:
                    step_input = paddle.nn.functional.dropout(
                        step_input,
                        p=self._dropout,
                        mode='upscale_in_train',
                    )
            res.append(step_input)
        real_res = paddle.concat(x=res, axis=1)
        real_res = paddle.reshape(
            real_res, [-1, self._num_steps, self._hidden_size]
        )
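        # Stack per-layer final states into [num_layers, batch, hidden] tensors.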
        last_hidden = paddle.concat(x=hidden_array, axis=1)
        last_hidden = paddle.reshape(
            last_hidden, shape=[-1, self._num_layers, self._hidden_size]
        )
        last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2])
        last_cell = paddle.concat(x=cell_array, axis=1)
        last_cell = paddle.reshape(
            last_cell, shape=[-1, self._num_layers, self._hidden_size]
        )
        last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2])
        return real_res, last_hidden, last_cell


class PtbModel(paddle.nn.Layer):
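    """PTB language model: embedding, stacked LSTM, and softmax projection."""
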
    def __init__(
        self,
        hidden_size,
        vocab_size,
        num_layers=2,
        num_steps=20,
        init_scale=0.1,
        dropout=None,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.init_scale = init_scale
        self.num_layers = num_layers
        self.num_steps = num_steps
        self.dropout = dropout
        self.simple_lstm_rnn = SimpleLSTMRNN(
            hidden_size,
            num_steps,
            num_layers=num_layers,
            init_scale=init_scale,
            dropout=dropout,
        )
        self.embedding = paddle.nn.Embedding(
            vocab_size,
            hidden_size,
            sparse=False,
            weight_attr=paddle.ParamAttr(
                name='embedding_para',
                initializer=paddle.nn.initializer.Uniform(
                    low=-init_scale, high=init_scale
                ),
            ),
        )
        self.softmax_weight = self.create_parameter(
            attr=paddle.ParamAttr(),
            shape=[self.hidden_size, self.vocab_size],
            dtype="float32",
            default_initializer=paddle.nn.initializer.Uniform(
                low=-self.init_scale, high=self.init_scale
            ),
        )
        self.softmax_bias = self.create_parameter(
            attr=paddle.ParamAttr(),
            shape=[self.vocab_size],
            dtype="float32",
            default_initializer=paddle.nn.initializer.Uniform(
                low=-self.init_scale, high=self.init_scale
            ),
        )

    def build_once(self, input, label, init_hidden, init_cell):
        pass

    @paddle.jit.to_static
    def forward(self, input, label, init_hidden, init_cell):

        init_h = paddle.reshape(
            init_hidden, shape=[self.num_layers, -1, self.hidden_size]
        )

        init_c = paddle.reshape(
            init_cell, shape=[self.num_layers, -1, self.hidden_size]
        )

        x_emb = self.embedding(input)

        x_emb = paddle.reshape(
            x_emb, shape=[-1, self.num_steps, self.hidden_size]
        )
        if self.dropout is not None and self.dropout > 0.0:
            x_emb = paddle.nn.functional.dropout(
                x_emb,
                p=self.dropout,
                mode='upscale_in_train',
            )
        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(
            x_emb, init_h, init_c
        )

        projection = paddle.matmul(x=rnn_out, y=self.softmax_weight)
        projection = paddle.add(x=projection, y=self.softmax_bias)

        loss = paddle.nn.functional.softmax_with_cross_entropy(
            logits=projection, label=label, soft_label=False
        )
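        # Reshape per-token loss to [batch, num_steps]; mean over batch, sum over time steps.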
        loss = paddle.reshape(loss, shape=[-1, self.num_steps])
        loss = paddle.mean(loss, axis=[0])
        loss = paddle.sum(loss)

        return loss, last_hidden, last_cell

    def debug_emb(self):

        np.save("emb_grad", self.x_emb.gradient())


def train(place):

    num_layers = 1
    batch_size = 4
    hidden_size = 10
    num_steps = 3
    init_scale = 0.1
    max_epoch = 1
    dropout = 0.0
    vocab_size = 1000
    batch_num = 200

    paddle.disable_static(place)
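    # Seed both the dygraph RNG and the program generated by to_static so the
    # two runs are deterministic and comparable.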
    paddle.seed(SEED)
    paddle.framework.random._manual_program_seed(SEED)
    ptb_model = PtbModel(
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        num_layers=num_layers,
        num_steps=num_steps,
        init_scale=init_scale,
        dropout=dropout,
    )

    sgd = paddle.optimizer.SGD(
        learning_rate=1e-3, parameters=ptb_model.parameters()
    )

    for epoch_id in range(max_epoch):

        total_loss = 0.0
        iters = 0.0
        total_sample = 0

        init_hidden_data = np.zeros(
            (num_layers, batch_size, hidden_size), dtype='float32'
        )
        init_cell_data = np.zeros(
            (num_layers, batch_size, hidden_size), dtype='float32'
        )

        init_hidden = paddle.to_tensor(
            data=init_hidden_data, dtype=None, place=None, stop_gradient=True
        )
        init_cell = paddle.to_tensor(
            data=init_cell_data, dtype=None, place=None, stop_gradient=True
        )
        for step_id in range(batch_num):
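            # Fixed synthetic batch: token ids 0..11 as input, the same ids shifted
            # by one as targets.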
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            y_data = y_data.reshape((-1, 1))

            x_data = x_data.reshape((-1, num_steps, 1))
            y_data = y_data.reshape((-1, num_steps, 1))

            x = paddle.to_tensor(
                data=x_data, dtype=None, place=None, stop_gradient=True
            )
            y = paddle.to_tensor(
                data=y_data, dtype=None, place=None, stop_gradient=True
            )

            dy_loss, last_hidden, last_cell = ptb_model(
                x, y, init_hidden, init_cell
            )
            out_loss = dy_loss.numpy()

            dy_loss.backward()
            sgd.minimize(dy_loss)
            ptb_model.clear_gradients()

            total_loss += out_loss
            iters += num_steps
            total_sample += 1
            if step_id % PRINT_STEP == 0:
                if step_id == 0:
                    logging.info(
                        "epoch %d | step %d, loss %0.3f"
                        % (epoch_id, step_id, total_loss / total_sample)
                    )
                    avg_batch_time = time.time()
                else:
                    speed = PRINT_STEP / (time.time() - avg_batch_time)
                    logging.info(
                        "epoch %d | step %d, loss %0.3f, speed %.3f steps/s"
                        % (epoch_id, step_id, total_loss / total_sample, speed)
                    )
                    avg_batch_time = time.time()

    ret = out_loss, last_hidden.numpy(), last_cell.numpy()
    paddle.enable_static()
    return ret


def train_dygraph(place):
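    # Same training loop with dynamic-to-static translation disabled (pure dygraph).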
    paddle.jit.enable_to_static(False)
    return train(place)


def train_static(place):
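    # Same training loop with dynamic-to-static translation enabled.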
    paddle.jit.enable_to_static(True)
    return train(place)


class TestPtb(unittest.TestCase):
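    """Static (to_static) and dygraph training should produce identical results."""
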
    def setUp(self):
        self.place = (
            paddle.CUDAPlace(0)
            if paddle.is_compiled_with_cuda()
            else paddle.CPUPlace()
        )

    def test_check_result(self):
        loss_1, hidden_1, cell_1 = train_static(self.place)
        loss_2, hidden_2, cell_2 = train_dygraph(self.place)

        np.testing.assert_allclose(loss_1, loss_2, rtol=1e-05)
        np.testing.assert_allclose(hidden_1, hidden_2, rtol=1e-05)
        np.testing.assert_allclose(cell_1, cell_2, rtol=1e-05)


if __name__ == '__main__':
    unittest.main()