test_mnist_pure_fp16.py
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from time import time

import numpy as np
from test_mnist import MNIST, SEED, TestMNIST

import paddle

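# cuDNN selects convolution algorithms nondeterministically by default; force
# deterministic kernels so the dygraph/static loss comparison is reproducible.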
if paddle.fluid.is_compiled_with_cuda():
    paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True})


class TestPureFP16(TestMNIST):
    def train_static(self):
        return self.train(to_static=True)

    def train_dygraph(self):
        return self.train(to_static=False)

    def test_mnist_to_static(self):
        if paddle.fluid.is_compiled_with_cuda():
            dygraph_loss = self.train_dygraph()
            static_loss = self.train_static()
            # NOTE: In pure fp16 training, loss is not stable, so we enlarge atol here.
            np.testing.assert_allclose(
                dygraph_loss,
                static_loss,
                rtol=1e-05,
                atol=0.001,
                err_msg='dygraph_loss is {}\nstatic_loss is {}'.format(
                    dygraph_loss, static_loss
                ),
            )

    def train(self, to_static=False):
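        # Seed numpy, the dygraph generator, and static programs so both
        # training modes start from identical random states.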
        np.random.seed(SEED)
        paddle.seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)

        mnist = MNIST()

        if to_static:
            print("Successfully to apply @to_static.")
58 59 60 61 62
            build_strategy = paddle.static.BuildStrategy()
            # Why set `build_strategy.enable_inplace = False` here?
            # Because this inplace pass of ParallelExecutor makes the dy2st
            # training loss unstable.
            build_strategy.enable_inplace = False
            mnist = paddle.jit.to_static(mnist, build_strategy=build_strategy)

        optimizer = paddle.optimizer.Adam(
            learning_rate=0.001, parameters=mnist.parameters()
        )

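        # Loss scaling guards fp16 gradients against underflow: the loss is
        # multiplied by the scale before backward and unscaled at the update.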
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

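        # Level 'O2' casts the model parameters to fp16 for pure-fp16 training;
        # save_dtype='float32' converts them back to fp32 when checkpoints are
        # saved.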
        mnist, optimizer = paddle.amp.decorate(
            models=mnist, optimizers=optimizer, level='O2', save_dtype='float32'
        )

        loss_data = []
        for epoch in range(self.epoch_num):
            start = time()
            for batch_id, data in enumerate(self.train_reader()):
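                # Batch the reader output: images become (N, 1, 28, 28) float32
                # arrays and labels (N, 1) int64 arrays.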
                dy_x_data = np.array(
                    [x[0].reshape(1, 28, 28) for x in data]
                ).astype('float32')
                y_data = (
                    np.array([x[1] for x in data])
                    .astype('int64')
                    .reshape(-1, 1)
                )

                img = paddle.to_tensor(dy_x_data)
                label = paddle.to_tensor(y_data)
                label.stop_gradient = True

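                # Under 'O2' autocast the forward pass runs in fp16, except for
                # ops on the black list, which are kept in fp32.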
                with paddle.amp.auto_cast(
                    enable=True,
                    custom_white_list=None,
                    custom_black_list=None,
                    level='O2',
                ):
                    prediction, acc, avg_loss = mnist(img, label=label)

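                # Scale the loss, backprop the scaled gradients, then let the
                # scaler unscale them and step the optimizer (the step is
                # skipped and the scale shrunk if inf/nan gradients appear).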
                scaled = scaler.scale(avg_loss)
                scaled.backward()
                scaler.minimize(optimizer, scaled)

                loss_data.append(avg_loss.numpy()[0])
                # clear gradients for the next step
                mnist.clear_gradients()
                if batch_id % 2 == 0:
                    print(
                        "Loss at epoch {} step {}: loss: {}, acc: {}, cost: {}".format(
                            epoch,
                            batch_id,
                            avg_loss.numpy(),
                            acc.numpy(),
                            time() - start,
                        )
                    )
                    start = time()
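                # A few batches are enough for the loss comparison; stop early
                # to keep the test fast.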
                if batch_id == 10:
                    break
        return loss_data


if __name__ == '__main__':
    unittest.main()