# test_merged_momentum_op.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle
import numpy as np
from paddle.fluid.layer_helper import LayerHelper
from collections import OrderedDict


def run_momentum_op(params,
                    grads,
                    velocitys,
                    master_params,
                    learning_rate,
                    place,
                    multi_precision,
                    mu=0.9,
                    rescale_grad=0.01,
                    use_merged=False):
    """Build and run a static program that applies one momentum update.

    With ``use_merged=True`` a single ``merged_momentum`` op receives the
    lists of all parameters, gradients and velocities; otherwise one
    ``momentum`` op is appended per parameter.

    Args:
        params: arrays fed as the parameters (read via ``.shape``/``.dtype``).
        grads: arrays fed as the gradients, same length as ``params``.
        velocitys: arrays fed as the velocities, same length as ``params``.
        master_params: arrays fed as the master parameters; only used (and
            only length-checked) when ``multi_precision`` is true.
        learning_rate: array fed as the learning rate.
        place: place handed to ``paddle.static.Executor``.
        multi_precision: forwarded as the ``multi_precision`` op attribute
            and controls whether the MasterParam input/output is wired up.
        mu: forwarded as the ``mu`` op attribute.
        rescale_grad: forwarded as the ``rescale_grad`` op attribute.
        use_merged: selects ``merged_momentum`` instead of ``momentum``.

    Returns:
        Whatever ``Executor.run`` returns for the fetch list: all parameters
        first, then all velocities, and, when ``multi_precision`` is true and
        ``place`` is a ``paddle.CUDAPlace``, all master parameters.
    """
    assert len(params) == len(grads)
    assert len(params) == len(velocitys)
    if multi_precision:
        assert len(params) == len(master_params)

    # Every local bound up to the LayerHelper call is forwarded to it through
    # **locals(), so no extra names are introduced before that line.
    op_type = 'merged_momentum' if use_merged else 'momentum'
    main = paddle.static.Program()
    startup = paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        helper = LayerHelper(op_type, **locals())

        def new_var(array, **extra):
            # Variable mirroring the shape and dtype of the array fed into it.
            return helper.create_variable(
                shape=array.shape, dtype=array.dtype, **extra)

        # Creation order is kept: params, grads, velocities, learning rate.
        param_vars = [new_var(p, persistable=True) for p in params]
        grad_vars = [new_var(g) for g in grads]
        velocity_vars = [new_var(v, persistable=True) for v in velocitys]
        lr_var = new_var(learning_rate, persistable=True)

        feed_dict = OrderedDict()
        for var, value in zip(param_vars, params):
            feed_dict[var.name] = value
        for var, value in zip(velocity_vars, velocitys):
            feed_dict[var.name] = value
        # Only the parameters and velocities fed so far are fetched back.
        fetch_list = list(feed_dict)

        for var, value in zip(grad_vars, grads):
            feed_dict[var.name] = value
        feed_dict[lr_var.name] = learning_rate

        master_param_vars = None
        if multi_precision:
            master_param_vars = [
                new_var(mp, persistable=True) for mp in master_params
            ]
            for var, value in zip(master_param_vars, master_params):
                feed_dict[var.name] = value
            # CPUPlace does not use MasterParam
            if isinstance(place, paddle.CUDAPlace):
                fetch_list.extend(var.name for var in master_param_vars)

        attrs = {
            'mu': mu,
            'multi_precision': multi_precision,
            'rescale_grad': rescale_grad,
        }

        def op_io(param, grad, velocity, master):
            # Input/output maps for one op; each argument is either a single
            # variable (momentum) or a list of variables (merged_momentum).
            op_inputs = {
                'Param': param,
                'Grad': grad,
                'Velocity': velocity,
                'LearningRate': lr_var,
            }
            op_outputs = {'ParamOut': param, 'VelocityOut': velocity}
            if multi_precision:
                op_inputs['MasterParam'] = master
                op_outputs['MasterParamOut'] = master
            return op_inputs, op_outputs

        if use_merged:
            pending_ops = [
                op_io(param_vars, grad_vars, velocity_vars, master_param_vars)
            ]
        else:
            pending_ops = []
            triples = zip(param_vars, grad_vars, velocity_vars)
            for idx, (p_var, g_var, v_var) in enumerate(triples):
                master = master_param_vars[idx] if multi_precision else None
                pending_ops.append(op_io(p_var, g_var, v_var, master))

        for op_inputs, op_outputs in pending_ops:
            helper.append_op(
                type=op_type,
                inputs=op_inputs,
                outputs=op_outputs,
                attrs=attrs)

    executor = paddle.static.Executor(place)
    with paddle.static.scope_guard(paddle.static.Scope()):
        executor.run(startup)
        return executor.run(main, feed=feed_dict, fetch_list=fetch_list)


class TestMergedMomentum(unittest.TestCase):
    """Check that ``merged_momentum`` matches per-parameter ``momentum`` ops.

    The same randomly generated inputs are run through ``run_momentum_op``
    twice (``use_merged=True`` and ``use_merged=False``) and the fetched
    outputs are compared on every available place.
    """

    def setUp(self):
        # run_momentum_op builds paddle.static programs.
        paddle.enable_static()
        self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
        self.seed = 10

    def gen_rand_data(self, shapes, dtype):
        """Return one ``np.random.random`` array per shape, cast to ``dtype``.

        Draws from NumPy's global random state; seed it beforehand for
        reproducible data.
        """
        return [np.random.random(s).astype(dtype) for s in shapes]

    def prepare_data(self, shapes, multi_precision, seed, place):
        """Generate the inputs for one momentum update.

        Parameters and gradients are float16 only when ``multi_precision`` is
        true and ``place`` is a ``paddle.CUDAPlace``; otherwise they are
        float32. Velocities and the learning rate are always float32.

        Returns:
            ``(params, grads, velocitys, master_params, learning_rate)`` where
            ``master_params`` holds float32 copies of ``params`` when
            ``multi_precision`` is true and is ``None`` otherwise.
        """
        np.random.seed(seed)
        mp_dtype = np.float32
        dtype = np.float16 if multi_precision and isinstance(
            place, paddle.CUDAPlace) else np.float32
        params = self.gen_rand_data(shapes, dtype)
        grads = self.gen_rand_data(shapes, dtype)
        velocitys = self.gen_rand_data(shapes, mp_dtype)
        learning_rate = self.gen_rand_data([[1]], mp_dtype)[0]
        if multi_precision:
            master_params = [p.astype(mp_dtype) for p in params]
        else:
            master_params = None
        return params, grads, velocitys, master_params, learning_rate

    def check_with_place(self, place, multi_precision):
        """Run both op variants on ``place`` and compare every fetched output.

        Outputs must be bit-identical on ``paddle.CUDAPlace`` and within
        ``atol=1e-7`` (``np.allclose``) on any other place.
        """
        params, grads, velocitys, master_params, learning_rate = self.prepare_data(
            self.shapes, multi_precision, self.seed, place)

        def run_op(use_merged):
            # FIXME(zengjinle): CPU Momentum Op does not support rescale_grad
            rescale_grad = 1.0 if isinstance(place, paddle.CPUPlace) else 0.01
            return run_momentum_op(
                params,
                grads,
                velocitys,
                master_params,
                learning_rate,
                place,
                multi_precision,
                rescale_grad=rescale_grad,
                use_merged=use_merged)

        outs1 = run_op(True)
        outs2 = run_op(False)
        self.assertEqual(len(outs1), len(outs2))
        for i, (out1, out2) in enumerate(zip(outs1, outs2)):
            if isinstance(place, paddle.CUDAPlace):
                matched = np.array_equal(out1, out2)
            else:
                matched = np.allclose(out1, out2, atol=1e-7)
            # Name the failing output and configuration instead of the bare
            # "False is not true" that assertTrue reports by default.
            msg = ('fetched output #{} differs between merged_momentum and '
                   'momentum (place={}, multi_precision={})').format(
                       i, place, multi_precision)
            self.assertTrue(matched, msg=msg)

    def get_places(self):
        """Return the CPU place, plus CUDA device 0 when compiled with CUDA."""
        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        return places

    def test_main(self):
        for multi_precision in [False, True]:
            for place in self.get_places():
                # subTest lets the remaining combinations run after a failure
                # and labels each failure with its configuration.
                with self.subTest(
                        place=place, multi_precision=multi_precision):
                    self.check_with_place(place, multi_precision)


# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()