# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable


class Architect(object):
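    """Optimizer for the architecture parameters in DARTS-style
    differentiable architecture search.

    The architecture parameters are updated with Adam on the validation
    loss.  With ``unrolled=False`` the first-order approximation is used
    (gradients are taken at the current network weights); with
    ``unrolled=True`` the second-order approximation is used: one virtual
    training step is taken on a copy of the model and the architecture
    gradient is corrected by a finite-difference Hessian-vector product.

    Minimal usage sketch (``model`` and the data batches below are
    assumptions, not defined in this file)::

        architect = Architect(model, eta, arch_learning_rate,
                              unrolled=True, parallel=False)
        architect.step(train_x, train_y, valid_x, valid_y)
    """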
    def __init__(self, model, eta, arch_learning_rate, unrolled, parallel):
        self.network_momentum = 0.9
        self.network_weight_decay = 3e-4
        self.eta = eta
        self.model = model
        self.optimizer = fluid.optimizer.Adam(
            arch_learning_rate,
            0.5,
            0.999,
            regularization=fluid.regularizer.L2Decay(1e-3),
            parameter_list=self.model.arch_parameters())
        self.unrolled = unrolled
        self.parallel = parallel
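
        # For the second-order (unrolled) approximation, keep a separate copy
        # of the network: the virtual training step is taken on this copy so
        # that the original weights are left untouched.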
        if self.unrolled:
            self.unrolled_model = self.model.new()
            self.unrolled_model_params = [
                p for p in self.unrolled_model.parameters()
                if p.name not in [
                    a.name for a in self.unrolled_model.arch_parameters()
                ] and p.trainable
            ]
            self.unrolled_optimizer = fluid.optimizer.MomentumOptimizer(
                self.eta,
                self.network_momentum,
                regularization=fluid.regularizer.L2DecayRegularizer(
                    self.network_weight_decay),
                parameter_list=self.unrolled_model_params)

        if self.parallel:
            strategy = fluid.dygraph.parallel.prepare_context()
            self.parallel_model = fluid.dygraph.parallel.DataParallel(
                self.model, strategy)
            if self.unrolled:
                self.parallel_unrolled_model = fluid.dygraph.parallel.DataParallel(
                    self.unrolled_model, strategy)

    def get_model(self):
        return self.parallel_model if self.parallel else self.model

    def step(self, input_train, target_train, input_valid, target_valid):
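        """Perform one update of the architecture parameters.

        The training batch is only used by the unrolled (second-order)
        variant; the first-order variant uses the validation batch alone.
        """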
        if self.unrolled:
            params_grads = self._backward_step_unrolled(
                input_train, target_train, input_valid, target_valid)
            self.optimizer.apply_gradients(params_grads)
        else:
            loss = self._backward_step(input_valid, target_valid)
            self.optimizer.minimize(loss)
        self.optimizer.clear_gradients()

    def _backward_step(self, input_valid, target_valid):
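        # First-order approximation: take the gradient of the validation
        # loss w.r.t. the architecture parameters at the current weights.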
        loss = self.model._loss(input_valid, target_valid)
        if self.parallel:
            loss = self.parallel_model.scale_loss(loss)
            loss.backward()
            self.parallel_model.apply_collective_grads()
        else:
            loss.backward()
        return loss

    def _backward_step_unrolled(self, input_train, target_train, input_valid,
                                target_valid):
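        # Second-order approximation: evaluate the validation loss on the
        # unrolled model (weights after one virtual training step) and
        # correct the architecture gradient with an implicit term computed
        # from a Hessian-vector product.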
        self._compute_unrolled_model(input_train, target_train)
        unrolled_loss = self.unrolled_model._loss(input_valid, target_valid)

        if self.parallel:
            unrolled_loss = self.parallel_unrolled_model.scale_loss(
                unrolled_loss)
            unrolled_loss.backward()
            self.parallel_unrolled_model.apply_collective_grads()
        else:
            unrolled_loss.backward()

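        # Gradients of the validation loss w.r.t. the unrolled weights w';
        # they serve as the vector in the Hessian-vector product below.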
        vector = [
            to_variable(param._grad_ivar().numpy())
            for param in self.unrolled_model_params
        ]
        arch_params_grads = [
            (alpha, to_variable(ualpha._grad_ivar().numpy()))
            for alpha, ualpha in zip(self.model.arch_parameters(),
                                     self.unrolled_model.arch_parameters())
        ]
        self.unrolled_model.clear_gradients()

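        # DARTS chain-rule correction:
        #   grad_alpha <- grad_alpha - eta * (grad^2_{alpha,w} L_train) . v
        # where the Hessian-vector product is approximated by finite differences.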
        implicit_grads = self._hessian_vector_product(vector, input_train,
                                                      target_train)
        for (p, g), ig in zip(arch_params_grads, implicit_grads):
            new_g = g - (ig * self.unrolled_optimizer.current_step_lr())
            fluid.layers.assign(new_g.detach(), g)
        return arch_params_grads

    def _compute_unrolled_model(self, input, target):
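        # Copy the current weights into the unrolled model and take one
        # virtual SGD-with-momentum step on the training loss.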
        for x, y in zip(self.unrolled_model.parameters(),
                        self.model.parameters()):
            fluid.layers.assign(y.detach(), x)

        loss = self.unrolled_model._loss(input, target)
        if self.parallel:
            loss = self.parallel_unrolled_model.scale_loss(loss)
            loss.backward()
            self.parallel_unrolled_model.apply_collective_grads()
        else:
            loss.backward()

        self.unrolled_optimizer.minimize(loss)
        self.unrolled_model.clear_gradients()

    def _hessian_vector_product(self, vector, input, target, r=1e-2):
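        # Finite-difference approximation of (grad^2_{alpha,w} L_train) . v:
        #   [grad_alpha L_train(w + eps*v) - grad_alpha L_train(w - eps*v)] / (2*eps)
        # with eps = r / ||v||  (R below).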
        R = r * fluid.layers.rsqrt(
            fluid.layers.sum([
                fluid.layers.reduce_sum(fluid.layers.square(v)) for v in vector
            ]))

        model_params = [
            p for p in self.model.parameters()
            if p.name not in [a.name for a in self.model.arch_parameters()] and
            p.trainable
        ]
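        # Perturb the weights to w+ = w + eps * v.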
        for param, grad in zip(model_params, vector):
            param_p = param + grad * R
            fluid.layers.assign(param_p.detach(), param)
        loss = self.model._loss(input, target)
        if self.parallel:
            loss = self.parallel_model.scale_loss(loss)
            loss.backward()
            self.parallel_model.apply_collective_grads()
        else:
            loss.backward()

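        # grad_alpha L_train(w + eps*v, alpha)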
        grads_p = [
            to_variable(param._grad_ivar().numpy())
            for param in self.model.arch_parameters()
        ]

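        # Move from w+ to w- = w - eps * v (subtract 2 * eps * v in one step).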
        for param, grad in zip(model_params, vector):
            param_n = param - grad * R * 2
            fluid.layers.assign(param_n.detach(), param)
        self.model.clear_gradients()

        loss = self.model._loss(input, target)
        if self.parallel:
            loss = self.parallel_model.scale_loss(loss)
            loss.backward()
            self.parallel_model.apply_collective_grads()
        else:
            loss.backward()

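        # grad_alpha L_train(w - eps*v, alpha)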
        grads_n = [
            to_variable(param._grad_ivar().numpy())
            for param in self.model.arch_parameters()
        ]
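        # Restore the original weights w by adding eps * v back.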
        for param, grad in zip(model_params, vector):
            param_o = param + grad * R
            fluid.layers.assign(param_o.detach(), param)
        self.model.clear_gradients()
        arch_grad = [(p - n) / (2 * R) for p, n in zip(grads_p, grads_n)]
        return arch_grad