import paddle.v2.framework.framework as framework
from collections import defaultdict

__all__ = ['SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer']


class Optimizer(object):
    """Optimizer Base class.

    Define the common interface of an optimizer.
    Users should not use this class directly, but should use one of its
    implementations instead.
    """

    def __init__(self):
        # Dictionary of accumulators. Some optimizer subclasses need to
        # allocate and manage extra variables associated with the parameters
        # to train. These variables are called accumulators.
        # {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
        self._accumulators = defaultdict(lambda: dict())
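        # For example, after MomentumOptimizer has registered its state for
        # two hypothetical parameters, the dict could look like:
        #   {"velocity": {"fc.w": <Variable>, "fc.b": <Variable>}}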

    def _append_optimize_op(self, block, param_and_grad):
        """Append the optimize operator to the block and return the added optimize_op.
        """
        raise NotImplementedError()

    def _initialize_tensors(self, block):
        """Create all necessary tensors that will be shared across all parameter updates.

        Tensors like learning rate should be initialized here.

        Args:
            block: the block in which the loss variable is present
        """
        pass

    def _create_accumulators(self, block, parameters):
        """Create all accumulators needed by the parameters

        Args:
            block: the block in which the loss variable is present
            parameters: list of parameter variables for the optimizer
        """
        pass

    def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
        """Utility function to add an accumulator for a parameter

        Args:
            block: the block in which the loss variable is present
            name: name of the accumulator
            param: parameter variable for which accumulator is to be added
            dtype: data type of the accumulator variable
            fill_value: value to initialize the accumulator variable
        """
        if (name in self._accumulators and
                param.name in self._accumulators[name]):
            raise Exception("Accumulator {} already exists for parameter {}".
                            format(name, param.name))
        global_block = block.program.global_block()
        param_shape = list(param.shape)
        param_acc = global_block.create_var(
            dtype=dtype, shape=param_shape, lod_level=0)

        # Initialize the accumulator with fill_value
        # FIXME: Fix when Initialization design has been implemented
        # https://github.com/PaddlePaddle/Paddle/pull/4852
        global_block.append_op(
            type="fill_constant",
            outputs={"Out": param_acc},
            attrs={"shape": param_shape,
                   "value": fill_value})

        # Add to accumulators dict
        self._accumulators[name][param.name] = param_acc
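        # Illustrative effect (parameter name `w` is hypothetical): after
        #   self._add_accumulator(block, "velocity", w)
        # the zero-filled variable is stored under
        #   self._accumulators["velocity"][w.name]
        # and can later be fetched via self._get_accumulator("velocity", w).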

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        """
        if (name not in self._accumulators or
                param.name not in self._accumulators[name]):
            raise Exception("Accumulator {} does not exist for parameter {}".
                            format(name, param.name))
        return self._accumulators[name][param.name]

    def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
        """Create and add gradient Operators in BlockDesc to compute
        gradients of `loss` for parameters in parameter_list

        Args:
          loss: a variable generated by the cost function.
          no_grad_set: variables for which no gradient should be created.
          parameter_list: parameters whose gradients need to be computed and
          which are updated to optimize the loss.

        Returns:
          list of (parameter, gradient) pairs.
        """
        assert isinstance(loss, framework.Variable)
        param_grad_map = loss.block.program.append_backward(loss, no_grad_set or
                                                            set())
        if parameter_list is not None:
            parameters = parameter_list
        else:
            params = loss.block.program.global_block().all_parameters()
            parameters = [param.name for param in params]
        params_and_grads = []
        for param in parameters:
            if param not in param_grad_map:
                raise Exception("param %s is not in map" % param)
            grad_info = param_grad_map[param]
            grad_block = loss.block.program.block(grad_info[1])
            if not grad_block.has_var(grad_info[0]):
                raise Exception("grad block[%d] did not have grad var %s" %
                                (grad_info[1], grad_info[0]))
            # Get the param var from the global block
            param_var = loss.block.program.global_block().var(param)
            grad_var = grad_block.var(grad_info[0])
            if loss.block.has_var(grad_info[0]):
                params_and_grads.append((param_var, grad_var))
            else:
                params_and_grads.append((param_var, None))
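        # Illustrative shape of the result (variable names are hypothetical):
        #   [(w_var, w_grad_var), (b_var, b_grad_var), (frozen_var, None)]
        # A None gradient means the loss block holds no gradient variable
        # for that parameter.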
        return params_and_grads

    def create_optimization_pass(self, parameters_and_grads, loss):
        """Add optimization operators that apply the gradients to update the variables.

        Args:
          loss: the target that this optimization is for.
          parameters_and_grads: a list of (variable, gradient) pair to update.

        Returns:
          optimization_op_list: a list of optimization operators that will
          update the parameters using their gradients.
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
        # the subclass will implement the _append_optimize_op method and the
        # _initialize_tensors method. The subclass can extend the
        # _create_accumulators method if it needs to create accumulators
        # for parameters.
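        # A minimal subclass sketch of that contract ("my_op" is a
        # hypothetical operator type, not one provided by this module):
        #
        #   class MyOptimizer(Optimizer):
        #       def _initialize_tensors(self, block):
        #           pass  # create shared tensors such as the learning rate
        #
        #       def _append_optimize_op(self, block, param_and_grad):
        #           return block.append_op(
        #               type="my_op",
        #               inputs={"Param": param_and_grad[0],
        #                       "Grad": param_and_grad[1]},
        #               outputs={"ParamOut": param_and_grad[0]})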

        # Create any accumulators
        self._create_accumulators(loss.block,
                                  [p[0] for p in parameters_and_grads])
        # Create any necessary tensors
        self._initialize_tensors(loss.block)

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
            if param_and_grad[1] is not None:
                optimize_op = self._append_optimize_op(loss.block,
                                                       param_and_grad)
                optimize_ops.append(optimize_op)

        return optimize_ops

    def minimize(self, loss, parameter_list=None, no_grad_set=None):
        """Add operations to minimize `loss` by updating `parameter_list`.

        This method combines interface `create_backward_pass()` and
        `create_optimization_pass()` into one.
        """
        params_grads = self.create_backward_pass(loss, parameter_list,
                                                 no_grad_set or set())
        optimize_ops = self.create_optimization_pass(params_grads, loss)
        return optimize_ops


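# Illustrative end-to-end usage (a sketch: `loss` is assumed to be a
# framework.Variable produced elsewhere by the user's network definition,
# and the learning rate below is arbitrary):
#
#   optimizer = SGDOptimizer(learning_rate=0.01)
#   optimize_ops = optimizer.minimize(loss)
#
# minimize() first appends the backward (gradient) operators and then one
# optimize operator per trainable parameter to the program that owns `loss`.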
class SGDOptimizer(Optimizer):
    """ Simple SGD optimizer without any state.
    """

    def __init__(self, learning_rate):
        assert learning_rate is not None
        super(SGDOptimizer, self).__init__()
        self.type = "sgd"
        self._learning_rate = learning_rate

    def _initialize_tensors(self, block):
        assert isinstance(block, framework.Block)
        lr_shape = [1]
        # create a variable for learning_rate
        self._lr = block.create_var(
            dtype="float32", shape=lr_shape, lod_level=0)

        # create an op to init the learning_rate
        # FIXME: Fix when Initialization design has been implemented
        # https://github.com/PaddlePaddle/Paddle/pull/4852
        block.append_op(
            type="fill_constant",
            outputs={"Out": self._lr},
            attrs={"shape": lr_shape,
                   "value": self._learning_rate})

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        # create the optimize op
        sgd_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": self._lr
            },
            outputs={"ParamOut": param_and_grad[0]})
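        # The "sgd" operator performs the plain gradient-descent update,
        # roughly: ParamOut = Param - LearningRate * Grad.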

        return sgd_op


class MomentumOptimizer(Optimizer):
    """Simple Momentum optimizer with velocity state
    """
    _velocity_acc_str = "velocity"

    def __init__(self, learning_rate, momentum):
        assert learning_rate is not None
        assert momentum is not None
        super(MomentumOptimizer, self).__init__()
        self.type = "momentum"
        self._learning_rate = learning_rate
        self._momentum = momentum

    def _initialize_tensors(self, block):
        assert isinstance(block, framework.Block)
        lr_shape = [1]
        # create a variable for learning_rate
        self._lr = block.create_var(
            dtype="float32", shape=lr_shape, lod_level=0)

        # create an op to init the learning_rate
        # FIXME: Fix when Initialization design has been implemented
        # https://github.com/PaddlePaddle/Paddle/pull/4852
        block.append_op(
            type="fill_constant",
            outputs={"Out": self._lr},
            attrs={"shape": lr_shape,
                   "value": self._learning_rate})

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            self._add_accumulator(block, self._velocity_acc_str, p, 'float32')

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "Velocity": velocity_acc,
                "LearningRate": self._lr
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "VelocityOut": velocity_acc
            },
            attrs={"mu": self._momentum})
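        # The "momentum" operator applies the classic momentum update,
        # roughly:
        #   VelocityOut = mu * Velocity + Grad
        #   ParamOut = Param - LearningRate * VelocityOut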

        return momentum_op


class AdagradOptimizer(Optimizer):
    """Simple Adagrad optimizer with moment state
    """
    _moment_acc_str = "moment"

    def __init__(self, learning_rate, epsilon):
        assert learning_rate is not None
        assert epsilon is not None
        super(AdagradOptimizer, self).__init__()
        self.type = "adagrad"
        self._learning_rate = learning_rate
        self._epsilon = epsilon

    def _initialize_tensors(self, block):
        assert isinstance(block, framework.Block)
        lr_shape = [1]
        # create a variable for learning_rate
        self._lr = block.create_var(
            dtype="float32", shape=lr_shape, lod_level=0)

        # create an op to init the learning_rate
        # FIXME: Fix when Initialization design has been implemented
        # https://github.com/PaddlePaddle/Paddle/pull/4852
        block.append_op(
            type="fill_constant",
            outputs={"Out": self._lr},
            attrs={"shape": lr_shape,
                   "value": self._learning_rate})

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            self._add_accumulator(block, self._moment_acc_str, p, 'float32')

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        moment_acc = self._get_accumulator(self._moment_acc_str,
                                           param_and_grad[0])

        # create the adagrad optimizer op
        adagrad_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "Moment": moment_acc,
                "LearningRate": self._lr
            },
            outputs={"ParamOut": param_and_grad[0],
                     "MomentOut": moment_acc},
            attrs={"epsilon": self._epsilon})
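        # The "adagrad" operator accumulates squared gradients, roughly:
        #   MomentOut = Moment + Grad * Grad
        #   ParamOut = Param - LearningRate * Grad / (sqrt(MomentOut) + epsilon)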

        return adagrad_op