From fd854183295c0a8d6dc0682f135d7dcc13faa575 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 10 Jan 2019 16:27:52 +0800 Subject: [PATCH] [Feature] support mix precision training for resnet (#14899) * clip softmax for fp16 * updates * fuse xent support fp16 test=develop * wip * wip * add simple row reduce * wip fp16 accurate softmax * add accurate softmax kernel for fp16 test=develop * update test=develop * fix cpu build test=develop * update api.spec test=develop * follow comments test=develop * fix build test=develop * fix trt build test=develop * fix inference build test=develop * fix merge test=develop * update test=develop * try fix build test=develop * fix build test=develop * rename real_exp test=develop * fortest * remove hacky kernels test=develop * clean up test=develop --- paddle/fluid/API.spec | 22 ++ paddle/fluid/operators/conv_cudnn_op.cu.cc | 15 ++ .../elementwise/elementwise_sub_op.cu | 5 + paddle/fluid/operators/math/softmax.h | 1 + .../softmax_with_cross_entropy_op.cu | 64 ++--- python/paddle/fluid/optimizer.py | 226 +++++++++++------- .../fluid/tests/unittests/test_optimizer.py | 70 ++++-- .../test_softmax_with_cross_entropy_op.py | 60 ++++- 8 files changed, 333 insertions(+), 130 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 987263155..16d43f82d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) +paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], 
varargs=None, keywords=None, defaults=(1e-06, None, None)) +paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) +paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) +paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)) +paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)) +paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 
'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)) +paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)) +paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)) paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)) +paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)) 
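The ArgSpec entries above expose `backward` and `apply_gradients` as public methods on every optimizer, splitting `minimize` into two halves so that gradient post-processing (for example, un-scaling gradients when loss scaling is used for float16 training) can be inserted between them. A minimal sketch of that flow against the fluid 1.x layers API; the tiny one-layer model, the variable names, and the 1/128 scaling factor are illustrative only and not part of this patch:

```python
import paddle.fluid as fluid

def build_tiny_model():
    # Stand-in model: one fc layer over a data input, mean of the output as loss.
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.fc(input=x, size=1)
    return fluid.layers.mean(y)

main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    loss = build_tiny_model()
    optimizer = fluid.optimizer.SGD(learning_rate=0.1)

    # First half of minimize(): append backward ops, collect (param, grad) pairs.
    params_grads = optimizer.backward(loss)

    # Gradient post-processing goes here, e.g. un-scaling after fp16 loss
    # scaling; the 1.0 / 128.0 factor is purely illustrative.
    params_grads = [(p, fluid.layers.scale(g, scale=1.0 / 128.0))
                    for p, g in params_grads]

    # Second half of minimize(): append the parameter update ops.
    optimize_ops = optimizer.apply_gradients(params_grads)
```

Wrapping both calls in the same `program_guard` mirrors what the reworked `minimize` does internally, so calling `minimize` directly remains equivalent to `backward` followed immediately by `apply_gradients`.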
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index dbb6ffd5e..25a723fc0 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( layout, framework::vectorize2int(filter->dims()), groups); +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + // Enable Tensor Core for cudnn backward + if (dev_ctx.GetComputeCapability() >= 70 && + std::type_index(typeid(T)) == + std::type_index(typeid(platform::float16))) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math for backward"; + } else { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_DEFAULT_MATH)); + VLOG(5) << "NOT use cudnn_tensor_op_math for backward"; + } +#endif + int input_channels = input->dims()[1]; int input_height, input_width, input_depth; if (input->dims().size() == 5) { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 6f17d3292..f2adf1c83 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_sub, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel()(::Eigen::numext::log(x)); +} +static __device__ __forceinline__ float log_on_device(float x) { return math::TolerableValue()(logf(x)); } -static __device__ __forceinline__ double real_log(double x) { +static __device__ __forceinline__ double log_on_device(double x) { return math::TolerableValue()(log(x)); } @@ -72,25 +81,20 @@ static __device__ __forceinline__ double real_log(double x) { /* Supposing the x is `logits` and y is `labels`, the equations are as followings: - cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})] = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})] = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})] = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)] = \sum_{j}(-y_i_j * tmp_i_j) - softmax_i_j = e^{tmp_i_j} - where: max_i = \max_{j}{x_i_j} logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i} tmp_i_j = x_i_j - max_i - logDiffMaxSum_i - Therefore, the calculation can be separated into 3 steps: Step 1: row-wise operation to calculate max_i Step 2: row-wise operation to calculate logDiffMaxSum_i Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i - To save memory, we can share memory among max_i, logDiffMaxSum_i and cross\_entropy_i. 
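The kernel comment above derives tmp_i_j = x_i_j - max_i - logDiffMaxSum_i, with softmax_i_j = e^{tmp_i_j} and cross_entropy_i = sum_j(-y_i_j * tmp_i_j), and (continuing below) fuses the three row-wise steps into block reductions that reuse one buffer. As a reference point only, here is a plain NumPy restatement of that math; it sketches the formula, not the CUDA kernels:

```python
import numpy as np

def stable_softmax_with_cross_entropy(logits, labels):
    """NumPy restatement of the 3-step schedule described in the kernel comment."""
    # Step 1: row-wise max_i.
    row_max = np.max(logits, axis=1, keepdims=True)
    # Step 2: logDiffMaxSum_i = log(sum_j exp(x_i_j - max_i)).
    log_diff_max_sum = np.log(
        np.sum(np.exp(logits - row_max), axis=1, keepdims=True))
    # Step 3: tmp_i_j = x_i_j - max_i - logDiffMaxSum_i, then softmax and loss.
    tmp = logits - row_max - log_diff_max_sum
    softmax = np.exp(tmp)
    cross_entropy = np.sum(-labels * tmp, axis=1)
    return softmax, cross_entropy

# Sanity check against the naive formula on well-conditioned inputs.
rng = np.random.RandomState(0)
x = rng.uniform(0.1, 1.0, (4, 5)).astype(np.float64)
y = np.eye(5)[rng.randint(0, 5, 4)]
s, ce = stable_softmax_with_cross_entropy(x, y)
naive = -np.sum(y * np.log(np.exp(x) / np.exp(x).sum(axis=1, keepdims=True)), axis=1)
assert np.allclose(ce, naive)
```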
In this way, the 3 steps should be changed to: @@ -134,7 +138,8 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data, cur_max = BlockReduce(temp_storage).Reduce(cur_max, cub::Max()); if (threadIdx.x == 0) { - max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max; + max_data[blockIdx.x] = + cur_max < static_cast(-64) ? static_cast(-64) : cur_max; } } @@ -151,17 +156,17 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data, auto block_max = max_data[blockIdx.x]; softmax[beg_idx] = logits_data[beg_idx] - block_max; - T diff_max_sum = real_exp(softmax[beg_idx]); + T diff_max_sum = exp_on_device(softmax[beg_idx]); auto idx = beg_idx + BlockDim; while (idx < end_idx) { softmax[idx] = logits_data[idx] - block_max; - diff_max_sum += real_exp(softmax[idx]); + diff_max_sum += exp_on_device(softmax[idx]); idx += BlockDim; } diff_max_sum = BlockReduce(temp_storage).Reduce(diff_max_sum, cub::Sum()); - if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum); + if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum); if (!CalculateLogSoftmax) return; __syncthreads(); @@ -188,12 +193,12 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy( // log_diff_max_sum shares memory with loss auto block_log_diff_max_sum = loss_data[blockIdx.x]; auto tmp = softmax[beg_idx] - block_log_diff_max_sum; - softmax[beg_idx] = real_exp(tmp); + softmax[beg_idx] = exp_on_device(tmp); auto loss = -labels_data[beg_idx] * tmp; beg_idx += BlockDim; while (beg_idx < end_idx) { tmp = softmax[beg_idx] - block_log_diff_max_sum; - softmax[beg_idx] = real_exp(tmp); + softmax[beg_idx] = exp_on_device(tmp); loss -= (labels_data[beg_idx] * tmp); beg_idx += BlockDim; } @@ -218,10 +223,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor { auto row_idx = idx / feature_size_; auto col_idx = idx % feature_size_; if (col_idx != labels_[row_idx]) { - log_softmax_[idx] = real_exp(log_softmax_[idx]); + log_softmax_[idx] = exp_on_device(log_softmax_[idx]); } else { auto softmax = log_softmax_[idx]; - log_softmax_[idx] = real_exp(softmax); + log_softmax_[idx] = exp_on_device(softmax); loss_[row_idx] = -softmax; } } @@ -253,10 +258,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { auto row_idx = idx / feature_size_; auto col_idx = idx % feature_size_; if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) { - log_softmax_[idx] = real_exp(log_softmax_[idx]); + log_softmax_[idx] = exp_on_device(log_softmax_[idx]); } else { auto softmax = log_softmax_[idx]; - log_softmax_[idx] = real_exp(softmax); + log_softmax_[idx] = exp_on_device(softmax); loss_[row_idx] = -softmax; } } @@ -464,9 +469,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/python/paddle/fluid/optimizer.py 
b/python/paddle/fluid/optimizer.py index 779cb5f96..bf3730ce5 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -195,22 +195,18 @@ class Optimizer(object): format(name, param.name)) return self._accumulators[name][param.name] - def _create_optimization_pass(self, - parameters_and_grads, - loss, - startup_program=None): + def _create_optimization_pass(self, parameters_and_grads): """Add optimization operators to update gradients to variables. Args: - loss(Variable): the target that this optimization is for. parameters_and_grads(list(tuple(Variable, Variable))): - a list of (variable, gradient) pair to update. + a list of (variable, gradient) pair to update. Returns: return_op_list: a list of operators that will complete one step of - optimization. This will include parameter update ops, global step - update ops and any other custom ops required by subclasses to manage - their internal state. + optimization. This will include parameter update ops, global step + update ops and any other custom ops required by subclasses to manage + their internal state. """ # This is a default implementation of create_optimization_pass that # can be shared by most optimizers. This implementation assumes that @@ -219,37 +215,33 @@ class Optimizer(object): # _create_accumulators method if it needs to create accumulators # for parameters and extend _finish_update method to add custom ops. - # Create any accumulators - program = loss.block.program - self._dtype = loss.dtype - with program_guard(program, startup_program): - global_block = framework.default_main_program().global_block() - start = len(global_block.ops) - self.helper = LayerHelper(self.__class__.__name__) - self._create_accumulators(loss.block, - [p[0] for p in parameters_and_grads]) - self._create_global_learning_rate() - - optimize_ops = [] - for param_and_grad in parameters_and_grads: - if param_and_grad[1] is None: - continue - with param_and_grad[0].block.program._optimized_guard( - param_and_grad), name_scope("optimizer"): - if param_and_grad[0].trainable is True: - optimize_op = self._append_optimize_op(loss.block, - param_and_grad) - optimize_ops.append(optimize_op) - - # Get custom finish ops for subclasses - # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(loss.block, parameters_and_grads) - - end = len(global_block.ops) - return global_block._slice_ops(start, end) - - def _process_distribute_lookuptable(self, param_grads, loss, - startup_program): + # Allways called under program_guard use global block as loss block + global_block = framework.default_main_program().global_block() + start = len(global_block.ops) + self.helper = LayerHelper(self.__class__.__name__) + self._create_accumulators(global_block, + [p[0] for p in parameters_and_grads]) + self._create_global_learning_rate() + + optimize_ops = [] + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad), name_scope("optimizer"): + if param_and_grad[0].trainable is True: + optimize_op = self._append_optimize_op(global_block, + param_and_grad) + optimize_ops.append(optimize_op) + + # Get custom finish ops for subclasses + # 
FIXME: Need to fix this once we figure out how to handle dependencies + self._finish_update(global_block, parameters_and_grads) + + end = len(global_block.ops) + return global_block._slice_ops(start, end) + + def _process_distribute_lookuptable(self, param_grads): """ Because distribute lookup table only support SGD optimizer for now, not support other optimizer and regularization, so we should find the table parameter out, @@ -259,7 +251,8 @@ class Optimizer(object): :param loss: the loss variable. :param startup_program: the startup program """ - program = loss.block.program + program = framework.default_main_program() + global_block = framework.default_main_program().global_block() table_name = find_distributed_lookup_table(program) table_param = None table_grad = None @@ -275,38 +268,121 @@ class Optimizer(object): new_param_grads.append((p, g)) sgd_op = None if table_param is not None: - with program_guard(program, startup_program): - param_and_grad = [table_param, table_grad] - with table_param.block.program._optimized_guard(param_and_grad), \ - framework.name_scope("optimizer"): - self._create_global_learning_rate() - # create the optimize op - sgd_op = loss.block.append_op( - type='sgd', - inputs={ - "Param": table_param, - "Grad": table_grad, - "LearningRate": - self._create_param_lr(param_and_grad) - }, - outputs={"ParamOut": param_and_grad[0]}) + param_and_grad = [table_param, table_grad] + with table_param.block.program._optimized_guard(param_and_grad), \ + framework.name_scope("optimizer"): + self._create_global_learning_rate() + # create the optimize op + sgd_op = global_block.append_op( + type='sgd', + inputs={ + "Param": table_param, + "Grad": table_grad, + "LearningRate": self._create_param_lr(param_and_grad) + }, + outputs={"ParamOut": param_and_grad[0]}) return new_param_grads, (table_param, table_grad), sgd_op + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + """ + First part of `minimize`, do auto-diff to append backward ops for + the current program. + + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables to update. + no_grad_set (set|None): set of Variables should be ignored. + callbacks (list|None): list of callables to run when appending backward + operator for one parameter. + + Return: + list: list of (param, grad) pair, grad is the output of backward. + + Examples: + See examples in `apply_gradients`. + """ + if callbacks is None: + callbacks = [error_clip_callback] + else: + assert (isinstance(callbacks, list)) + callbacks.append(error_clip_callback) + return append_backward(loss, parameter_list, no_grad_set, callbacks) + + def apply_gradients(self, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + + Args: + params_grads (list): list of (param, grad) pair to do optimization. + + Returns: + list: A list of operators appended to the current program. + + Examples: + .. code-block:: python + + loss = network() + optimizer = fluid.optimizer.SGD(learning_rate=0.1) + params_grads = optimizer.backward(loss) + # you may append operations for params_grads here + # ... 
+ optimizer.apply_gradients(params_grads) + """ + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + params_grads, table_param_and_grad, table_optimize_op = \ + self._process_distribute_lookuptable(params_grads) + + params_grads = append_gradient_clip_ops(params_grads) + + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) + + optimize_ops = self._create_optimization_pass(params_grads) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) + + return optimize_ops + def minimize(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - """Add operations to minimize `loss` by updating `parameter_list`. + """ + Add operations to minimize `loss` by updating `parameter_list`. - This method combines interface `append_backward()` and - `create_optimization_pass()` into one. + This method combines interface `backward()` and + `apply_gradients()` into one. + + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + parameter_list (list): list of Variables to update. + no_grad_set (set|None): set of Variables should be ignored. + + Returns: + tuple: (optimize_ops, params_grads) which are, list of operators appended; + and list of (param, grad) Variables pair for optimization. """ + self._dtype = loss.dtype + program = loss.block.program + optimize_ops = [] if imperative_base.enabled(): if parameter_list is not None: params_grads = parameter_list else: - program = loss.block.program parameters = program.global_block().all_parameters() params_grads = [] for param in parameters: @@ -317,29 +393,13 @@ class Optimizer(object): stop_gradient=True) grad_var._value = param._ivar.grad_value params_grads.append((param, grad_var)) - - optimize_ops = self._create_optimization_pass(params_grads, loss, - startup_program) + with program_guard(program, startup_program): + optimize_ops = self._create_optimization_pass(params_grads) else: - params_grads = append_backward(loss, parameter_list, no_grad_set, - [error_clip_callback]) - - params_grads = sorted(params_grads, key=lambda x: x[0].name) - - params_grads, table_param_and_grad, table_optimize_op = \ - self._process_distribute_lookuptable(params_grads, loss, startup_program) - - params_grads = append_gradient_clip_ops(params_grads) - - # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) - - optimize_ops = self._create_optimization_pass(params_grads, loss, - startup_program) - if table_optimize_op is not None: - optimize_ops.append(table_optimize_op) - params_grads.append(table_param_and_grad) + with program_guard(program, startup_program): + params_grads = self.backward(loss, startup_program, + parameter_list, no_grad_set) + optimize_ops = self.apply_gradients(params_grads) return optimize_ops, params_grads diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 4374d198f..34c9b7e00 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -61,6 +61,48 @@ class TestOptimizer(unittest.TestCase): self.assertEqual([op.type for op in opts], ["sgd"]) +class TestOptimizerBackwardApplygrad(unittest.TestCase): + def test_sgd_optimizer(self): + def check_sgd_optimizer(optimizer_attr): + init_program = framework.Program() 
+ program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr=optimizer_attr) + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + mean_out = block.create_var( + dtype="float32", shape=[1], lod_level=0, name="mean.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + block.append_op( + type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) + sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) + with framework.program_guard(program, init_program): + p_g = sgd_optimizer.backward(mean_out) + opts = sgd_optimizer.apply_gradients(p_g) + return opts + + opts = check_sgd_optimizer({'learning_rate': 1.1}) + self.assertEqual(len(opts), 3) + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "sgd"]) + + opts = check_sgd_optimizer({'learning_rate': 1.0}) + self.assertEqual(len(opts), 1) + self.assertEqual([op.type for op in opts], ["sgd"]) + + class TestMomentumOptimizer(unittest.TestCase): class MockMomentum(optimizer.MomentumOptimizer): def get_accumulators(self): @@ -99,8 +141,8 @@ class TestMomentumOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer._create_optimization_pass( - params_grads, mul_out, init_program) + with framework.program_guard(program, init_program): + opts = momentum_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) sgd_op = opts[-1] self.assertEqual([op.type for op in opts], @@ -153,8 +195,8 @@ class TestMomentumOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer._create_optimization_pass( - params_grads, mul_out, init_program) + with framework.program_guard(program, init_program): + opts = momentum_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) sgd_op = opts[-1] self.assertEqual([op.type for op in opts], @@ -216,8 +258,8 @@ class TestAdagradOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) - opts = adagrad_optimizer._create_optimization_pass( - params_grads, mul_out, init_program) + with framework.program_guard(program, init_program): + opts = adagrad_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "adagrad"]) @@ -280,8 +322,8 @@ class TestAdamOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adam_optimizer.get_accumulators()), 0) - opts = adam_optimizer._create_optimization_pass(params_grads, mul_out, - init_program) + with framework.program_guard(program, init_program): + opts = adam_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 5) self.assertEqual( [op.type for op in opts], @@ -347,8 +389,8 @@ class TestAdamaxOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) 
self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) - opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out, - init_program) + with framework.program_guard(program, init_program): + opts = adamax_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 4) self.assertEqual( [op.type for op in opts], @@ -411,8 +453,8 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) - opts = decayed_adagrad_optimizer._create_optimization_pass( - params_grads, mul_out, init_program) + with framework.program_guard(program, init_program): + opts = decayed_adagrad_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) self.assertEqual( [op.type for op in opts], @@ -477,8 +519,8 @@ class TestFtrlOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0) - opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out, - init_program) + with framework.program_guard(program, init_program): + opts = ftrl_optimizer.apply_gradients(params_grads) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "ftrl"]) diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index 37ee88097..b0494f114 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -28,6 +28,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): def initParams(self): self.numeric_stable_mode = False + self.dtype = np.float64 def setUp(self): self.initParams() @@ -36,19 +37,19 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): class_num = 37 logits = np.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float64") + [batch_size, class_num]).astype(self.dtype) softmax = np.apply_along_axis(stable_softmax, 1, logits) labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64") cross_entropy = np.asmatrix( [[-np.log(softmax[i][labels[i][0]])] for i in range(softmax.shape[0])], - dtype="float64") + dtype=self.dtype) self.inputs = {"Logits": logits, "Label": labels} self.outputs = { - "Softmax": softmax.astype("float64"), - "Loss": cross_entropy.astype("float64") + "Softmax": softmax.astype(self.dtype), + "Loss": cross_entropy.astype(self.dtype) } self.attrs = {"numeric_stable_mode": self.numeric_stable_mode} @@ -56,7 +57,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(["Logits"], "Loss") + self.check_grad(["Logits"], "Loss", max_relative_error=0.05) class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp): @@ -64,6 +65,55 @@ class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp): self.numeric_stable_mode = True +class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp): + def initParams(self): + self.numeric_stable_mode = False + self.dtype = np.float16 + + def setUp(self): + self.initParams() + self.op_type = "softmax_with_cross_entropy" + batch_size = 41 + class_num = 37 + + # NOTE: numpy float16 have very low accuracy, use float32 for numpy check. 
+        logits = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype(np.float32)
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
+
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax[i][labels[i][0]])]
+             for i in range(softmax.shape[0])],
+            dtype=np.float32)
+
+        self.inputs = {
+            "Logits": logits.astype(self.dtype).view(np.uint16),
+            "Label": labels
+        }
+        self.outputs = {
+            "Softmax": softmax.astype(self.dtype),
+            "Loss": cross_entropy.astype(self.dtype)
+        }
+        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
+
+    def test_check_output(self):
+        self.check_output(atol=1e-2)
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+
+
+class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(
+        TestSoftmaxWithCrossEntropyOpFp16):
+    def initParams(self):
+        self.numeric_stable_mode = True
+        self.dtype = np.float16
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+
+
 class TestSoftmaxWithCrossEntropyOp2(OpTest):
     """
     Test softmax with cross entropy operator with soft labels.
--
GitLab
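The new fp16 tests above compute their reference softmax and loss in float32, feed the operator float16 data through its raw bit pattern via `.view(np.uint16)`, and relax the tolerances (`atol=1e-2` for outputs, `max_relative_error=0.1` for gradients). A small standalone NumPy sketch of that round trip, independent of the OpTest harness:

```python
import numpy as np

# Reference values are kept in float32 because float16 arithmetic in NumPy is
# too imprecise to serve as ground truth.
logits = np.random.uniform(0.1, 1.0, (41, 37)).astype(np.float32)

# The operator input is float16; .view(np.uint16) reinterprets the same two
# bytes per element as uint16 without changing them, which is how the test
# hands the raw fp16 bits to the framework.
fp16_logits = logits.astype(np.float16)
as_bits = fp16_logits.view(np.uint16)

# The bit pattern round-trips losslessly back to the same float16 values.
assert np.array_equal(as_bits.view(np.float16), fp16_logits)

# float16 carries roughly 3 decimal digits of precision, hence the loose
# output and gradient tolerances used in the tests above.
print(np.finfo(np.float16).eps)  # ~0.000977
```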