Unverified commit fd854183, authored by Wu Yi, committed by GitHub

[Feature] support mix precision training for resnet (#14899)

* clip softmax for fp16

* updates

* fuse xent support fp16 test=develop

* wip

* wip

* add simple row reduce

* wip fp16 accurate softmax

* add accurate softmax kernel for fp16 test=develop

* update test=develop

* fix cpu build test=develop

* update api.spec test=develop

* follow comments test=develop

* fix build test=develop

* fix trt build test=develop

* fix inference build test=develop

* fix merge test=develop

* update test=develop

* try fix build test=develop

* fix build test=develop

* rename real_exp test=develop

* fortest

* remove hacky kernels test=develop

* clean up test=develop
Parent f2afb070
...@@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
......
...@@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
        layout, framework::vectorize2int(filter->dims()), groups);
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    // Enable Tensor Core for cudnn backward
    if (dev_ctx.GetComputeCapability() >= 70 &&
        std::type_index(typeid(T)) ==
            std::type_index(typeid(platform::float16))) {
      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
      VLOG(5) << "use cudnn_tensor_op_math for backward";
    } else {
      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
      VLOG(5) << "NOT use cudnn_tensor_op_math for backward";
    }
#endif
    int input_channels = input->dims()[1];
    int input_height, input_width, input_depth;
    if (input->dims().size() == 5) {
......
...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    elementwise_sub,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::float16>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
    elementwise_sub_grad,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                  paddle::platform::float16>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
......
...@@ -49,6 +49,7 @@ class SoftmaxGradCUDNNFunctor {
                  const framework::Tensor* Y, const framework::Tensor* y_grad,
                  framework::Tensor* x_grad);
};
#endif
}  // namespace math
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -58,12 +55,24 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
 }  // namespace
-static __device__ __forceinline__ float real_exp(float x) { return expf(x); }
-static __device__ __forceinline__ double real_exp(double x) { return exp(x); }
-static __device__ __forceinline__ float real_log(float x) {
+static __device__ __forceinline__ platform::float16 exp_on_device(
+    platform::float16 x) {
+  return ::Eigen::numext::exp(x);
+}
+static __device__ __forceinline__ float exp_on_device(float x) {
+  return expf(x);
+}
+static __device__ __forceinline__ double exp_on_device(double x) {
+  return exp(x);
+}
+static __device__ __forceinline__ platform::float16 log_on_device(
+    platform::float16 x) {
+  return math::TolerableValue<platform::float16>()(::Eigen::numext::log(x));
+}
+static __device__ __forceinline__ float log_on_device(float x) {
   return math::TolerableValue<float>()(logf(x));
 }
-static __device__ __forceinline__ double real_log(double x) {
+static __device__ __forceinline__ double log_on_device(double x) {
   return math::TolerableValue<double>()(log(x));
 }
...@@ -72,25 +81,20 @@ static __device__ __forceinline__ double real_log(double x) {
/*
  Supposing the x is `logits` and y is `labels`, the equations are as
  follows:
    cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
        = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
        = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
        = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
        = \sum_{j}(-y_i_j * tmp_i_j)
    softmax_i_j = e^{tmp_i_j}
  where:
    max_i = \max_{j}{x_i_j}
    logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
    tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
  Therefore, the calculation can be separated into 3 steps:
  Step 1: row-wise operation to calculate max_i
  Step 2: row-wise operation to calculate logDiffMaxSum_i
  Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
  To save memory, we can share memory among max_i, logDiffMaxSum_i and
  cross\_entropy_i.
  In this way, the 3 steps should be changed to:
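To make the three-step scheme above concrete, here is a rough NumPy sketch of the same computation (illustrative only, not the CUDA kernel; the function name is invented for the example):

```python
import numpy as np


def stable_softmax_with_cross_entropy(logits, labels):
    """Row-wise numerically stable softmax + soft-label cross entropy.

    logits: float array of shape [batch, classes]
    labels: float array of the same shape (soft labels, each row sums to 1)
    """
    # Step 1: row-wise max, max_i = max_j(x_i_j)
    max_i = logits.max(axis=1, keepdims=True)
    # Step 2: logDiffMaxSum_i = log(sum_j(exp(x_i_j - max_i)))
    shifted = logits - max_i
    log_diff_max_sum = np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    # Step 3: tmp_i_j = x_i_j - max_i - logDiffMaxSum_i,
    #         softmax_i_j = exp(tmp_i_j),
    #         cross_entropy_i = sum_j(-y_i_j * tmp_i_j)
    tmp = shifted - log_diff_max_sum
    return np.exp(tmp), -(labels * tmp).sum(axis=1)
```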
...@@ -134,7 +138,8 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
   cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
   if (threadIdx.x == 0) {
-    max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max;
+    max_data[blockIdx.x] =
+        cur_max < static_cast<T>(-64) ? static_cast<T>(-64) : cur_max;
   }
 }
...@@ -151,17 +156,17 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
   auto block_max = max_data[blockIdx.x];
   softmax[beg_idx] = logits_data[beg_idx] - block_max;
-  T diff_max_sum = real_exp(softmax[beg_idx]);
+  T diff_max_sum = exp_on_device(softmax[beg_idx]);
   auto idx = beg_idx + BlockDim;
   while (idx < end_idx) {
     softmax[idx] = logits_data[idx] - block_max;
-    diff_max_sum += real_exp(softmax[idx]);
+    diff_max_sum += exp_on_device(softmax[idx]);
     idx += BlockDim;
   }
   diff_max_sum =
       BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
-  if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
+  if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum);
   if (!CalculateLogSoftmax) return;
   __syncthreads();
...@@ -188,12 +193,12 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy(
   // log_diff_max_sum shares memory with loss
   auto block_log_diff_max_sum = loss_data[blockIdx.x];
   auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
-  softmax[beg_idx] = real_exp(tmp);
+  softmax[beg_idx] = exp_on_device(tmp);
   auto loss = -labels_data[beg_idx] * tmp;
   beg_idx += BlockDim;
   while (beg_idx < end_idx) {
     tmp = softmax[beg_idx] - block_log_diff_max_sum;
-    softmax[beg_idx] = real_exp(tmp);
+    softmax[beg_idx] = exp_on_device(tmp);
     loss -= (labels_data[beg_idx] * tmp);
     beg_idx += BlockDim;
   }
...@@ -218,10 +223,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
     auto row_idx = idx / feature_size_;
     auto col_idx = idx % feature_size_;
     if (col_idx != labels_[row_idx]) {
-      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+      log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
     } else {
       auto softmax = log_softmax_[idx];
-      log_softmax_[idx] = real_exp(softmax);
+      log_softmax_[idx] = exp_on_device(softmax);
       loss_[row_idx] = -softmax;
     }
   }
...@@ -253,10 +258,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
     auto row_idx = idx / feature_size_;
     auto col_idx = idx % feature_size_;
     if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
-      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+      log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
     } else {
       auto softmax = log_softmax_[idx];
-      log_softmax_[idx] = real_exp(softmax);
+      log_softmax_[idx] = exp_on_device(softmax);
       loss_[row_idx] = -softmax;
     }
   }
...@@ -464,9 +469,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy,
-                        ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
-                        ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad,
-                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
-                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
+    ops::SoftmaxWithCrossEntropyCUDAKernel<paddle::platform::float16>,
+    ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    softmax_with_cross_entropy_grad,
+    ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
+    ops::SoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
+    ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...@@ -195,22 +195,18 @@ class Optimizer(object):
                 format(name, param.name))
         return self._accumulators[name][param.name]
-    def _create_optimization_pass(self,
-                                  parameters_and_grads,
-                                  loss,
-                                  startup_program=None):
+    def _create_optimization_pass(self, parameters_and_grads):
         """Add optimization operators to update gradients to variables.
         Args:
-            loss(Variable): the target that this optimization is for.
             parameters_and_grads(list(tuple(Variable, Variable))):
                 a list of (variable, gradient) pair to update.
         Returns:
             return_op_list: a list of operators that will complete one step of
             optimization. This will include parameter update ops, global step
             update ops and any other custom ops required by subclasses to manage
             their internal state.
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
...@@ -219,37 +215,33 @@ class Optimizer(object):
         # _create_accumulators method if it needs to create accumulators
         # for parameters and extend _finish_update method to add custom ops.
-        # Create any accumulators
-        program = loss.block.program
-        self._dtype = loss.dtype
-        with program_guard(program, startup_program):
-            global_block = framework.default_main_program().global_block()
-            start = len(global_block.ops)
-            self.helper = LayerHelper(self.__class__.__name__)
-            self._create_accumulators(loss.block,
-                                      [p[0] for p in parameters_and_grads])
-            self._create_global_learning_rate()
-            optimize_ops = []
-            for param_and_grad in parameters_and_grads:
-                if param_and_grad[1] is None:
-                    continue
-                with param_and_grad[0].block.program._optimized_guard(
-                        param_and_grad), name_scope("optimizer"):
-                    if param_and_grad[0].trainable is True:
-                        optimize_op = self._append_optimize_op(loss.block,
-                                                               param_and_grad)
-                        optimize_ops.append(optimize_op)
-            # Get custom finish ops for subclasses
-            # FIXME: Need to fix this once we figure out how to handle dependencies
-            self._finish_update(loss.block, parameters_and_grads)
-            end = len(global_block.ops)
-            return global_block._slice_ops(start, end)
+        # Always called under program_guard; use the global block as the loss block.
+        global_block = framework.default_main_program().global_block()
+        start = len(global_block.ops)
+        self.helper = LayerHelper(self.__class__.__name__)
+        self._create_accumulators(global_block,
+                                  [p[0] for p in parameters_and_grads])
+        self._create_global_learning_rate()
+        optimize_ops = []
+        for param_and_grad in parameters_and_grads:
+            if param_and_grad[1] is None:
+                continue
+            with param_and_grad[0].block.program._optimized_guard(
+                    param_and_grad), name_scope("optimizer"):
+                if param_and_grad[0].trainable is True:
+                    optimize_op = self._append_optimize_op(global_block,
+                                                           param_and_grad)
+                    optimize_ops.append(optimize_op)
+        # Get custom finish ops for subclasses
+        # FIXME: Need to fix this once we figure out how to handle dependencies
+        self._finish_update(global_block, parameters_and_grads)
+        end = len(global_block.ops)
+        return global_block._slice_ops(start, end)
-    def _process_distribute_lookuptable(self, param_grads, loss,
-                                        startup_program):
+    def _process_distribute_lookuptable(self, param_grads):
         """
         Because distribute lookup table only support SGD optimizer for now, not support
         other optimizer and regularization, so we should find the table parameter out,
...@@ -259,7 +251,8 @@ class Optimizer(object):
         :param loss: the loss variable.
         :param startup_program: the startup program
         """
-        program = loss.block.program
+        program = framework.default_main_program()
+        global_block = framework.default_main_program().global_block()
         table_name = find_distributed_lookup_table(program)
         table_param = None
         table_grad = None
...@@ -275,38 +268,121 @@ class Optimizer(object):
                 new_param_grads.append((p, g))
         sgd_op = None
         if table_param is not None:
-            with program_guard(program, startup_program):
-                param_and_grad = [table_param, table_grad]
-                with table_param.block.program._optimized_guard(param_and_grad), \
-                        framework.name_scope("optimizer"):
-                    self._create_global_learning_rate()
-                    # create the optimize op
-                    sgd_op = loss.block.append_op(
-                        type='sgd',
-                        inputs={
-                            "Param": table_param,
-                            "Grad": table_grad,
-                            "LearningRate":
-                            self._create_param_lr(param_and_grad)
-                        },
-                        outputs={"ParamOut": param_and_grad[0]})
+            param_and_grad = [table_param, table_grad]
+            with table_param.block.program._optimized_guard(param_and_grad), \
+                    framework.name_scope("optimizer"):
+                self._create_global_learning_rate()
+                # create the optimize op
+                sgd_op = global_block.append_op(
+                    type='sgd',
+                    inputs={
+                        "Param": table_param,
+                        "Grad": table_grad,
+                        "LearningRate": self._create_param_lr(param_and_grad)
+                    },
+                    outputs={"ParamOut": param_and_grad[0]})
         return new_param_grads, (table_param, table_grad), sgd_op
    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """
        First part of `minimize`, do auto-diff to append backward ops for
        the current program.
        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.
            callbacks (list|None): list of callables to run when appending backward
                operator for one parameter.
        Return:
            list: list of (param, grad) pair, grad is the output of backward.
        Examples:
            See examples in `apply_gradients`.
        """
        if callbacks is None:
            callbacks = [error_clip_callback]
        else:
            assert (isinstance(callbacks, list))
            callbacks.append(error_clip_callback)
        return append_backward(loss, parameter_list, no_grad_set, callbacks)

    def apply_gradients(self, params_grads):
        """
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.
        Args:
            params_grads (list): list of (param, grad) pair to do optimization.
        Returns:
            list: A list of operators appended to the current program.
        Examples:
            .. code-block:: python
                loss = network()
                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
                params_grads = optimizer.backward(loss)
                # you may append operations for params_grads here
                # ...
                optimizer.apply_gradients(params_grads)
        """
        params_grads = sorted(params_grads, key=lambda x: x[0].name)
        params_grads, table_param_and_grad, table_optimize_op = \
            self._process_distribute_lookuptable(params_grads)
        params_grads = append_gradient_clip_ops(params_grads)
        # Add regularization if any
        params_grads = append_regularization_ops(params_grads,
                                                 self.regularization)
        optimize_ops = self._create_optimization_pass(params_grads)
        if table_optimize_op is not None:
            optimize_ops.append(table_optimize_op)
            params_grads.append(table_param_and_grad)
        return optimize_ops
     def minimize(self,
                  loss,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
-        """Add operations to minimize `loss` by updating `parameter_list`.
+        """
+        Add operations to minimize `loss` by updating `parameter_list`.
-        This method combines interface `append_backward()` and
-        `create_optimization_pass()` into one.
+        This method combines interface `backward()` and
+        `apply_gradients()` into one.
+        Args:
+            loss (Variable): loss variable to run optimizations.
+            startup_program (Program): startup_program for initializing parameters
+                in `parameter_list`.
+            parameter_list (list): list of Variables to update.
+            no_grad_set (set|None): set of Variables should be ignored.
+        Returns:
+            tuple: (optimize_ops, params_grads) which are, list of operators appended;
+            and list of (param, grad) Variables pair for optimization.
         """
+        self._dtype = loss.dtype
+        program = loss.block.program
+        optimize_ops = []
         if imperative_base.enabled():
             if parameter_list is not None:
                 params_grads = parameter_list
             else:
-                program = loss.block.program
                 parameters = program.global_block().all_parameters()
                 params_grads = []
                 for param in parameters:
...@@ -317,29 +393,13 @@ class Optimizer(object):
                         stop_gradient=True)
                     grad_var._value = param._ivar.grad_value
                     params_grads.append((param, grad_var))
-            with program_guard(program, startup_program):
-                optimize_ops = self._create_optimization_pass(params_grads, loss,
-                                                              startup_program)
+            optimize_ops = self._create_optimization_pass(params_grads)
         else:
-            params_grads = append_backward(loss, parameter_list, no_grad_set,
-                                           [error_clip_callback])
-            params_grads = sorted(params_grads, key=lambda x: x[0].name)
-            params_grads, table_param_and_grad, table_optimize_op = \
-                self._process_distribute_lookuptable(params_grads, loss, startup_program)
-            params_grads = append_gradient_clip_ops(params_grads)
-            # Add regularization if any
-            params_grads = append_regularization_ops(params_grads,
-                                                     self.regularization)
-            optimize_ops = self._create_optimization_pass(params_grads, loss,
-                                                          startup_program)
-            if table_optimize_op is not None:
-                optimize_ops.append(table_optimize_op)
-                params_grads.append(table_param_and_grad)
+            with program_guard(program, startup_program):
+                params_grads = self.backward(loss, startup_program,
+                                             parameter_list, no_grad_set)
+                optimize_ops = self.apply_gradients(params_grads)
         return optimize_ops, params_grads
......
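Taken together, the refactor above splits `minimize()` into the now-public `backward()` and `apply_gradients()` steps. A minimal usage sketch (the network definition is illustrative and assumes the fluid API of this release):

```python
import paddle.fluid as fluid

# A toy program with a scalar loss; any fluid network works here.
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
logits = fluid.layers.fc(input=image, size=10)
loss = fluid.layers.mean(
    fluid.layers.softmax_with_cross_entropy(logits=logits, label=label))

optimizer = fluid.optimizer.SGD(learning_rate=0.01)

# Equivalent to optimizer.minimize(loss), but the two halves are exposed so
# the (param, grad) list can be inspected or rewritten in between, e.g. for
# gradient scaling in mixed-precision training.
params_grads = optimizer.backward(loss)
# ... optionally transform params_grads here ...
optimize_ops = optimizer.apply_gradients(params_grads)
```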
...@@ -61,6 +61,48 @@ class TestOptimizer(unittest.TestCase):
         self.assertEqual([op.type for op in opts], ["sgd"])
class TestOptimizerBackwardApplygrad(unittest.TestCase):
    def test_sgd_optimizer(self):
        def check_sgd_optimizer(optimizer_attr):
            init_program = framework.Program()
            program = framework.Program()
            block = program.global_block()
            mul_x = block.create_parameter(
                dtype="float32",
                shape=[5, 10],
                lod_level=0,
                name="mul.x",
                optimize_attr=optimizer_attr)
            mul_y = block.create_var(
                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
            mul_out = block.create_var(
                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
            mean_out = block.create_var(
                dtype="float32", shape=[1], lod_level=0, name="mean.out")
            block.append_op(
                type="mul",
                inputs={"X": mul_x,
                        "Y": mul_y},
                outputs={"Out": mul_out},
                attrs={"x_num_col_dims": 1})
            block.append_op(
                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
            with framework.program_guard(program, init_program):
                p_g = sgd_optimizer.backward(mean_out)
                opts = sgd_optimizer.apply_gradients(p_g)
            return opts

        opts = check_sgd_optimizer({'learning_rate': 1.1})
        self.assertEqual(len(opts), 3)
        self.assertEqual([op.type for op in opts],
                         ["fill_constant", "elementwise_mul", "sgd"])

        opts = check_sgd_optimizer({'learning_rate': 1.0})
        self.assertEqual(len(opts), 1)
        self.assertEqual([op.type for op in opts], ["sgd"])
class TestMomentumOptimizer(unittest.TestCase):
    class MockMomentum(optimizer.MomentumOptimizer):
        def get_accumulators(self):
...@@ -99,8 +141,8 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = momentum_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
         self.assertEqual([op.type for op in opts],
...@@ -153,8 +195,8 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = momentum_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
         self.assertEqual([op.type for op in opts],
...@@ -216,8 +258,8 @@ class TestAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = adagrad_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "adagrad"])
...@@ -280,8 +322,8 @@ class TestAdamOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                        init_program)
+        with framework.program_guard(program, init_program):
+            opts = adam_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 5)
         self.assertEqual(
             [op.type for op in opts],
...@@ -347,8 +389,8 @@ class TestAdamaxOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                          init_program)
+        with framework.program_guard(program, init_program):
+            opts = adamax_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 4)
         self.assertEqual(
             [op.type for op in opts],
...@@ -411,8 +453,8 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        opts = decayed_adagrad_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = decayed_adagrad_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual(
             [op.type for op in opts],
...@@ -477,8 +519,8 @@ class TestFtrlOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                        init_program)
+        with framework.program_guard(program, init_program):
+            opts = ftrl_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "ftrl"])
......
...@@ -28,6 +28,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
     def initParams(self):
         self.numeric_stable_mode = False
+        self.dtype = np.float64
     def setUp(self):
         self.initParams()
...@@ -36,19 +37,19 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         class_num = 37
         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float64")
+                                   [batch_size, class_num]).astype(self.dtype)
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
         cross_entropy = np.asmatrix(
             [[-np.log(softmax[i][labels[i][0]])]
              for i in range(softmax.shape[0])],
-            dtype="float64")
+            dtype=self.dtype)
         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype("float64"),
-            "Loss": cross_entropy.astype("float64")
+            "Softmax": softmax.astype(self.dtype),
+            "Loss": cross_entropy.astype(self.dtype)
         }
         self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
...@@ -56,7 +57,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.check_output()
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss")
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
 class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
...@@ -64,6 +65,55 @@ class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
         self.numeric_stable_mode = True
class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
    def initParams(self):
        self.numeric_stable_mode = False
        self.dtype = np.float16

    def setUp(self):
        self.initParams()
        self.op_type = "softmax_with_cross_entropy"
        batch_size = 41
        class_num = 37

        # NOTE: numpy float16 has very low accuracy, use float32 for the numpy check.
        logits = np.random.uniform(0.1, 1.0,
                                   [batch_size, class_num]).astype(np.float32)
        softmax = np.apply_along_axis(stable_softmax, 1, logits)
        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")

        cross_entropy = np.asmatrix(
            [[-np.log(softmax[i][labels[i][0]])]
             for i in range(softmax.shape[0])],
            dtype=np.float32)

        self.inputs = {
            "Logits": logits.astype(self.dtype).view(np.uint16),
            "Label": labels
        }
        self.outputs = {
            "Softmax": softmax.astype(self.dtype),
            "Loss": cross_entropy.astype(self.dtype)
        }
        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}

    def test_check_output(self):
        self.check_output(atol=1e-2)

    def test_check_grad(self):
        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)


class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(
        TestSoftmaxWithCrossEntropyOpFp16):
    def initParams(self):
        self.numeric_stable_mode = True
        self.dtype = np.float16

    def test_check_grad(self):
        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
class TestSoftmaxWithCrossEntropyOp2(OpTest):
    """
    Test softmax with cross entropy operator with soft labels.
......
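For context on the looser tolerances used in the fp16 tests above (atol=1e-2, max_relative_error=0.1), a rough NumPy check of float16 round-off (illustrative only):

```python
import numpy as np

# float16 keeps ~10 bits of mantissa, so values in [0.1, 1.0] already lose
# a few 1e-4 of absolute precision on a single round trip; accumulated
# kernel arithmetic in fp16 drifts further, hence the relaxed tolerances.
x = np.random.uniform(0.1, 1.0, [41, 37]).astype(np.float32)
roundtrip_err = np.abs(x.astype(np.float16).astype(np.float32) - x).max()
print(roundtrip_err)  # typically on the order of 1e-4
```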