diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
index 5f314b0f925759844e9a4fce94623c1059ecb7fe..a02d528a4a84ed5f829545e80343ab6bf85e969d 100644
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -261,6 +261,13 @@ NameVarBaseMap CastPureFp16Inputs(const std::string& op_type,
     dst_type = framework::proto::VarType::FP32;
   }
   for (auto& pair : new_ins) {
+    // NOTE: The run_program OP only has FP32 kernel. In dy2stat pure fp16
+    // training, we have correctly cast the inputs of run_program OP before,
+    // so here should avoid casting for run_program OP.
+    if (op_type == "run_program") {
+      continue;
+    }
+
     if ((op_type == "batch_norm" || op_type == "layer_norm" ||
          op_type == "sync_batch_norm") &&
         pair.first != "X") {
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index 006287752839dde7c598a16a2230b50e4f03bbb8..6fb59d61736dc19d33a2d0212464aba3f8c35a52 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -118,6 +118,11 @@ def _in_amp_guard():
         return False
 
 
+def _in_pure_fp16_guard():
+    tracer = _dygraph_tracer()
+    return tracer and tracer._amp_level == core.AmpLevel.O2
+
+
 @dygraph_only
 def pure_fp16_initialize(models):
     for idx in range(len(models)):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
index 9ccd2321b638ac2147835c4534757c8e903658ab..94fc5558ab162636e59a5569904d770970f812d1 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -27,8 +27,8 @@ from paddle.fluid.layers.utils import pack_sequence_as
 from paddle.fluid.layers.utils import _hash_with_id
 from paddle.fluid.compiler import BuildStrategy
 from paddle.fluid.contrib.mixed_precision.decorator import AutoMixedPrecisionLists
-from paddle.fluid.contrib.mixed_precision.fp16_utils import rewrite_program
-from paddle.fluid.dygraph.amp.auto_cast import _in_amp_guard
+from paddle.fluid.contrib.mixed_precision.fp16_utils import rewrite_program, cast_model_to_fp16
+from paddle.fluid.dygraph.amp.auto_cast import _in_amp_guard, _in_pure_fp16_guard
 import paddle.compat as cpt
 from paddle import _C_ops
 
@@ -152,8 +152,14 @@ class PartialProgramLayer:
         self._double_grads = self._get_double_grads(self._origin_main_program)
         self.training = True
 
+        custom_white_list, custom_black_list = None, None
+        tracer = framework._dygraph_tracer()
+        if tracer:
+            custom_white_list, custom_black_list = tracer._get_amp_op_list()
         # For AMP training
-        self._amp_list = AutoMixedPrecisionLists()
+        self._amp_list = AutoMixedPrecisionLists(
+            custom_white_list=custom_white_list,
+            custom_black_list=custom_black_list)
 
     @LazyInitialized
     def _infer_program(self):
@@ -193,6 +199,26 @@ class PartialProgramLayer:
         """
         return self._append_backward_desc(self._infer_amp_program)
 
+    @LazyInitialized
+    @switch_to_static_graph
+    def _infer_pure_fp16_program(self):
+        """
+        Lazy initialized property of _infer_pure_fp16_program.
+        """
+        infer_pure_fp16_program = self._origin_main_program.clone()
+        with program_guard(infer_pure_fp16_program):
+            cast_model_to_fp16(
+                infer_pure_fp16_program, self._amp_list, use_fp16_guard=False)
+
+        return infer_pure_fp16_program
+
+    @LazyInitialized
+    def _train_pure_fp16_program(self):
+        """
+        Lazy initialized property of _train_pure_fp16_program.
+        """
+        return self._append_backward_desc(self._infer_pure_fp16_program)
+
     @LazyInitialized
     def _infer_program_id(self):
         return _hash_with_id(self._infer_program, self)
@@ -213,6 +239,14 @@ class PartialProgramLayer:
 
         return program_id
 
+    @LazyInitialized
+    def _train_pure_fp16_program_id(self):
+        program_id = _hash_with_id(self._train_pure_fp16_program, self)
+        core._set_cached_executor_build_strategy(program_id,
+                                                 self._build_strategy)
+
+        return program_id
+
     def _verify_program(self, main_program):
         """
         Verify that the program parameter is initialized, prune some unused params,
@@ -275,8 +309,12 @@ class PartialProgramLayer:
         return self._valid_vars(double_grads)
 
     def _get_end_op_index(self):
-        infer_program = self._infer_amp_program if _in_amp_guard(
-        ) else self._infer_program
+        if _in_amp_guard():
+            infer_program = self._infer_amp_program
+        elif _in_pure_fp16_guard():
+            infer_program = self._infer_pure_fp16_program
+        else:
+            infer_program = self._infer_program
         return infer_program.desc.block(0).op_size()
 
     def __call__(self, inputs):
@@ -285,6 +323,9 @@ class PartialProgramLayer:
         attrs = ('global_block', self.program.desc.block(0), 'start_op_index',
                  0, 'end_op_index', self._get_end_op_index(), 'is_test',
                  not self.training, 'program_id', self.program_id)
+
+        self._cast_fp16_if_pure_fp16(in_vars)
+
         _C_ops.run_program(
             self._valid_vars(in_vars),
             self._valid_vars(self._params),
@@ -294,6 +335,16 @@ class PartialProgramLayer:
         restored_nest_out = self._restore_out(out_vars)
         return self._remove_no_value(restored_nest_out)
 
+    def _cast_fp16_if_pure_fp16(self, in_vars):
+        if _in_pure_fp16_guard():
+            for i, var in enumerate(in_vars):
+                name = var.name
+                if (self.program.global_block().has_var(name) and
+                        self.program.global_block().var(name).dtype ==
+                        paddle.float16):
+                    in_vars[i] = var.astype('float16')
+                    in_vars[i].name = name
+
     def drop_scope_if_no_grad(self):
         tracer = framework._dygraph_tracer()
         if self.training and not tracer._has_grad:
@@ -302,16 +353,24 @@ class PartialProgramLayer:
     @property
     def program(self):
         if self.training:
-            return self._train_amp_program if _in_amp_guard(
-            ) else self._train_program
+            if _in_amp_guard():
+                return self._train_amp_program
+            elif _in_pure_fp16_guard():
+                return self._train_pure_fp16_program
+            else:
+                return self._train_program
         else:
             return self._infer_program
 
     @property
     def program_id(self):
         if self.training:
-            return self._train_amp_program_id if _in_amp_guard(
-            ) else self._train_program_id
+            if _in_amp_guard():
+                return self._train_amp_program_id
+            elif _in_pure_fp16_guard():
+                return self._train_pure_fp16_program_id
+            else:
+                return self._train_program_id
         else:
             return self._infer_program_id
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ddc9d1aa0860996a92fcef0ba4604339ae8b59b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import unittest
+import numpy as np
+from time import time
+from test_mnist import MNIST, TestMNIST, SEED, SimpleImgConvPool
+from paddle.jit import ProgramTranslator
+from paddle.fluid.optimizer import AdamOptimizer
+
+if paddle.fluid.is_compiled_with_cuda():
+    paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True})
+
+
+class TestPureFP16(TestMNIST):
+    def train_static(self):
+        return self.train(to_static=True)
+
+    def train_dygraph(self):
+        return self.train(to_static=False)
+
+    def test_mnist_to_static(self):
+        if paddle.fluid.is_compiled_with_cuda():
+            dygraph_loss = self.train_dygraph()
+            static_loss = self.train_static()
+            # NOTE: In pure fp16 training, loss is not stable, so we enlarge atol here.
+            self.assertTrue(
+                np.allclose(
+                    dygraph_loss, static_loss, atol=1e-3),
+                msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
+                                                                static_loss))
+
+    def train(self, to_static=False):
+        np.random.seed(SEED)
+        paddle.seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+        mnist = MNIST()
+
+        if to_static:
+            print("Successfully to apply @to_static.")
+            mnist = paddle.jit.to_static(mnist)
+
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=mnist.parameters())
+
+        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+
+        mnist, optimizer = paddle.amp.decorate(
+            models=mnist,
+            optimizers=optimizer,
+            level='O2',
+            save_dtype='float32')
+
+        loss_data = []
+        for epoch in range(self.epoch_num):
+            start = time()
+            for batch_id, data in enumerate(self.train_reader()):
+                dy_x_data = np.array([x[0].reshape(1, 28, 28)
+                                      for x in data]).astype('float32')
+                y_data = np.array(
+                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                img = paddle.to_tensor(dy_x_data)
+                label = paddle.to_tensor(y_data)
+                label.stop_gradient = True
+
+                with paddle.amp.auto_cast(
+                        enable=True,
+                        custom_white_list=None,
+                        custom_black_list=None,
+                        level='O2'):
+                    prediction, acc, avg_loss = mnist(img, label=label)
+
+                scaled = scaler.scale(avg_loss)
+                scaled.backward()
+                scaler.minimize(optimizer, scaled)
+
+                loss_data.append(avg_loss.numpy()[0])
+                # save checkpoint
+                mnist.clear_gradients()
+                if batch_id % 10 == 0:
+                    print(
+                        "Loss at epoch {} step {}: loss: {:}, acc: {}, cost: {}"
+                        .format(epoch, batch_id,
+                                avg_loss.numpy(), acc.numpy(), time() - start))
+                    start = time()
+                if batch_id == 50:
+                    break
+        return loss_data
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..6620703ab71823051f8b0f4b23237f7df588a4e3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import math
+import time
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import declarative, ProgramTranslator
+from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D
+from test_resnet import ResNet, optimizer_setting, SEED
+
+# NOTE: Reduce batch_size from 8 to 2 to avoid unittest timeout.
+batch_size = 2
+epoch_num = 1
+
+program_translator = ProgramTranslator()
+
+if fluid.is_compiled_with_cuda():
+    fluid.set_flags({'FLAGS_cudnn_deterministic': True})
+
+
+def train(to_static, build_strategy=None):
+    """
+    Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode.
+    """
+    np.random.seed(SEED)
+    paddle.seed(SEED)
+    paddle.framework.random._manual_program_seed(SEED)
+
+    resnet = ResNet()
+    if to_static:
+        resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy)
+    optimizer = optimizer_setting(parameter_list=resnet.parameters())
+    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+
+    resnet, optimizer = paddle.amp.decorate(
+        models=resnet, optimizers=optimizer, level='O2', save_dtype='float32')
+
+    for epoch in range(epoch_num):
+        loss_data = []
+        total_loss = 0.0
+        total_acc1 = 0.0
+        total_acc5 = 0.0
+        total_sample = 0
+
+        for batch_id in range(100):
+            start_time = time.time()
+            img = paddle.to_tensor(
+                np.random.random([batch_size, 3, 224, 224]).astype('float32'))
+            label = paddle.to_tensor(
+                np.random.randint(
+                    0, 100, [batch_size, 1], dtype='int64'))
+            img.stop_gradient = True
+            label.stop_gradient = True
+
+            with paddle.amp.auto_cast(
+                    enable=True,
+                    custom_white_list=None,
+                    custom_black_list=None,
+                    level='O2'):
+                pred = resnet(img)
+                loss = fluid.layers.cross_entropy(input=pred, label=label)
+                avg_loss = fluid.layers.mean(x=pred)
+                acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
+                acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
+
+            scaled = scaler.scale(avg_loss)
+            scaled.backward()
+            scaler.minimize(optimizer, scaled)
+            resnet.clear_gradients()
+
+            loss_data.append(avg_loss.numpy()[0])
+            total_loss += avg_loss
+            total_acc1 += acc_top1
+            total_acc5 += acc_top5
+            total_sample += 1
+
+            end_time = time.time()
+            if batch_id % 2 == 0:
+                print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \
+                    ( epoch, batch_id, total_loss.numpy() / total_sample, \
+                     total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time))
+            if batch_id == 10:
+                break
+
+    return loss_data
+
+
+class TestResnet(unittest.TestCase):
+    def train(self, to_static):
+        program_translator.enable(to_static)
+        return train(to_static)
+
+    def test_resnet(self):
+        if fluid.is_compiled_with_cuda():
+            static_loss = self.train(to_static=True)
+            dygraph_loss = self.train(to_static=False)
+            # NOTE: In pure fp16 training, loss is not stable, so we enlarge atol here.
+            self.assertTrue(
+                np.allclose(
+                    static_loss, dygraph_loss, atol=1e-3),
+                msg="static_loss: {} \n dygraph_loss: {}".format(static_loss,
+                                                                 dygraph_loss))
+
+
+if __name__ == '__main__':
+    unittest.main()