From beda78258fafc9884bfcb941cd870a4c2107263e Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 25 Apr 2019 19:49:35 +0800 Subject: [PATCH] Init mixed precision training interface (#16856) * Init mixed precision training interface * Add fp16 test script test=develop * All initializers support float16 test=develop * Code cleanup & add more code annotations test=develop * Update API spec test=develop * Add usage example in doc test=develop --- paddle/fluid/API.spec | 1 + python/paddle/fluid/contrib/__init__.py | 3 + .../fluid/contrib/mixed_precision/__init__.py | 19 ++ .../contrib/mixed_precision/decorator.py | 157 +++++++++ .../contrib/mixed_precision/fp16_utils.py | 125 ++++++++ .../tests/test_image_classification_fp16.py | 301 ++++++++++++++++++ python/paddle/fluid/initializer.py | 174 ++++++++-- .../fluid/tests/unittests/test_initializer.py | 137 ++++++-- python/setup.py.in | 1 + 9 files changed, 860 insertions(+), 58 deletions(-) create mode 100644 python/paddle/fluid/contrib/mixed_precision/__init__.py create mode 100644 python/paddle/fluid/contrib/mixed_precision/decorator.py create mode 100644 python/paddle/fluid/contrib/mixed_precision/fp16_utils.py create mode 100644 python/paddle/fluid/contrib/tests/test_image_classification_fp16.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 0d12f26956c..97eac4258da 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -424,6 +424,7 @@ paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'loca paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a')) paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a')) paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4')) +paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'init_loss_scaling', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(1.0, False)), ('document', '67e9bf14f345b38da169beb1ebb276eb')) paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680')) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8')) diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index ca10db0a545..f808f30bba4 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -34,6 +34,8 @@ from . import extend_optimizer from .extend_optimizer import * from . import model_stat from .model_stat import * +from . 
import mixed_precision +from .mixed_precision import * __all__ = [] __all__ += decoder.__all__ @@ -45,3 +47,4 @@ __all__ += reader.__all__ __all__ += slim.__all__ __all__ += utils.__all__ __all__ += extend_optimizer.__all__ +__all__ += ['mixed_precision'] diff --git a/python/paddle/fluid/contrib/mixed_precision/__init__.py b/python/paddle/fluid/contrib/mixed_precision/__init__.py new file mode 100644 index 00000000000..c2c3dc284f5 --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from . import decorator +from .decorator import * + +__all__ = decorator.__all__ diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py new file mode 100644 index 00000000000..f17b63434de --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -0,0 +1,157 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ... import default_main_program +from ... import default_startup_program +from ... import layers +from ... import unique_name +from . import fp16_utils +from .fp16_utils import create_master_params_grads, master_param_to_train_param + +__all__ = ["decorate"] + + +class OptimizerWithMixedPrecison(object): + """ + Optimizer with mixed-precision (MP) training. This is a wrapper of a common + optimizer, plus the support of mixed-precision pretraining. The object + of this class almost has the same behavior as the common optimizer, with the + methods `minimize()`, `backward()`, `apply_gradients()` implemented. + Additionally, it enables the MP training automatically, i.e, the creation + and maintenance of master parameters, scaling of loss, etc. + + Args: + optimizer (Optimizer): A common Optimizer object. + init_loss_scaling (float): The initial loss scaling factor. + use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling. 
+ """ + + def __init__(self, optimizer, init_loss_scaling, use_dynamic_loss_scaling): + self._optimizer = optimizer + self._param_grads = None + self._train_program = default_main_program() + self._startup_prog = default_startup_program() + self._loss_scaling = init_loss_scaling + self._use_dynamic_loss_scaling = use_dynamic_loss_scaling + + # Ensure the data type of learning rate vars is float32 (same as the + # master parameter dtype) + if isinstance(optimizer._learning_rate, float): + optimizer._learning_rate_map[default_main_program()] = \ + layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(optimizer._learning_rate), + dtype='float32', + persistable=True) + + def get_loss_scaling(self): + """Return the real-time loss scaling factor. + """ + return self._loss_scaling + + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + """ + Backward propogation or auto differentiation for gradients' computation. + + Args: + loss (Variable): The loss Variable to minimize. + startup_program (Program|None): The startup Program for initializing + parameters in `parameter_list`. + parameter_list (list|None): A list of Variables to update. + no_grad_set (set|None): A set of Variables should be ignored. + callbacks (list|None): A list of callables to run when appending + backward operator for one parameter. + + Returns: + A list of (param, grad), which is a tuple of a parameter and its + gradient respectively, and the scaled loss. + """ + scaled_loss = loss * self._loss_scaling + self._param_grads = self._optimizer.backward( + scaled_loss, startup_program, parameter_list, no_grad_set, + callbacks) + master_params_grads = create_master_params_grads( + self._param_grads, self._train_program, self._startup_prog, + self._loss_scaling) + + return master_params_grads, scaled_loss + + def apply_gradients(self, master_params_grads): + """ + Update master parameters by their gradients, and cast to parameters + in float16. + + Args: + master_params_grads (list): A list of master params and grads. + + Returns: + A list of optimize operators. + """ + optimize_ops = self._optimizer.apply_gradients(master_params_grads) + master_param_to_train_param(master_params_grads, self._param_grads, + self._train_program) + return optimize_ops + + def minimize(self, loss): + """ + Perform optimization by minimizing the given loss. + + Args: + loss (Variable): The loss Variable. + + Returns: + The scaled loss by scaling factor, the list of optimize ops, and a + list of master parameters and gradients. + """ + master_params_grads, scaled_loss = self.backward(loss) + optimize_ops = self.apply_gradients(master_params_grads) + + return scaled_loss, optimize_ops, master_params_grads + + +def decorate(optimizer, init_loss_scaling=1.0, use_dynamic_loss_scaling=False): + """ + Decorate the given optimizer to adapt to the mixed-precision training. + + Args: + optimizer(Optimizer): A common Optimizer. + init_loss_scaling(float): The initial loss scaling factor. + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. + + Returns: + An optimizer acting like a normal one but with mixed-precision training + enabled. + + Examples: + .. 
code-block:: python + + loss = network() + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + + mp_optimizer = fluid.contrib.mixed_precision.decorate( + optimizer=optimizer, init_loss_scaling=8.0) + + scaled_loss, _, _ = mp_optimizer.minimize(loss) + """ + + mp_optimizer = OptimizerWithMixedPrecison(optimizer, init_loss_scaling, + use_dynamic_loss_scaling) + + return mp_optimizer diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py new file mode 100644 index 00000000000..5e7fdcedead --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -0,0 +1,125 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from ... import core +from ... import layers +from ... import framework + + +def append_cast_op(i, o, prog): + """ + Append a cast op in a given Program to cast input `i` to data type `o.dtype`. + + Args: + i (Variable): The input Variable. + o (Variable): The output Variable. + prog (Program): The Program to append cast op. + """ + prog.global_block().append_op( + type="cast", + inputs={"X": i}, + outputs={"Out": o}, + attrs={"in_dtype": i.dtype, + "out_dtype": o.dtype}) + + +def copy_to_master_param(p, block): + """ + New a master parameter for the input parameter, and they two share the same + attributes except the data type. + + Args: + p(Parameter): The input parameter in float16. + block(Program): The block in which the parameter is. + """ + v = block.vars.get(p.name, None) + if v is None: + raise ValueError("no param name %s found!" % p.name) + new_p = framework.Parameter( + block=block, + shape=v.shape, + dtype=core.VarDesc.VarType.FP32, + type=v.type, + lod_level=v.lod_level, + stop_gradient=p.stop_gradient, + trainable=p.trainable, + optimize_attr=p.optimize_attr, + regularizer=p.regularizer, + gradient_clip_attr=p.gradient_clip_attr, + error_clip=p.error_clip, + name=v.name + ".master") + return new_p + + +def create_master_params_grads(params_grads, main_prog, startup_prog, + loss_scaling): + """ + Create master parameters and gradients in float32 from params and grads + in float16. + + Args: + params_grads (list): A list of tuple (parameter, gradient) in float32. + main_prog (Program): The main program for training. + startup_prog (Program): The startup program to initialize all parameters. + loss_scaling (float): The factor to scale loss and gradients. + + Returns: + A list of master parameters and gradients. 
+ """ + master_params_grads = [] + with main_prog._backward_role_guard(): + for p, g in params_grads: + # create master parameters + master_param = copy_to_master_param(p, main_prog.global_block()) + startup_master_param = startup_prog.global_block()._clone_variable( + master_param) + startup_p = startup_prog.global_block().var(p.name) + # fp16 -> fp32 + append_cast_op(startup_p, startup_master_param, startup_prog) + # cast fp16 gradients to fp32 before apply gradients + if g.name.find("batch_norm") > -1: + if loss_scaling > 1: + scaled_g = g / float(loss_scaling) + else: + scaled_g = g + master_params_grads.append([p, scaled_g]) + continue + master_grad = layers.cast(x=g, dtype="float32") + if loss_scaling > 1: + master_grad = master_grad / float(loss_scaling) + master_params_grads.append([master_param, master_grad]) + + return master_params_grads + + +def master_param_to_train_param(master_params_grads, params_grads, main_prog): + """ + Convert master master parameters and gradients in float32 to parameters and + gradients in float16 for forward computation. + + Args: + master_params_grads (list): A list of master parameters and gradients in + float32. + params_grads (list): A list of parameters and gradients in float16. + main_prog (list): The main program for execution. + """ + for idx, m_p_g in enumerate(master_params_grads): + train_p, _ = params_grads[idx] + if train_p.name.find("batch_norm") > -1: + continue + with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): + # fp32 -> fp16 + append_cast_op(m_p_g[0], train_p, main_prog) diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py new file mode 100644 index 00000000000..b7a14fa59b4 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -0,0 +1,301 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +import contextlib +import math +import sys +import numpy +import unittest +import os +import numpy as np + + +def resnet_cifar10(input, depth=32): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + def shortcut(input, ch_in, ch_out, stride): + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + def basicblock(input, ch_in, ch_out, stride): + tmp = conv_bn_layer(input, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True) + short = shortcut(input, ch_in, ch_out, stride) + return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') + + def layer_warp(block_func, input, ch_in, ch_out, count, stride): + tmp = block_func(input, ch_in, ch_out, stride) + for i in range(1, count): + tmp = block_func(tmp, ch_out, ch_out, 1) + return tmp + + assert (depth - 2) % 6 == 0 + n = (depth - 2) // 6 + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) + res2 = layer_warp(basicblock, res1, 16, 32, n, 2) + res3 = layer_warp(basicblock, res2, 32, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + return pool + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=4096, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) + return fc2 + + +def train(net_type, use_cuda, save_dirname, is_local): + classdim = 10 + data_shape = [3, 32, 32] + + train_program = fluid.Program() + startup_prog = fluid.Program() + train_program.random_seed = 123 + startup_prog.random_seed = 456 + with fluid.program_guard(train_program, startup_prog): + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + imgs = fluid.layers.cast(images, "float16") + if net_type == "vgg": + print("train vgg net") + net = vgg16_bn_drop(imgs) + elif net_type == "resnet": + print("train resnet") + net = resnet_cifar10(imgs, 32) + else: + raise ValueError("%s network is not supported" % net_type) + + logits = fluid.layers.fc(input=net, size=classdim, act="softmax") + cost, predict = fluid.layers.softmax_with_cross_entropy( + logits, label, return_softmax=True) + avg_cost = fluid.layers.mean(cost) + acc = fluid.layers.accuracy(input=predict, label=label) + + # Test program + test_program = train_program.clone(for_test=True) + + optimizer = 
fluid.optimizer.Adam(learning_rate=0.001) + + mp_optimizer = fluid.contrib.mixed_precision.decorate( + optimizer=optimizer, init_loss_scaling=8.0) + + scaled_loss, _, _ = mp_optimizer.minimize(avg_cost) + + BATCH_SIZE = 128 + PASS_NUM = 1 + + # no shuffle for unit test + train_reader = paddle.batch( + paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE) + + test_reader = paddle.batch( + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) + + def train_loop(main_program): + exe.run(startup_prog) + loss = 0.0 + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + np_scaled_loss, loss = exe.run( + main_program, + feed=feeder.feed(data), + fetch_list=[scaled_loss, avg_cost]) + print( + 'PassID {0:1}, BatchID {1:04}, train loss {2:2.4}, scaled train closs {3:2.4}'. + format(pass_id, batch_id + 1, + float(loss), float(np_scaled_loss))) + if (batch_id % 10) == 0: + acc_list = [] + avg_loss_list = [] + for tid, test_data in enumerate(test_reader()): + loss_t, acc_t = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[avg_cost, acc]) + if math.isnan(float(loss_t)): + sys.exit("got NaN loss, training failed.") + acc_list.append(float(acc_t)) + avg_loss_list.append(float(loss_t)) + break # Use 1 segment for speeding up CI + + acc_value = numpy.array(acc_list).mean() + avg_loss_value = numpy.array(avg_loss_list).mean() + + print( + 'PassID {0:1}, BatchID {1:04}, test loss {2:2.2}, acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_value), float(acc_value))) + + if acc_value > 0.08: # Low threshold for speeding up CI + fluid.io.save_inference_model( + save_dirname, ["pixel"], [predict], + exe, + main_program=train_program) + return + + if is_local: + train_loop(train_program) + else: + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("PADDLE_TRAINERS")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") + t = fluid.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) + + +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # The input's dimension of conv should be 4-D or 5-D. 
+ # Use normilized image pixels as input data, which should be in the range [0, 1.0]. + batch_size = 1 + tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32") + + # Use inference_transpiler to speedup + inference_transpiler_program = inference_program.clone() + t = fluid.transpiler.InferenceTranspiler() + t.transpile(inference_transpiler_program, place) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + transpiler_results = exe.run(inference_transpiler_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + assert len(results[0]) == len(transpiler_results[0]) + for i in range(len(results[0])): + np.testing.assert_almost_equal( + results[0][i], transpiler_results[0][i], decimal=4) + + print("infer results: ", results[0]) + + fluid.io.save_inference_model(save_dirname, feed_target_names, + fetch_targets, exe, + inference_transpiler_program) + + +def main(net_type, use_cuda, is_local=True): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "image_classification_" + net_type + ".inference.model" + + train(net_type, use_cuda, save_dirname, is_local) + #infer(use_cuda, save_dirname) + + +class TestImageClassification(unittest.TestCase): + def test_vgg_cuda(self): + with self.scope_prog_guard(): + main('vgg', use_cuda=True) + + def test_resnet_cuda(self): + with self.scope_prog_guard(): + main('resnet', use_cuda=True) + + @contextlib.contextmanager + def scope_prog_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index da2591b9805..86596bd9c8f 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -154,17 +154,41 @@ class ConstantInitializer(Initializer): """ assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) + + # to be compatible of fp16 initializers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['constant_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + # Initialization Ops should be prepended and not appended op = block._prepend_op( type="fill_constant", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": int(out_dtype), "value": float(self._value), 'force_cpu': self._force_cpu or force_init_on_cpu() }, stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): var.op = op return op @@ -216,7 +240,8 @@ class UniformInitializer(Initializer): if var.dtype == VarDesc.VarType.FP16: out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( - name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + name=unique_name.generate(".".join( + 
['uniform_random', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -295,7 +320,8 @@ class NormalInitializer(Initializer): if var.dtype == VarDesc.VarType.FP16: out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( - name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + name=unique_name.generate(".".join( + ['gaussian_random', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -375,7 +401,7 @@ class TruncatedNormalInitializer(Initializer): out_dtype = VarDesc.VarType.FP32 out_var = block.create_var( name=unique_name.generate(".".join( - ['truncated_gaussian_random', 'tmp'])), + ['truncated_gaussian_random', var.name, 'tmp'])), shape=var.shape, dtype=out_dtype, type=VarDesc.VarType.LOD_TENSOR, @@ -482,14 +508,28 @@ class XavierInitializer(Initializer): if self._seed == 0: self._seed = block.program.random_seed + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['xavier_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) op = block._prepend_op( type="uniform_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ - "shape": var.shape, - "dtype": int(var.dtype), + "shape": out_var.shape, + "dtype": out_dtype, "min": -limit, "max": limit, "seed": self._seed @@ -500,15 +540,24 @@ class XavierInitializer(Initializer): std = np.sqrt(2.0 / float(fan_in + fan_out)) op = block._prepend_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ - "shape": var.shape, - "dtype": int(var.dtype), + "shape": out_var.shape, + "dtype": out_dtype, "mean": 0.0, "std": std, "seed": self._seed }, stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): var.op = op return op @@ -583,14 +632,28 @@ class MSRAInitializer(Initializer): if self._seed == 0: self._seed = block.program.random_seed + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['masra_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + if self._uniform: limit = np.sqrt(6.0 / float(fan_in)) op = block._prepend_op( type="uniform_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ - "shape": var.shape, - "dtype": int(var.dtype), + "shape": out_var.shape, + "dtype": int(out_dtype), "min": -limit, "max": limit, "seed": self._seed @@ -601,15 +664,24 @@ class MSRAInitializer(Initializer): std = np.sqrt(2.0 / float(fan_in)) op = block._prepend_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ - "shape": var.shape, - "dtype": int(var.dtype), + "shape": out_var.shape, + "dtype": int(out_dtype), "mean": 0.0, "std": std, "seed": self._seed }, stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, 
+ "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): var.op = op return op @@ -694,7 +766,21 @@ class BilinearInitializer(Initializer): weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) weight = np.reshape(weight, shape) - if var.dtype == VarDesc.VarType.FP32: + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join( + ['bilinear_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + + if out_dtype == VarDesc.VarType.FP32: value_name = "fp32_values" values = [float(v) for v in weight.flat] else: @@ -703,12 +789,21 @@ class BilinearInitializer(Initializer): raise ValueError("The size of input is too big. ") op = block.append_op( type='assign_value', - outputs={'Out': [var]}, + outputs={'Out': [out_var]}, attrs={ - 'dtype': var.dtype, + 'dtype': out_dtype, 'shape': list(shape), value_name: values }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): var.op = op return op @@ -746,14 +841,30 @@ class NumpyArrayInitializer(Initializer): """ assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + np_value = self._value.astype("float32") + out_var = block.create_var( + name=unique_name.generate(".".join( + ['numpy_array_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_var = var + out_dtype = var.dtype + np_value = self._value + # Initialization Ops should be prepended and not appended - dtype = framework.convert_np_dtype_to_dtype_(self._value.dtype) - if dtype == VarDesc.VarType.FP32: + if out_dtype == VarDesc.VarType.FP32: value_name = "fp32_values" - values = [float(v) for v in self._value.flat] - elif dtype == VarDesc.VarType.INT32: + values = [float(v) for v in np_value.flat] + elif out_dtype == VarDesc.VarType.INT32: value_name = "int32_values" - values = [int(v) for v in self._value.flat] + values = [int(v) for v in np_value.flat] else: raise ValueError("Unsupported dtype %s", self._value.dtype) if self._value.size > 1024 * 1024 * 1024: @@ -761,13 +872,22 @@ class NumpyArrayInitializer(Initializer): "saving it to file and 'load_op' to load it") op = block._prepend_op( type='assign_value', - outputs={'Out': var}, + outputs={'Out': out_var}, attrs={ - 'dtype': dtype, + 'dtype': out_dtype, 'shape': list(self._value.shape), value_name: values }, stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + if not framework.in_dygraph_mode(): var.op = op return op diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 2d98b063d10..c6bed4db72e 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -19,65 +19,86 @@ import unittest import paddle.fluid.framework as framework import paddle.fluid.initializer as initializer +from 
paddle.fluid.core import VarDesc DELTA = 0.00001 +def check_cast_op(op): + return op.type == 'cast' and \ + op.attr('in_dtype') == VarDesc.VarType.FP32 and \ + op.attr('out_dtype') == VarDesc.VarType.FP16 + + class TestConstantInitializer(unittest.TestCase): - def test_constant_initializer_default_value(self): + def test_constant_initializer_default_value(self, dtype="float32"): """Test the constant initializer with default value """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.ConstantInitializer()) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') self.assertAlmostEqual(init_op.attr('value'), 0.0, delta=DELTA) + return block - def test_constant_initializer(self): + def test_constant_initializer(self, dtype="float32"): """Test constant initializer with supplied value """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.ConstantInitializer(2.3)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') self.assertAlmostEqual(init_op.attr('value'), 2.3, delta=DELTA) + return block + + def test_constant_initializer_fp16(self): + """Test constant initializer with float16 + """ + block = self.test_constant_initializer_default_value("float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_constant_initializer("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestUniformInitializer(unittest.TestCase): - def test_uniform_initializer_default_value(self): + def test_uniform_initializer_default_value(self, dtype="float32"): """Test the uniform initializer with default value """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.UniformInitializer()) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') self.assertAlmostEqual(init_op.attr('min'), -1.0, delta=DELTA) self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) + return block def test_uniform_initializer_random_seed(self): """Test the uniform initializer with manually setting seed @@ -103,43 +124,57 @@ class TestUniformInitializer(unittest.TestCase): init_op1 = block.ops[0] self.assertEqual(init_op1.attr("seed"), 456) - def test_uniform_initializer(self): + def test_uniform_initializer(self, dtype="float32"): """Test uniform initializer with supplied attributes """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.UniformInitializer(-4.2, 3.1, 123)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 
'uniform_random') self.assertAlmostEqual(init_op.attr('min'), -4.2, delta=DELTA) self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA) self.assertEqual(init_op.attr('seed'), 123) + return block - def test_uniform_initializer_two_op(self): + def test_uniform_initializer_two_op(self, dtype="float32"): """Test uniform initializer with supplied attributes """ program = framework.Program() block = program.global_block() for i in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.UniformInitializer(-4.2, float(i), 123)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op0 = block.ops[0] self.assertEqual(init_op0.type, 'uniform_random') self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA) self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA) self.assertEqual(init_op0.attr('seed'), 123) + return block + + def test_uniform_initializer_fp16(self): + """Test uniform initializer with float16 + """ + block = self.test_uniform_initializer_default_value("float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_uniform_initializer(dtype="float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_uniform_initializer_two_op("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestNormalInitializer(unittest.TestCase): @@ -162,24 +197,32 @@ class TestNormalInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('std'), 1.0, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) - def test_normal_initializer(self): + def test_normal_initializer(self, dtype="float32"): """Test normal initializer with supplied attributes """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.NormalInitializer(2.3, 1.9, 123)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') self.assertAlmostEqual(init_op.attr('mean'), 2.3, delta=DELTA) self.assertAlmostEqual(init_op.attr('std'), 1.9, delta=DELTA) self.assertEqual(init_op.attr('seed'), 123) + return block + + def test_normal_initializer_fp16(self): + """Test normal initializer with float16 + """ + block = self.test_normal_initializer("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestXavierInitializer(unittest.TestCase): @@ -271,26 +314,34 @@ class TestXavierInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) - def test_xavier_initializer_supplied_arguments(self): + def test_xavier_initializer_supplied_arguments(self, dtype="float32"): """Test the Xavier initializer with supplied arguments """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.XavierInitializer( fan_in=12, fan_out=23, seed=134)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') limit = np.sqrt(6.0 / (12 + 23)) self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) 
self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) self.assertEqual(init_op.attr('seed'), 134) + return block + + def test_xavier_initializer_fp16(self): + """Test the Xavier initializer with float16 + """ + block = self.test_xavier_initializer_supplied_arguments("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestMSRAInitializer(unittest.TestCase): @@ -380,54 +431,70 @@ class TestMSRAInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA) self.assertEqual(init_op.attr('seed'), 0) - def test_msra_initializer_supplied_arguments(self): + def test_msra_initializer_supplied_arguments(self, dtype="float32"): """Test the MSRA initializer with supplied arguments """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.MSRAInitializer( fan_in=12, seed=134)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') limit = np.sqrt(6.0 / 12) self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA) self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA) self.assertEqual(init_op.attr('seed'), 134) + return block + def test_msra_initializer_fp16(self): + """Test the MSRA initializer with float16 + """ + block = self.test_msra_initializer_supplied_arguments("float16") + self.assertTrue(check_cast_op(block.ops[1])) -class TestMSRAInitializer(unittest.TestCase): - def test_bilinear_initializer(self): + +class TestBilinearInitializer(unittest.TestCase): + def test_bilinear_initializer(self, dtype="float32"): """Test the bilinear initializer with supplied arguments """ program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter( - dtype="float32", + dtype=dtype, shape=[8, 1, 3, 3], lod_level=0, name="param", initializer=initializer.BilinearInitializer()) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'assign_value') + return block + + def test_bilinear_initializer_fp16(self): + """Test the bilinear initializer with supplied arguments + """ + block = self.test_bilinear_initializer("float16") + self.assertTrue(check_cast_op(block.ops[1])) class TestNumpyArrayInitializer(unittest.TestCase): - def test_numpy_array_initializer(self): + def test_numpy_array_initializer(self, dtype="float32"): """Test the numpy array initializer with supplied arguments """ import numpy program = framework.Program() block = program.global_block() - np_array = numpy.random.random((10000)).astype("float32") + np_array = numpy.random.random((10000)).astype(dtype) for _ in range(2): block.create_parameter( dtype=np_array.dtype, @@ -435,10 +502,18 @@ class TestNumpyArrayInitializer(unittest.TestCase): lod_level=0, name="param", initializer=initializer.NumpyArrayInitializer(np_array)) - self.assertEqual(len(block.ops), 1) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'assign_value') assert (init_op.attr('fp32_values') == np_array).all() + return block + + def test_numpy_array_initializer_fp16(self): + """Test the numpy array initializer with float16 + """ + block = 
self.test_numpy_array_initializer("float16") + self.assertTrue(block.ops[1]) if __name__ == '__main__': diff --git a/python/setup.py.in b/python/setup.py.in index 1180c1f69f4..0ce98481f04 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -120,6 +120,7 @@ packages=['paddle', 'paddle.fluid.contrib.slim.distillation', 'paddle.fluid.contrib.utils', 'paddle.fluid.contrib.extend_optimizer', + 'paddle.fluid.contrib.mixed_precision', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details', 'paddle.fluid.incubate', -- GitLab