diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 8754e5d4d0c8c829303f1fe9cd39ead36619ac3b..72d6c20bc644b57c5d1b8188a3f2381d9c6140b9 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -27,6 +27,7 @@ __all__ = [
     'Conv2D',
     'Pool2D',
     'FC',
+    'BatchNorm',
 ]
 
 
@@ -209,14 +210,24 @@ class FC(layers.Layer):
     def __init__(self,
                  size,
                  param_attr=None,
+                 bias_attr=None,
+                 dtype=core.VarDesc.VarType.FP32,
                  num_flatten_dims=1,
-                 dtype=core.VarDesc.VarType.FP32):
+                 act=None,
+                 is_test=False,
+                 name=None):
         super(FC, self).__init__()
+        self._size = size
         self._num_flatten_dims = num_flatten_dims
         self._dtype = dtype
         from ..layer_helper import LayerHelper
-        self._helper = LayerHelper('FC', param_attr=param_attr)
+        self._helper = LayerHelper(
+            'FC',
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            act=act,
+            name=name)
 
     def _build_once(self, input):
         input_shape = input.shape
@@ -247,4 +258,132 @@ class FC(layers.Layer):
             inputs={"X": [tmp]},
             outputs={"Out": out},
             attrs={"use_mkldnn": False})
-        return out
+
+        pre_activation = self._helper.append_bias_op(
+            out, dim_start=self._num_flatten_dims)
+        return self._helper.append_activation(pre_activation)
+
+
+class BatchNorm(layers.Layer):
+    def __init__(self,
+                 num_channels,
+                 act=None,
+                 is_test=False,
+                 momentum=0.9,
+                 epsilon=1e-05,
+                 param_attr=None,
+                 bias_attr=None,
+                 dtype=core.VarDesc.VarType.FP32,
+                 data_layout='NCHW',
+                 in_place=False,
+                 name=None,
+                 moving_mean_name=None,
+                 moving_variance_name=None,
+                 do_model_average_for_mean_and_var=False,
+                 fuse_with_relu=False,
+                 use_global_stats=False):
+        super(BatchNorm, self).__init__()
+
+        assert bias_attr is not False, "bias_attr should not be False in batch_norm."
+
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(
+            'batch_norm', param_attr=param_attr, bias_attr=bias_attr, name=name)
+
+        if dtype == core.VarDesc.VarType.FP16:
+            self._dtype = core.VarDesc.VarType.FP32
+        else:
+            self._dtype = dtype
+
+        param_shape = [num_channels]
+
+        # create parameter
+        self._scale = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=param_shape,
+            dtype=self._dtype,
+            default_initializer=Constant(1.0))
+        # setting stop_gradient=True to reduce computation
+        if use_global_stats and self._helper.param_attr.learning_rate == 0.:
+            self._scale.stop_gradient = True
+
+        self._bias = self._helper.create_parameter(
+            attr=self._helper.bias_attr,
+            shape=param_shape,
+            dtype=self._dtype,
+            is_bias=True)
+        # setting stop_gradient=True to reduce computation
+        if use_global_stats and self._helper.bias_attr.learning_rate == 0.:
+            self._bias.stop_gradient = True
+
+        self._mean = self._helper.create_parameter(
+            attr=ParamAttr(
+                name=moving_mean_name,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=do_model_average_for_mean_and_var),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self._helper.create_parameter(
+            attr=ParamAttr(
+                name=moving_variance_name,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=do_model_average_for_mean_and_var),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+        self._in_place = in_place
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._is_test = is_test
+        self._fuse_with_relu = fuse_with_relu
+        self._use_global_stats = use_global_stats
+
+    def _build_once(self, input):
+        pass
+
+    def forward(self, input):
+        # create output
+        # mean and mean_out share the same memory
+        mean_out = self._mean
+        # variance and variance_out share the same memory
+        variance_out = self._variance
+
+        saved_mean = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        saved_variance = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference(
+            self._dtype)
+
+        self._helper.append_op(
+            type="batch_norm",
+            inputs={
+                "X": input,
+                "Scale": self._scale,
+                "Bias": self._bias,
+                "Mean": self._mean,
+                "Variance": self._variance
+            },
+            outputs={
+                "Y": batch_norm_out,
+                "MeanOut": mean_out,
+                "VarianceOut": variance_out,
+                "SavedMean": saved_mean,
+                "SavedVariance": saved_variance
+            },
+            attrs={
+                "momentum": self._momentum,
+                "epsilon": self._epsilon,
+                "is_test": self._is_test,
+                "use_mkldnn": False,
+                "fuse_with_relu": self._fuse_with_relu,
+                "use_global_stats": self._use_global_stats
+            })
+
+        return self._helper.append_activation(batch_norm_out)
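
For reviewers, a minimal usage sketch of the new layer. The positional Conv2D arguments and the fluid.imperative.guard() entry point follow this branch's imperative API; the input shape and channel counts are illustrative assumptions, not part of the change:

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.imperative.nn import Conv2D, BatchNorm
    from paddle.fluid.imperative.base import to_variable

    with fluid.imperative.guard():
        # NCHW input: batch 2, 3 channels, 32x32 spatial size (illustrative)
        x = np.random.random((2, 3, 32, 32)).astype('float32')
        conv = Conv2D(3, 8, 3, 1, 1)  # num_channels, num_filters, filter_size, stride, padding
        bn = BatchNorm(num_channels=8, act='relu')  # must match the conv's num_filters
        # BatchNorm normalizes per channel, then applies the activation
        y = bn(conv(to_variable(x)))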
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a4787e769f62ebbefd3ea6b70b402e660c02b576..235a1556e749b9925b4574ba33669021af49f193 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2835,7 +2835,7 @@ def batch_norm(input,
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
     # setting stop_gradient=True to reduce computation
     if use_global_stats and helper.bias_attr.learning_rate == 0.:
-        scale.stop_gradient = True
+        bias.stop_gradient = True
 
     mean = helper.create_parameter(
         attr=ParamAttr(
@@ -9412,7 +9412,7 @@ def teacher_student_sigmoid_loss(input,
                                  by the previous operator.
         label (Variable|list): the ground truth which is a 2-D tensor with
                                shape [N x 1], where N is the batch size.
-        soft_max_up_bound (float): if input > soft_max_up_bound, will be bound 
+        soft_max_up_bound (float): if input > soft_max_up_bound, will be bound
         soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound
 
     Returns:
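
The bias.stop_gradient correction above matters when batch_norm is used as a frozen layer. A sketch of the pattern it serves, using the static-graph API (variable names and shapes are illustrative):

    import paddle.fluid as fluid

    img = fluid.layers.data(name='img', shape=[8, 16, 16], dtype='float32')
    # With use_global_stats=True and both learning rates at zero, the fixed
    # code now sets stop_gradient on the bias as well as the scale, so
    # neither parameter gets a gradient op in the backward pass.
    y = fluid.layers.batch_norm(
        input=img,
        param_attr=fluid.ParamAttr(learning_rate=0.),
        bias_attr=fluid.ParamAttr(learning_rate=0.),
        use_global_stats=True)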
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bf80afd49b5221ab530a46ba997cc08288332f7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -0,0 +1,273 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import unittest
+import numpy as np
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
+from paddle.fluid.imperative.base import to_variable
+from test_imperative_base import new_program_scope
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    # base learning rate consumed by optimizer_setting below
+    "lr": 0.1,
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+def optimizer_setting(params):
+    ls = params["learning_strategy"]
+    if ls["name"] == "piecewise_decay":
+        if "total_images" not in params:
+            total_images = 1281167
+        else:
+            total_images = params["total_images"]
+        batch_size = ls["batch_size"]
+        step = int(total_images / batch_size + 1)
+
+        bd = [step * e for e in ls["epochs"]]
+        base_lr = params["lr"]
+        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    return optimizer
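+
+
+# Worked example of the schedule above (illustrative numbers): with the
+# defaults total_images = 1281167 and batch_size = 256,
+# step = int(1281167 / 256 + 1) = 5005 iterations per epoch, so
+# bd = [150150, 300300, 450450] and lr = [0.1, 0.01, 0.001, 0.0001].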
+
+
+class ConvBNLayer(fluid.imperative.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+
+        self._conv = Conv2D(
+            num_channels,
+            num_filters,
+            filter_size,
+            stride, (filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=None)
+
+        self._batch_norm = BatchNorm(num_filters, act=act)
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+
+        return y
+
+
+class BottleneckBlock(fluid.imperative.Layer):
+    def __init__(self, num_channels, num_filters, stride, shortcut=False):
+        super(BottleneckBlock, self).__init__()
+
+        self.conv0 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=1,
+            act='relu')
+        self.conv1 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act='relu')
+        self.conv2 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters * 4,
+            filter_size=1,
+            act=None)
+
+        # shortcut=True means the residual branch needs a 1x1 projection to
+        # match the main branch's shape; otherwise the input is reused as-is.
+        if shortcut:
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=num_filters * 4,
+                filter_size=1,
+                stride=stride)
+
+        self.shortcut = shortcut
+        self._num_channels_out = num_filters * 4
+
+    def forward(self, inputs):
+        y = self.conv0(inputs)
+        y = self.conv1(y)
+        y = self.conv2(y)
+
+        short = self.short(inputs) if self.shortcut else inputs
+
+        return fluid.layers.elementwise_add(x=short, y=y, act='relu')
+
+
+class ResNet(fluid.imperative.Layer):
+    def __init__(self, layers=50, class_dim=1000):
+        super(ResNet, self).__init__()
+
+        self.layers = layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(
+                supported_layers, layers)
+
+        if layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        num_filters = [64, 128, 256, 512]
+
+        self.conv = ConvBNLayer(
+            num_channels=3, num_filters=64, filter_size=7, stride=2,
+            act='relu')
+        self.pool2d_max = Pool2D(
+            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+        self.bottleneck_block_list = []
+        num_channels = 64
+        for block in range(len(depth)):
+            shortcut = True
+            for i in range(depth[block]):
+                bottleneck_block = BottleneckBlock(
+                    num_channels=num_channels,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1,
+                    shortcut=shortcut)
+                num_channels = bottleneck_block._num_channels_out
+                self.bottleneck_block_list.append(bottleneck_block)
+                shortcut = False
+
+        self.pool2d_avg = Pool2D(
+            pool_size=7, pool_type='avg', global_pooling=True)
+
+        import math
+        # FC fan-in after global average pooling is the channel count of the
+        # last stage, 512 * 4 = 2048, hence Uniform(-stdv, stdv) bounds.
+        stdv = 1.0 / math.sqrt(2048 * 1.0)
+
+        self.out = FC(size=class_dim,
+                      act='softmax',
+                      param_attr=fluid.param_attr.ParamAttr(
+                          initializer=fluid.initializer.Uniform(-stdv, stdv)))
+
+    def forward(self, inputs):
+        y = self.conv(inputs)
+        y = self.pool2d_max(y)
+        for bottleneck_block in self.bottleneck_block_list:
+            y = bottleneck_block(y)
+        y = self.pool2d_avg(y)
+        y = self.out(y)
+        return y
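+
+
+# Layer-count check for the configs above: ResNet-50 stacks
+# depth = [3, 4, 6, 3] bottleneck blocks, i.e. 16 blocks * 3 convs = 48
+# conv layers, plus the stem conv and the final FC: 48 + 1 + 1 = 50.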
+
+
+class TestImperativeResnet(unittest.TestCase):
+    def test_resnet_cpu_float32(self):
+        seed = 90
+        batch_size = train_parameters["learning_strategy"]["batch_size"]
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            resnet = ResNet()
+            optimizer = optimizer_setting(train_parameters)
+            train_reader = paddle.batch(
+                paddle.dataset.flowers.train(), batch_size=batch_size)
+
+            dy_param_init_value = {}
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= 2:
+                    break
+
+                x_data = np.array(
+                    [x[0].reshape(3, 224, 224)
+                     for x in data]).astype('float32')
+                y_data = np.array(
+                    [x[1] for x in data]).astype('int64').reshape(
+                        batch_size, 1)
+
+                img = to_variable(x_data)
+                label = to_variable(y_data)
+                label._stop_gradient = True
+
+                out = resnet(img)
+                loss = fluid.layers.cross_entropy(input=out, label=label)
+                avg_loss = fluid.layers.mean(x=loss)
+                dy_out = avg_loss._numpy()
+
+                if batch_id == 0:
+                    for param in fluid.default_main_program().global_block(
+                    ).all_parameters():
+                        dy_param_init_value[param.name] = param._numpy()
+
+                avg_loss._backward()
+                optimizer.minimize(avg_loss)
+
+            dy_param_value = {}
+            for param in fluid.default_main_program().global_block(
+            ).all_parameters():
+                dy_param_value[param.name] = param._numpy()
+
+        # TODO: enable the static-graph baseline below once it matches the
+        # imperative loop; it is kept disabled while under development.
+        # with new_program_scope():
+        #     fluid.default_startup_program().random_seed = seed
+        #     fluid.default_main_program().random_seed = seed
+
+        #     exe = fluid.Executor(fluid.CPUPlace())
+
+        #     resnet = ResNet()
+        #     optimizer = optimizer_setting(train_parameters)
+        #     train_reader = paddle.batch(
+        #         paddle.dataset.flowers.train(), batch_size=batch_size)
+
+        #     img = fluid.layers.data(
+        #         name='pixel', shape=[3, 224, 224], dtype='float32')
+        #     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        #     out = resnet(img)
+        #     loss = fluid.layers.cross_entropy(input=out, label=label)
+        #     avg_loss = fluid.layers.mean(x=loss)
+        #     optimizer.minimize(avg_loss)
+
+        #     # initialize params and fetch them
+        #     static_param_init_value = {}
+        #     static_param_name_list = []
+        #     for param in fluid.default_startup_program().global_block(
+        #     ).all_parameters():
+        #         static_param_name_list.append(param.name)
+
+        #     out = exe.run(fluid.default_startup_program(),
+        #                   fetch_list=static_param_name_list)
+
+        #     for i in range(len(static_param_name_list)):
+        #         static_param_init_value[static_param_name_list[i]] = out[i]
+
+        #     for batch_id, data in enumerate(train_reader()):
+        #         if batch_id >= 2:
+        #             break
+
+        #         x_data = np.array(
+        #             [x[0].reshape(3, 224, 224)
+        #              for x in data]).astype('float32')
+        #         y_data = np.array(
+        #             [x[1] for x in data]).astype('int64').reshape(
+        #                 [batch_size, 1])
+
+        #         fetch_list = [avg_loss.name]
+        #         fetch_list.extend(static_param_name_list)
+        #         out = exe.run(fluid.default_main_program(),
+        #                       feed={"pixel": x_data,
+        #                             "label": y_data},
+        #                       fetch_list=fetch_list)
+
+        #         static_param_value = {}
+        #         static_out = out[0]
+        #         for i in range(1, len(out)):
+        #             static_param_value[static_param_name_list[i - 1]] = out[i]
+
+        #     for key, value in six.iteritems(static_param_init_value):
+        #         self.assertTrue(
+        #             np.allclose(value, dy_param_init_value[key]))
+        #     self.assertTrue(np.allclose(static_out, dy_out))
+        #     for key, value in six.iteritems(static_param_value):
+        #         self.assertTrue(np.allclose(value, dy_param_value[key]))
+
+
+if __name__ == '__main__':
+    unittest.main()
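
As a sanity reference for reviewing the new layer, here is a small numpy sketch of the per-channel, training-time computation the batch_norm op performs (epsilon matches the layer's default; the helper name and shapes are illustrative, not part of the change):

    import numpy as np

    def ref_batch_norm(x, scale, bias, eps=1e-05):
        # x is NCHW; training-time batch norm normalizes over N, H, W per channel
        mean = x.mean(axis=(0, 2, 3), keepdims=True)
        var = x.var(axis=(0, 2, 3), keepdims=True)
        x_hat = (x - mean) / np.sqrt(var + eps)
        return scale.reshape(1, -1, 1, 1) * x_hat + bias.reshape(1, -1, 1, 1)

    # With the layer's default initializers (scale=1, bias=0) the output is
    # simply the normalized activation.
    x = np.random.random((2, 8, 4, 4)).astype('float32')
    out = ref_batch_norm(x, np.ones(8, 'float32'), np.zeros(8, 'float32'))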