gradient_checker.py 9.8 KB
Newer Older
1 2 3
import unittest

import numpy
Y
Yu Yang 已提交
4
import paddle.v2.framework.core as core
Y
Yu Yang 已提交
5
from paddle.v2.framework.op import Operator
Y
Yu Yang 已提交
6

Y
Yu Yang 已提交
7 8
# Names exported by a star-import of this module. Only the numeric-gradient
# helper is listed; GradientChecker has to be imported by name.
__all__ = [
    'get_numeric_gradient',
]

Y
Yu Yang 已提交
9

10 11 12 13 14 15 16 17 18 19 20 21 22 23
def create_op(op_type):
    """
    Build an Operator of the given type with default argument names.

    Every input name and output name that Operator reports for ``op_type``
    is passed to the Operator constructor as a keyword argument whose value
    is that same name.

    :param op_type: The operator type name.
    :return: The constructed Operator instance.
    """
    slot_names = list(Operator.get_op_input_names(op_type))
    slot_names.extend(Operator.get_op_output_names(op_type))

    # Each slot is bound to a value spelled exactly like the slot itself.
    bindings = dict((slot, slot) for slot in slot_names)
    return Operator(op_type, **bindings)


def grad_var_name(var_name):
    """
    Return the name of the gradient variable that belongs to ``var_name``.

    The gradient name is the variable name followed by the "@GRAD" suffix.

    :param var_name: Name of the forward variable.
    :return: Name of the matching gradient variable.
    """
    suffix = "@GRAD"
    return var_name + suffix


Y
Yu Yang 已提交
24 25 26 27
def get_numeric_gradient(op,
                         input_values,
                         output_name,
                         input_to_check,
                         delta=0.005,
                         local_scope=None):
    """
    Get Numeric Gradient for an operator's input.

    The operator is run twice for every element of ``input_to_check``: once
    with the element increased by ``delta`` and once with it decreased by
    ``delta``. The gradient of ``sum(output)`` with respect to that element
    is estimated by the central difference
    ``(y_pos - y_neg) / (2 * delta)``.

    :param op: C++ operator instance, could be a network.
    :param input_values: The input variables. Should be a dictionary, key is
        variable name. Value is numpy array.
    :param output_name: The final output variable name.
    :param input_to_check: The input variable need to get gradient.
    :param delta: The perturbation value for numeric gradient method. The
        smaller delta is, the more accurate result will get. But if that
        delta is too small, it could occur numerical stability problem.
    :param local_scope: The local scope used for get_numeric_gradient. A new
        ``core.Scope()`` is created when it is None.
    :return: The gradient as a float32 numpy array, reshaped to the
        dimensions of the checked tensor.
    :raises ValueError: if ``input_to_check`` is not a variable of the scope.
    """
    if local_scope is None:
        local_scope = core.Scope()

    # Create all input variables in local_scope and copy the values in.
    for var_name, value in input_values.items():
        tensor = local_scope.new_var(var_name).get_tensor()
        tensor.set_dims(value.shape)
        tensor.alloc_float(core.CPUPlace())
        tensor.set(value, core.CPUPlace())

    # Create all output variables in local_scope (unless they already exist),
    # so that infer_shape has something to write the shapes into.
    outputs = op.outputs()
    output_names = [name for key in outputs for name in outputs[key]]
    for name in output_names:
        if local_scope.find_var(name) is None:
            local_scope.new_var(name).get_tensor()
    op.infer_shape(local_scope)

    # Allocate output memory now that the shapes are known.
    for name in output_names:
        local_scope.find_var(name).get_tensor().alloc_float(core.CPUPlace())

    # TODO(yuyang18): Only CPU is supported now.
    cpu_ctx = core.DeviceContext.create(core.CPUPlace())

    def get_output():
        # Run the operator and reduce the checked output to one scalar.
        op.run(local_scope, cpu_ctx)
        return numpy.array(local_scope.find_var(output_name).get_tensor()).sum()

    # Get the input tensor whose numeric gradient is wanted. find_var gives
    # None for an unknown name, so fail with a clear message instead of an
    # AttributeError on None.
    var_to_check = local_scope.find_var(input_to_check)
    if var_to_check is None:
        raise ValueError("input_to_check %s is not a variable in the scope" %
                         input_to_check)
    tensor_to_check = var_to_check.get_tensor()

    # Number of elements is the product of all dimensions (1 for no dims).
    tensor_size = 1
    for dim in tensor_to_check.get_dims():
        tensor_size *= dim

    # Prepare a flat numpy array to store the gradient.
    gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32')

    # Only the gradient of one element is computed at a time, so loop over
    # every element of the tensor.
    for i in range(tensor_size):
        # Get one input element through its flat index i.
        origin = tensor_to_check.get_float_element(i)

        # Add delta to it, run op and get the sum of the result tensor.
        tensor_to_check.set_float_element(i, origin + delta)
        y_pos = get_output()

        # Subtract delta from it, run op and get the sum of the result
        # tensor.
        tensor_to_check.set_float_element(i, origin - delta)
        y_neg = get_output()

        # Restore the old value so that other elements are not affected.
        tensor_to_check.set_float_element(i, origin)

        # Central difference for this element.
        gradient_flat[i] = (y_pos - y_neg) / delta / 2

    # Reshape the gradient result to the shape of the source tensor.
    return gradient_flat.reshape(tensor_to_check.get_dims())


111
class GradientChecker(unittest.TestCase):
    """
    Test case base class that checks an operator's gradient.

    The gradients produced by the backward operator are compared with the
    numeric gradients computed by get_numeric_gradient.
    """

    def assert_is_close(self, numeric_grads, scope, max_relative_error,
                        msg_prefix):
        """
        Assert that the gradients stored in ``scope`` match the numeric ones.

        :param numeric_grads: dictionary, key is the input variable name,
            value is its numeric gradient as a numpy array.
        :param scope: scope that holds the computed gradient variables; each
            one is looked up under ``grad_var_name(name)``.
        :param max_relative_error: the largest allowed element-wise error.
            The error is relative to the numeric gradient, except where the
            numeric gradient is smaller than 1e-3 in magnitude; there the
            absolute error is used.
        :param msg_prefix: text placed at the start of the failure message.
        :raises ValueError: if a gradient variable is missing from ``scope``.
        """
        for name in numeric_grads:
            grad_name = grad_var_name(name)
            grad_var = scope.find_var(grad_name)
            # find_var gives None for an unknown name; report that clearly
            # instead of failing with an AttributeError on None.
            if grad_var is None:
                raise ValueError("gradient variable %s is not in the scope" %
                                 grad_name)

            b = numpy.array(grad_var.get_tensor())
            a = numeric_grads[name]

            abs_a = numpy.abs(a)
            # if abs_a is nearly zero, then use abs error for a, not relative
            # error.
            abs_a[abs_a < 1e-3] = 1

            diff_mat = numpy.abs(a - b) / abs_a
            max_diff = numpy.max(diff_mat)

            def err_msg():
                # Flat index of the first element that exceeds the limit.
                offset = numpy.argmax(diff_mat > max_relative_error)
                return "%s Variable %s max gradient diff %f over limit %f, the first " \
                       "error element is %d" % (
                       msg_prefix, name, max_diff, max_relative_error, offset)

            self.assertLessEqual(max_diff, max_relative_error, err_msg())

    def check_grad(self,
                   forward_op,
                   input_vars,
                   inputs_to_check,
                   output_name,
                   no_grad_set=None,
                   only_cpu=False,
                   max_relative_error=0.005):
        """
        Check the gradients of ``forward_op`` against numeric gradients.

        :param forward_op: used to create backward_op
        :param input_vars: numpy value of input variable. The following
            computation will use these variables.
        :param inputs_to_check: inputs var names that should check gradient.
        :param output_name: name of the output variable whose element sum is
            differentiated when the numeric gradient is computed.
        :param max_relative_error: The relative tolerance parameter.
        :param no_grad_set: used when create backward ops
        :param only_cpu: only compute and check gradient on cpu kernel.
        :return: None
        :raises ValueError: if the forward operator does not have exactly one
            non-intermediate output, if a name in ``no_grad_set`` or a key of
            ``input_vars`` is not an input of the forward operator.
        """
        if no_grad_set is None:
            no_grad_set = set()

        no_tmp_out = forward_op.no_intermediate_outputs()
        if len(no_tmp_out) != 1:
            raise ValueError("non temp out_names should be 1")

        # Flatten the {slot: [variable names]} dictionaries into name lists.
        inputs = forward_op.inputs()
        in_names = [item for k in inputs for item in inputs[k]]
        outputs = forward_op.outputs()
        out_names = [item for k in outputs for item in outputs[k]]

        for no_grad in no_grad_set:
            if no_grad not in in_names:
                raise ValueError("no_grad should be in in_names")
        backward_op = core.Operator.backward(forward_op, no_grad_set)

        bwd_outputs = backward_op.outputs()
        bwd_out_names = [item for k in bwd_outputs for item in bwd_outputs[k]]

        # CPU is always checked; GPU is added only when it is available and
        # the backward operator supports it.
        places = [core.CPUPlace()]
        if not only_cpu and core.is_compile_gpu() and backward_op.support_gpu():
            places.append(core.GPUPlace(0))

        # get numeric gradient
        numeric_grad = dict()
        for check_name in inputs_to_check:
            numeric_grad[check_name] = \
                get_numeric_gradient(forward_op, input_vars, output_name,
                                     check_name)

        # get operator gradient according to different device
        for place in places:
            scope = core.Scope()
            ctx = core.DeviceContext.create(place)

            # create input var and set value
            for name, value in input_vars.items():
                if name not in in_names:
                    raise ValueError(name + " not in op.inputs_")
                var = scope.new_var(name).get_tensor()
                var.set_dims(value.shape)
                var.set(value, place)

            # create output var
            for out_name in out_names:
                scope.new_var(out_name).get_tensor()

            # infer the shape of output var and compute/set value of output var
            forward_op.infer_shape(scope)
            forward_op.run(scope, ctx)

            # create output grad var
            # set shape as the output var
            # set value of this grad to ones
            for name in out_names:
                out_tensor = scope.find_var(name).get_tensor()
                grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
                grad_tensor.set_dims(out_tensor.shape())
                data = 1.0 * numpy.ones(out_tensor.shape())
                grad_tensor.set(data, place)

            # create input grad var
            for name in bwd_out_names:
                scope.new_var(name).get_tensor()

            # infer the shape of input gradient var and compute/set its value
            # with backward op
            backward_op.infer_shape(scope)
            backward_op.run(scope, ctx)

            self.assert_is_close(numeric_grad, scope, max_relative_error,
                                 "Gradient Check On %s" % str(place))
226 227


Y
Yu Yang 已提交
228 229 230 231
if __name__ == '__main__':

    class GetNumericGradientTest(unittest.TestCase):
        """Self-tests of get_numeric_gradient, run when this file is executed."""

        def test_add_op(self):
            add_op = Operator('add_two', X="X", Y="Y", Out="Z")
            x = numpy.random.random((10, 1)).astype("float32")
            y = numpy.random.random((10, 1)).astype("float32")

            # The mean gradient of sum(Z) with respect to X is expected to
            # be 1.0.
            arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X')
            self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2)

        def test_softmax_op(self):
            def stable_softmax(x):
                """Compute the softmax of vector x in a numerically stable way."""
                shiftx = x - numpy.max(x)
                exps = numpy.exp(shiftx)
                return exps / numpy.sum(exps)

            def label_softmax_grad(Y, dY):
                """Compute the input gradient dX row by row from Y and dY."""
                dX = Y * 0.0
                for i in range(Y.shape[0]):
                    d = numpy.dot(Y[i, :], dY[i, :])
                    dX[i, :] = Y[i, :] * (dY[i, :] - d)
                return dX

            softmax_op = Operator("softmax", X="X", Y="Y")

            X = numpy.random.random((2, 2)).astype("float32")
            Y = numpy.apply_along_axis(stable_softmax, 1, X)
            # get_numeric_gradient differentiates the sum of the output, which
            # corresponds to an output gradient of all ones.
            dY = numpy.ones(Y.shape)
            dX = label_softmax_grad(Y, dY)

            arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X')
            # `decimal` is a number of decimal places, not a tolerance:
            # the check is abs(desired - actual) < 1.5 * 10**(-decimal).
            # Passing 1e-2 gave a tolerance of about 1.47, so the assertion
            # could never fail; 2 gives the intended tolerance of 0.015.
            numpy.testing.assert_almost_equal(arr, dX, decimal=2)

    unittest.main()