gradient_checker.py 11.8 KB
Newer Older
1 2 3
import unittest

import numpy
4
import itertools
Y
Yu Yang 已提交
5
import paddle.v2.framework.core as core
Y
Yu Yang 已提交
6
from paddle.v2.framework.op import Operator
Y
Yu Yang 已提交
7

Y
Yu Yang 已提交
8 9
__all__ = ['get_numeric_gradient']

Y
Yu Yang 已提交
10

11
def create_op(op_type):
    """Build an ``Operator`` of ``op_type`` whose variables are named after
    their slots.

    Every input and output slot name reported by ``Operator`` for ``op_type``
    is passed as a keyword argument whose value is the slot name itself.

    :param op_type: the operator type handed to ``Operator``.
    :return: the ``Operator`` instance that was constructed.
    """
    # TODO need to set attrs
    slot_names = list(Operator.get_op_input_names(op_type))
    slot_names.extend(Operator.get_op_output_names(op_type))
    # Each slot is bound to a variable carrying the same name as the slot.
    kwargs = {slot: slot for slot in slot_names}
    return Operator(op_type, **kwargs)


def grad_var_name(var_name):
    """Return the name of the gradient variable paired with ``var_name``.

    The gradient name is ``var_name`` followed by the ``"@GRAD"`` suffix.
    """
    grad_suffix = "@GRAD"
    return var_name + grad_suffix


25 26 27 28
def empty_var_name():
    """Return the placeholder variable name ``"@EMPTY@"``.

    ``GradientChecker.compare_grad`` drops backward outputs carrying this name
    before comparing gradients.
    """
    placeholder = "@EMPTY@"
    return placeholder


Y
Yu Yang 已提交
29 30 31 32
def get_numeric_gradient(op,
                         input_values,
                         output_name,
                         input_to_check,
                         delta=0.005,
                         local_scope=None,
                         in_place=False):
    """
    Get Numeric Gradient for an operator's input.

    The gradient is estimated element by element with a central difference:
    each element of ``input_to_check`` is shifted by ``+delta`` and ``-delta``,
    the operator is run on CPU, and the sum of the ``output_name`` tensor is
    compared between the two runs.

    :param op: C++ operator instance, could be a network.
    :param input_values: The input variables. Should be a dictionary, key is
    variable name. Value is numpy array.
    :param output_name: The final output variable name.
    :param input_to_check: The input variable need to get gradient.
    :param delta: The perturbation value for numeric gradient method. The
    smaller delta is, the more accurate result will get. But if that delta is
    too small, it could occur numerical stability problem.
    :param local_scope: The local scope used for get_numeric_gradient. A new
    ``core.Scope()`` is created when it is None.
    :param in_place: When True, every input tensor is reset from
    ``input_values`` before each perturbed run (presumably for operators that
    overwrite their inputs; confirm against callers).
    :return: The gradient array in numpy format (float32), shaped like the
    checked input tensor.
    """
    if local_scope is None:
        local_scope = core.Scope()

    # Create all input variable in local_scope
    for var_name in input_values:
        var = local_scope.new_var(var_name)
        tensor = var.get_tensor()
        tensor.set_dims(input_values[var_name].shape)
        tensor.alloc_float(core.CPUPlace())
        tensor.set(input_values[var_name], core.CPUPlace())

    # Create all output variable in local_scope
    opts = op.outputs()
    for key in opts:
        for output in opts[key]:
            if local_scope.find_var(output) is None:
                local_scope.new_var(output).get_tensor()
    op.infer_shape(local_scope)

    # allocate output memory
    for key in opts:
        for output in opts[key]:
            local_scope.find_var(output).get_tensor().alloc_float(
                core.CPUPlace())

    cpu_ctx = core.DeviceContext.create(core.CPUPlace())

    def get_output():
        # Run the operator and reduce the checked output to a scalar.
        op.run(local_scope, cpu_ctx)
        return numpy.array(local_scope.find_var(output_name).get_tensor()).sum()

    def product(dim):
        # Explicit loop rather than the builtin ``reduce``, which only exists
        # on Python 2. An empty ``dim`` yields 1, as before.
        result = 1
        for extent in dim:
            result = result * extent
        return result

    def restore_inputs():
        # Overwrite every input tensor with a fresh copy of its original value.
        for var_name in input_values:
            tensor_ = local_scope.find_var(var_name).get_tensor()
            tensor_.set(numpy.copy(input_values[var_name]), core.CPUPlace())

    # get the input tensor whose numeric gradient we want.
    tensor_to_check = local_scope.find_var(input_to_check).get_tensor()
    tensor_size = product(tensor_to_check.get_dims())
    # prepare a numpy array to store the gradient.
    gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32')

    # we only compute gradient of one element each time.
    # we use a for loop to compute the gradient of every element.
    # ``range`` (not ``xrange``) keeps the loop valid on Python 2 and 3.
    for i in range(tensor_size):
        if in_place:
            restore_inputs()
        # get one input element through its flat index i.
        origin = tensor_to_check.get_float_element(i)

        # add delta to it, run op and then get the sum of the result tensor.
        x_pos = origin + delta
        tensor_to_check.set_float_element(i, x_pos)
        y_pos = get_output()

        # subtract delta from this element, run op and get the sum of the
        # result tensor.
        if in_place:
            restore_inputs()
        x_neg = origin - delta
        tensor_to_check.set_float_element(i, x_neg)
        y_neg = get_output()

        # restore old value
        tensor_to_check.set_float_element(i, origin)

        # compute the gradient of this element and store it into a numpy array.
        gradient_flat[i] = (y_pos - y_neg) / delta / 2

    # reshape the gradient result to the shape of the source tensor.
    return gradient_flat.reshape(tensor_to_check.get_dims())


125
class GradientChecker(unittest.TestCase):
    """Test-case base class with helpers for checking operator gradients.

    ``check_grad`` compares analytic gradients produced by the backward
    operator against numeric gradients from ``get_numeric_gradient``;
    ``compare_grad`` compares analytic gradients between CPU and GPU.
    """

    def __get_gradient(self, forward_op, backward_op, input_value, grad_names,
                       place):
        """Get the input gradients after running forward and backward operators
        on the given places.

        :param forward_op: forward operator
        :type forward_op: Operator
        :param backward_op: backward operator
        :type backward_op: Operator
        :param input_value: input values.
        :type input_value: dict{string:numpy.array}
        :param grad_names: the names of returned input gradients.
        :type grad_names: a list of string
        :param place: the device type.
        :type place: CPUPlace or GPUPlace
        :return: the input gradients of given grad_names.
        :rtype: a list of numpy.array
        :raises ValueError: if a key of input_value is not an input of
            forward_op.
        """
        scope = core.Scope()
        ctx = core.DeviceContext.create(place)

        inputs = forward_op.inputs()
        in_names = [item for k in inputs for item in inputs[k]]
        outputs = forward_op.outputs()
        out_names = [item for k in outputs for item in outputs[k]]

        # create input var and set value
        # ``items`` (not ``iteritems``) works on both Python 2 and 3.
        for name, value in input_value.items():
            if name not in in_names:
                raise ValueError(name + " does not exist in Op's inputs.")
            var = scope.new_var(name).get_tensor()
            var.set_dims(value.shape)
            var.set(value, place)

        # run forward op
        for out_name in out_names:
            scope.new_var(out_name)
        forward_op.infer_shape(scope)
        forward_op.run(scope, ctx)

        # set output var's shape
        # set output grad to ones
        for name in out_names:
            out_tensor = scope.find_var(name).get_tensor()
            grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
            grad_tensor.set_dims(out_tensor.shape())
            data = numpy.ones(out_tensor.shape(), dtype=numpy.float32)
            grad_tensor.set(data, place)

        # run backward op
        backward_outs = backward_op.outputs()
        backward_names = [
            item for key in backward_outs for item in backward_outs[key]
        ]
        for name in backward_names:
            scope.new_var(name)

        backward_op.infer_shape(scope)
        backward_op.run(scope, ctx)

        outs = [
            numpy.array(scope.find_var(name).get_tensor())
            for name in grad_names
        ]
        return outs

    def compare_grad(self, forward_op, input_value, no_grad_set=None):
        """ Compare the input gradients between CPU and GPU for the given forward
        operator.

        Returns without checking anything when the build has no GPU support or
        the backward operator reports no GPU support.

        :param forward_op: forward operator
        :type forward_op: Operator
        :param input_value: input values.
        :type input_value: dict{string:numpy.array}
        :param no_grad_set: the set of variables names without gradients.
        :type no_grad_set: a set of string
        :raises: AssertionError, there is different gradient value.
        """
        if no_grad_set is None:
            no_grad_set = set()
        backward_op = core.Operator.backward(forward_op, no_grad_set)
        # return if not compile with GPU or not implementing GPU kernel
        if not (core.is_compile_gpu() and backward_op.support_gpu()):
            return

        outputs = backward_op.outputs()
        out_names = [item for k in outputs for item in outputs[k]]
        # Build a real list: ``filter`` returns a one-shot iterator on
        # Python 3, and out_names is consumed three times below.
        out_names = [name for name in out_names if name != empty_var_name()]
        cpu_grads = self.__get_gradient(forward_op, backward_op, input_value,
                                        out_names, core.CPUPlace())
        gpu_grads = self.__get_gradient(forward_op, backward_op, input_value,
                                        out_names, core.GPUPlace(0))

        for c_grad, g_grad, name in zip(cpu_grads, gpu_grads, out_names):
            self.assertTrue(
                numpy.allclose(
                    c_grad, g_grad, atol=1e-4),
                "output name: " + name + " has diff")

    def __assert_is_close(self, numeric_grads, analytic_grads, names,
                          max_relative_error, msg_prefix):
        """Use relative error for the comparison.

        Elements whose numeric gradient has magnitude below 1e-3 are compared
        by absolute error instead.

        :param numeric_grads: the numerical gradients.
        :type numeric_grads: a list of numpy.array
        :param analytic_grads: the analytical gradients.
        :type analytic_grads: a list of numpy.array
        :param names: the names of gradients, used to print for debug.
        :type names: a list of string
        :param max_relative_error: the largest tolerated error per element.
        :type max_relative_error: float
        :param msg_prefix: string info, used to print for debug.
        :type msg_prefix: string
        :raises: AssertionError, when any element exceeds max_relative_error.
        """
        # The builtin ``zip`` replaces the Python-2-only ``itertools.izip``.
        for a, b, name in zip(numeric_grads, analytic_grads, names):
            abs_a = numpy.abs(a)
            # if abs_a is nearly zero, then use abs error for a, not relative
            # error.
            abs_a[abs_a < 1e-3] = 1

            diff_mat = numpy.abs(a - b) / abs_a
            max_diff = numpy.max(diff_mat)

            def err_msg():
                # Flat index of the first element over the limit.
                offset = numpy.argmax(diff_mat > max_relative_error)
                return "%s Variable %s max gradient diff %f over limit %f, the first " \
                       "error element is %d" % (
                       msg_prefix, name, max_diff, max_relative_error, offset)

            self.assertLessEqual(max_diff, max_relative_error, err_msg())

    def check_grad(self,
                   forward_op,
                   input_vars,
                   inputs_to_check,
                   output_name,
                   no_grad_set=None,
                   only_cpu=False,
                   in_place=False,
                   max_relative_error=0.005):
        """Check analytic gradients of forward_op against numeric gradients.

        :param forward_op: used to create backward_op
        :param input_vars: numpy value of input variable. The following
            computation will use these variables.
        :param inputs_to_check: inputs var names that should check gradient.
        :param output_name: the output variable name of forward network.
        :param max_relative_error: The relative tolerance parameter.
        :param no_grad_set: used when create backward ops
        :param only_cpu: only compute and check gradient on cpu kernel.
        :param in_place: forwarded to get_numeric_gradient.
        :return: None
        :raises ValueError: if forward_op does not have exactly one
            non-intermediate output, or if a name in no_grad_set is not an
            input of forward_op or also appears in inputs_to_check.
        :raises: AssertionError, when a gradient differs by more than
            max_relative_error.
        """
        if no_grad_set is None:
            no_grad_set = set()

        no_tmp_out = forward_op.no_intermediate_outputs()
        if len(no_tmp_out) != 1:
            raise ValueError("non temp out_names should be 1")

        inputs = forward_op.inputs()
        in_names = [item for k in inputs for item in inputs[k]]
        for no_grad in no_grad_set:
            if no_grad not in in_names:
                raise ValueError("no_grad should be in in_names")
            if no_grad in inputs_to_check:
                raise ValueError("no_grad should not be in inputs_to_check")

        backward_op = core.Operator.backward(forward_op, no_grad_set)

        # The CPU is always checked; the GPU is added only when available.
        places = [core.CPUPlace()]
        if not only_cpu and core.is_compile_gpu() and backward_op.support_gpu():
            places.append(core.GPUPlace(0))

        # get numerical gradients
        numeric_grads = [
            get_numeric_gradient(
                forward_op, input_vars, output_name, name, in_place=in_place)
            for name in inputs_to_check
        ]

        check_names = [grad_var_name(name) for name in inputs_to_check]
        for place in places:
            analytic_grads = self.__get_gradient(forward_op, backward_op,
                                                 input_vars, check_names, place)
            self.__assert_is_close(numeric_grads, analytic_grads, check_names,
                                   max_relative_error,
                                   "Gradient Check On %s" % str(place))