diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index 2ee9bf700c260124c34c225848f4941eadf443c2..59a4dac940503cac657e98af290539b03db7204f 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -136,6 +136,17 @@ class ReadFromArrayOp : public ArrayOp {
       auto &dev_ctx = *pool.Borrow(place);
       framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
       out_tensor->set_lod(x_array[offset].lod());
+      if (Input("X") == "dynamic_rnn_0_output_array_fc_0.tmp_0_0@GRAD") {
+        VLOG(10) << "Offset = " << offset;
+        if (x_array[offset].numel() != 0) {
+          auto d = x_array[offset].dims();
+          std::ostringstream sout;
+          for (int64_t i = 0; i < d[0]; ++i) {
+            sout << x_array[offset].data<float>()[i * d[1]] << ", ";
+          }
+          VLOG(10) << "Grad = " << sout.str();
+        }
+      }
     } else {
       VLOG(10) << "offset " << offset << " >= " << x_array.size();
     }
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 11ee96faad5aba0c2dbc13937d0c060aea98078a..d7c34297cd54edbe5b654ec0737bd39c9f9da27a 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -129,6 +129,9 @@ class WhileGradOp : public framework::OperatorBase {
         auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name),
                                       "Cannot find inside gradient %s",
                                       inside_og_name);
+
+        VLOG(10) << "OG " << outside_og_name << " Type is "
+                 << og_outside.Type().name();
         if (og_outside.Type().hash_code() ==
             typeid(framework::LoDTensor).hash_code()) {
           auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
@@ -145,7 +148,6 @@ class WhileGradOp : public framework::OperatorBase {
           inside_array.resize(outside_array.size());
 
           for (size_t j = 0; j < inside_array.size(); ++j) {
-            VLOG(10) << j << " " << outside_array[j].numel();
             if (outside_array[j].numel() != 0) {
               inside_array[j].set_lod(outside_array[j].lod());
               inside_array[j].ShareDataWith(outside_array[j]);
@@ -198,6 +200,17 @@ class WhileGradOp : public framework::OperatorBase {
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {pg_names[param_id], new_inside_name}}},
             {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+
+        VLOG(10) << "Accumulate the gradient of " << pg_names[param_id];
+
+        if (pg_names[param_id] == "W@GRAD") {
+          auto &w_g = detail::Ref(cur_scope.FindVar(new_inside_name))
+                          .Get<framework::LoDTensor>();
+          VLOG(10) << "W_G is " << w_g.data<float>()[0];
+        } else {
+          VLOG(10) << pg_names[param_id];
+        }
+
         sum_op->Run(cur_scope, dev_place);
         cur_scope.Rename(new_inside_name, inside_grad_name);
       }
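Note on the new test below: `BaseRNN.get_numeric_gradient_of_param` checks the analytic `W@GRAD` produced by the while/DynamicRNN backward pass against a central-difference estimate, perturbing each parameter entry by ±delta and re-running the Python forward pass, i.e. g ≈ (f(p + δ) − f(p − δ)) / (2δ). A minimal standalone sketch of that idea (the `numeric_gradient` helper and the toy `X`/`W` sizes are illustrative only, not part of the patch):

```python
import numpy


def numeric_gradient(loss_fn, param, delta=0.01):
    # Central difference, one parameter entry at a time; the entry is
    # restored before moving on so later evaluations are unperturbed.
    grad = numpy.zeros_like(param)
    for idx in numpy.ndindex(*param.shape):
        orig = param[idx]
        param[idx] = orig + delta
        pos = loss_fn()
        param[idx] = orig - delta
        neg = loss_fn()
        param[idx] = orig
        grad[idx] = (pos - neg) / (2.0 * delta)
    return grad


# Same loss shape as SimpleMul below: loss = mean(X . W).
X = numpy.random.random((4, 32))
W = numpy.random.random((32, 10))
g_num = numeric_gradient(lambda: numpy.matmul(X, W).mean(), W)
# Analytic gradient of mean(X . W) w.r.t. W[k, j] is sum_i X[i, k] / (4 * 10).
g_ref = numpy.tile(X.sum(axis=0)[:, numpy.newaxis] / (4 * 10), (1, 10))
assert numpy.allclose(g_num, g_ref, atol=1e-3)
```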
diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..99b92854663766c0eeba38a816c027f8fac4b2cc
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
@@ -0,0 +1,215 @@
+import numpy
+import random
+import collections
+import paddle.v2.fluid as fluid
+import unittest
+import copy
+
+
+class Memory(object):
+    def __init__(self, shape, dtype='float32'):
+        self.ex = numpy.zeros(shape=shape, dtype=dtype)
+        self.cur = None
+
+    def update(self, val):
+        assert val.shape == self.ex.shape
+        assert val.dtype == self.ex.dtype
+        self.cur = val
+
+    def ex(self):
+        return self.ex
+
+    def next(self):
+        self.ex = self.cur
+        self.cur = None
+
+    def __next__(self):
+        self.next()
+
+    def reset(self):
+        self.ex = numpy.zeros(shape=self.ex.shape, dtype=self.ex.dtype)
+        self.cur = None
+
+
+class Output(object):
+    def __init__(self):
+        self.outs = []
+
+    def next_sequence(self):
+        self.outs.append([])
+
+    def out(self, val):
+        self.outs[-1].append(val)
+
+    def last(self):
+        return self.outs[-1][-1]
+
+
+class BaseRNN(object):
+    def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15):
+        self.num_seq = num_seq
+        self.inputs = collections.defaultdict(list)
+
+        for _ in xrange(num_seq):
+            seq_len = random.randint(1, max_seq_len - 1)
+            for iname in ins:
+                ishape = ins[iname].get('shape', None)
+                idtype = ins[iname].get('dtype', 'float32')
+                lst = []
+                for _ in xrange(seq_len):
+                    lst.append(numpy.random.random(size=ishape).astype(idtype))
+                self.inputs[iname].append(lst)
+
+        self.mems = dict()
+        for mname in mems:
+            mshape = mems[mname].get('shape', None)
+            mdtype = mems[mname].get('dtype', 'float32')
+            self.mems[mname] = Memory(shape=mshape, dtype=mdtype)
+
+        self.params = dict()
+        for pname in params:
+            pshape = params[pname].get('shape', None)
+            pdtype = params[pname].get('dtype', 'float32')
+            self.params[pname] = numpy.random.random(size=pshape).astype(pdtype)
+
+        self.outputs = dict()
+
+        for oname in outs:
+            self.outputs[oname] = Output()
+
+    def step(self, **kwargs):
+        pass
+
+    def exe(self):
+        retv = dict()
+        for out in self.outputs:
+            retv[out] = []
+
+        for seq_id in xrange(self.num_seq):
+            for mname in self.mems:
+                self.mems[mname].reset()
+            for out in self.outputs:
+                self.outputs[out].next_sequence()
+
+            iname0 = self.inputs.keys()[0]
+            seq_len = len(self.inputs[iname0][seq_id])
+
+            for step_id in xrange(seq_len):
+                xargs = dict()
+
+                for iname in self.inputs:
+                    xargs[iname] = self.inputs[iname][seq_id][step_id]
+
+                for mname in self.mems:
+                    xargs[mname] = self.mems[mname]
+
+                for pname in self.params:
+                    xargs[pname] = self.params[pname]
+
+                for out in self.outputs:
+                    xargs[out] = self.outputs[out]
+
+                self.step(**xargs)
+
+                for mname in self.mems:
+                    next(self.mems[mname])
+
+            for out in self.outputs:
+                retv[out].append(self.outputs[out].last())
+
+        for out in retv:
+            retv[out] = numpy.array(retv[out])
+        return retv
+
+    def to_feed(self, place):
+        feed_dict = dict()
+
+        for iname in self.inputs:
+            lod = [0]
+            np_flatten = []
+            for seq_id in xrange(len(self.inputs[iname])):
+                seq_len = len(self.inputs[iname][seq_id])
+                lod.append(lod[-1] + seq_len)
+                np_flatten.extend(self.inputs[iname][seq_id])
+
+            t = fluid.Tensor()
+            t.set(numpy.array(np_flatten), place)
+            t.set_lod([lod])
+            feed_dict[iname] = t
+
+        for pname in self.params:
+            feed_dict[pname] = self.params[pname]
+        return feed_dict
+
+    def get_numeric_gradient_of_param(self, param_name, delta=0.01):
+        p = self.params[param_name]
+        g = numpy.zeros(shape=p.shape, dtype=p.dtype)
+
+        for p_it, g_it in numpy.nditer([p, g], op_flags=['readwrite']):
+            o = float(p_it)
+            p_it[...] = o + delta
+            pos = self._exe_mean_out_()
+            p_it[...] = o - delta
+            neg = self._exe_mean_out_()
+            p_it[...] = o
+            g_it[...] = (pos - neg) / (delta * 2)
+        return g
+
+    def _exe_mean_out_(self):
+        outs = self.exe()
+        return numpy.array([o.mean() for o in outs.itervalues()]).mean()
+
+
+class SimpleMul(BaseRNN):
+    def __init__(self):
+        super(SimpleMul, self).__init__({
+            'X': {
+                'shape': [32]
+            }
+        }, {}, {'W': {
+            'shape': [32, 10]
+        }}, ['Out'])
+
+    def step(self, X, W, Out):
+        Out.out(numpy.matmul(X, W))
+
+
+class TestSimpleMul(unittest.TestCase):
+    def setUp(self):
+        self.python_impl = SimpleMul()
+
+    def test_forward(self):
+        program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(program, startup_program):
+            dat = fluid.layers.data(name='X', shape=[32], lod_level=1)
+
+            rnn = fluid.layers.DynamicRNN()
+            with rnn.block():
+                d = rnn.step_input(dat)
+                o = fluid.layers.fc(input=d,
+                                    param_attr='W',
+                                    bias_attr=False,
+                                    size=10,
+                                    act=None)
+                rnn.output(o)
+
+            out = rnn()
+            out = fluid.layers.sequence_pool(out, pool_type='last')
+            loss = fluid.layers.mean(x=out)
+            fluid.backward.append_backward_ops(loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        out, w_g = exe.run(program,
+                           feed=self.python_impl.to_feed(cpu),
+                           fetch_list=[out, "W@GRAD"])
+        out_by_python = self.python_impl.exe()['Out']
+        self.assertTrue(numpy.allclose(out, out_by_python))
+        w_g_num = self.python_impl.get_numeric_gradient_of_param("W")
+        print w_g_num[0][0]
+        print w_g_num - w_g
+
+
+if __name__ == '__main__':
+    unittest.main()
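`TestSimpleMul.test_forward` currently prints `w_g_num[0][0]` and `w_g_num - w_g` rather than asserting on them. A possible follow-up, sketched here with assumed tolerances (not taken from this patch), is to compare the fetched `W@GRAD` with the numeric estimate element-wise:

```python
import numpy


def assert_grad_close(analytic, numeric, rtol=0.05, atol=1e-4):
    # Element-wise check: |analytic - numeric| <= atol + rtol * |numeric|.
    diff = numpy.abs(analytic - numeric)
    bound = atol + rtol * numpy.abs(numeric)
    assert (diff <= bound).all(), 'max abs diff %f' % diff.max()


# e.g. at the end of test_forward:
#     assert_grad_close(w_g, w_g_num)
```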