未验证 提交 4624d7c6 编写于 作者: J Jiabin Yang 提交者: GitHub

test=develop, add gradient sort backward strategy (#17125)

* test=develop, add gradient sort backward strategy

* test=develop, fix test by add FLAGS_cudnn_deterministic on new tests
上级 1d0ba5e8
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-04-25.
//
#pragma once
#ifndef PADDLE_BACKWARDSTRATEGY_H
#define PADDLE_BACKWARDSTRATEGY_H
#endif // PADDLE_BACKWARDSTRATEGY_H
namespace paddle {
namespace imperative {
namespace detail {
class BackwardStrategy {
public:
/* DyGraph now support two kinds of backward strategy, one is sorted sum
* gradient, another is sum gradient once they are created */
// TODO(jiabin): add more Strategy when we support
bool sorted_sum_gradient_{false};
};
} // namespace detail
} // namespace imperative
} // namespace paddle
......@@ -101,26 +101,50 @@ void AddTo(Variable* src, Variable* dst, platform::Place place) {
boost::apply_visitor(func, place);
}
void AddGradBySort(BackwardSumMap* bck_map, VarBase* target) {
PADDLE_ENFORCE(bck_map->find(target) != bck_map->end(),
"Can't find %s in backward grad map", target->Name());
std::pair<platform::Place, std::vector<std::pair<int, VarBase*>>>& current =
bck_map->at(target);
std::sort(
current.second.begin(), current.second.end(),
[](const std::pair<int, VarBase*>& a, const std::pair<int, VarBase*>& b) {
return a.first > b.first;
});
for (auto& var_pair : current.second) {
Variable* origin_grad = target->var_;
Variable* grad_to_add = var_pair.second->var_;
VLOG(2) << "add origin_grad: " << target->Name();
VLOG(2) << "added grad: " << var_pair.second->Name()
<< " trace id is: " << var_pair.first;
AddTo(grad_to_add, origin_grad, current.first);
delete grad_to_add;
var_pair.second = nullptr;
}
}
class Autograd {
public:
Autograd() {}
void RunBackward(VarBase* var) {
void RunBackward(VarBase* var, const detail::BackwardStrategy& bck_stratedy) {
if (var->IsStopGradient()) {
return;
}
VLOG(3) << "start autograd";
bck_map = new BackwardSumMap();
grad_ref = new GradientRef();
std::deque<OpBase*> ready;
ready.push_back(var->PreOp());
std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->PreOp());
std::map<OpBase*, int> dep_counts =
ComputeDepCounts(var->PreOp(), bck_stratedy);
while (!ready.empty()) {
OpBase* ready_op = ready.front();
ready.pop_front();
std::map<std::string, std::vector<VarBase*>> input_grads =
ready_op->ApplyGrad();
ready_op->ApplyGrad(bck_map, grad_ref, bck_stratedy);
for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
const std::vector<VarBase*>& ingrads = it->second;
......@@ -146,7 +170,8 @@ class Autograd {
}
private:
std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
std::map<OpBase*, int> ComputeDepCounts(
OpBase* op, const detail::BackwardStrategy& bck_stratedy) {
std::map<OpBase*, int> ret;
std::deque<OpBase*> queue;
......@@ -156,10 +181,25 @@ class Autograd {
while (!queue.empty()) {
OpBase* candidate = queue.front();
queue.pop_front();
if (bck_stratedy.sorted_sum_gradient_) {
for (const auto& map : candidate->grad_output_vars_) {
for (const auto& it : map) {
for (const auto& vb : it.second) {
if (grad_ref->find(vb) == grad_ref->end()) {
grad_ref->insert(std::make_pair(vb, 1));
} else {
// add ref count by 1 when we find grad_var can be generated by
// one grad_op
grad_ref->at(vb) += 1;
}
}
}
}
}
for (auto it : candidate->pre_ops_) {
for (OpBase* pre_op : it.second) {
if (!pre_op) continue;
VLOG(5) << "op dep " << candidate->Type() << " trace id "
VLOG(2) << "op dep " << candidate->Type() << " trace id "
<< candidate->trace_id_ << " <---- " << it.first << " <---- "
<< pre_op->Type() << " trace id " << pre_op->trace_id_;
if (visited.find(pre_op) == visited.end()) {
......@@ -172,6 +212,9 @@ class Autograd {
}
return ret;
}
BackwardSumMap* bck_map;
GradientRef* grad_ref;
};
std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
......@@ -213,7 +256,9 @@ framework::LoDTensor& VarBase::GradValue() {
return *(grads_->var_->GetMutable<framework::LoDTensor>());
}
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad(
BackwardSumMap* bck_map, GradientRef* grad_ref,
const detail::BackwardStrategy& bck_stratedy) {
PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0,
"%s has no backward implementation", Type());
......@@ -313,9 +358,47 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) {
// track outputs used by sum
if (bck_stratedy.sorted_sum_gradient_) {
#ifndef PADDLE_WITH_CUDA
VLOG(2) << "origin_outputs is : " << origin_outputs[i]->Name() << " ";
VLOG(2) << origin_outputs[i]
->var_->GetMutable<framework::LoDTensor>()
->data<float>()[0];
VLOG(2) << "outputs is : " << outputs[i]->Name() << " ";
VLOG(2) << outputs[i]
->var_->GetMutable<framework::LoDTensor>()
->data<float>()[0];
#endif
if (bck_map->find(origin_outputs[i]) != bck_map->end()) {
VLOG(2) << "add sub grad to " << origin_outputs[i]->Name();
bck_map->at(origin_outputs[i])
.second.emplace_back(
std::pair<int, VarBase*>(this->trace_id_, outputs[i]));
} else {
VLOG(2) << "insert new map for " << origin_outputs[i]->Name();
std::pair<platform::Place, std::vector<std::pair<int, VarBase*>>>
tmp(place_, {std::make_pair(this->trace_id_, outputs[i])});
bck_map->insert(std::make_pair(origin_outputs[i], tmp));
}
PADDLE_ENFORCE(grad_ref->find(origin_outputs[i]) != grad_ref->end(),
"Can't find %s in grad_reference count map",
origin_outputs[i]->Name());
PADDLE_ENFORCE(grad_ref->at(origin_outputs[i]) >= 1,
"Backward error when calculate grad reference");
if (grad_ref->at(origin_outputs[i]) > 1) {
VLOG(2) << "remove ref for " << origin_outputs[i]->Name();
grad_ref->at(origin_outputs[i])--;
} else {
VLOG(2) << "Add grad for: " << origin_outputs[i]->Name();
AddGradBySort(bck_map, origin_outputs[i]);
grad_ref->at(origin_outputs[i])--;
}
} else {
framework::Variable* grad = outputs[i]->var_;
framework::Variable* orig_grad = origin_outputs[i]->var_;
VLOG(3) << "AddTo Called with orig_grad is: "
VLOG(2) << "AddTo Called with orig_grad is: "
<< origin_outputs[i]->name_ << " Grad to be added is "
<< outputs[i]->name_;
AddTo(grad, orig_grad, place_);
......@@ -323,6 +406,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
}
}
}
}
return input_vars_;
}
......@@ -347,7 +431,7 @@ void OpBase::RegisterBackwardHooks(const py::object& callable, bool front) {
}
}
void VarBase::RunBackward() {
void VarBase::RunBackward(const detail::BackwardStrategy& bck_stratedy) {
if (!pre_op_) return;
VLOG(3) << "start backward";
......@@ -360,7 +444,7 @@ void VarBase::RunBackward() {
PADDLE_ENFORCE(
grads_ ==
pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_);
Autograd().RunBackward(this);
Autograd().RunBackward(this, bck_stratedy);
}
void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
......
......@@ -31,7 +31,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
......@@ -217,7 +217,7 @@ class VarBase {
inline OpBase* PreOp() const { return pre_op_; }
inline int PreOpOutIdx() const { return pre_op_out_idx_; }
void RunBackward();
void RunBackward(const detail::BackwardStrategy& bck_stratedy);
inline void ResetPreOp(OpBase* op) {
if (op == pre_op_) {
......@@ -302,7 +302,9 @@ class PYBIND11_HIDDEN OpBase {
}
}
std::map<std::string, std::vector<VarBase*>> ApplyGrad();
std::map<std::string, std::vector<VarBase*>> ApplyGrad(
BackwardSumMap* bck_map, GradientRef* grad_ref,
const detail::BackwardStrategy& bck_stratedy);
inline std::string Type() const { return type_; }
inline std::string GradOpType(size_t index) const {
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace paddle {
......@@ -27,6 +29,11 @@ class OpBase;
typedef std::map<std::string, std::vector<VarBase*>> VarBasePtrMap;
typedef std::map<std::string, std::vector<const VarBase*>> ConstVarBasePtrMap;
typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
typedef std::unordered_map<
const VarBase*,
std::pair<platform::Place, std::vector<std::pair<int, VarBase*>>>>
BackwardSumMap; // var_grad -> {place, {id -> var_grad@rename}}
typedef std::unordered_map<const VarBase*, int> GradientRef;
} // namespace imperative
} // namespace paddle
......@@ -184,6 +184,18 @@ PYBIND11_MODULE(core, m) {
m.def("print_mem_usage",
[]() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
py::class_<imperative::detail::BackwardStrategy> backward_strategy(
m, "BackwardStrategy", R"DOC()DOC");
backward_strategy.def(py::init())
.def_property("sort_sum_gradient",
[](const imperative::detail::BackwardStrategy &self) {
return self.sorted_sum_gradient_;
},
[](imperative::detail::BackwardStrategy &self,
bool sorted_sum_gradient) {
self.sorted_sum_gradient_ = sorted_sum_gradient;
});
m.def("start_imperative_gperf_profiler",
[]() { imperative::StartProfile(); });
......@@ -199,7 +211,10 @@ PYBIND11_MODULE(core, m) {
const std::vector<int64_t>,
const paddle::platform::CUDAPlace, bool, bool>())
.def("_run_backward",
[](imperative::VarBase &self) { self.RunBackward(); })
[](imperative::VarBase &self,
const imperative::detail::BackwardStrategy &bckst) {
self.RunBackward(bckst);
})
.def("_grad_name", &imperative::VarBase::GradName)
.def("_grad_value", &imperative::VarBase::GradValue)
.def("_clear_gradient", &imperative::VarBase::ClearGradient)
......
......@@ -38,6 +38,9 @@ from .checkpoint import *
from . import learning_rate_scheduler
from .learning_rate_scheduler import *
from . import backward_strategy
from .backward_strategy import *
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
......@@ -47,3 +50,4 @@ __all__ += profiler.__all__
__all__ += parallel.__all__
__all__ += checkpoint.__all__
__all__ += learning_rate_scheduler.__all__
__all__ += backward_strategy.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid import core
__all__ = ["BackwardStrategy"]
BackwardStrategy = core.BackwardStrategy
......@@ -529,8 +529,17 @@ class Variable(object):
new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
return np.array(new_ivar.value().get_tensor())
def backward(self):
self._ivar._run_backward()
def backward(self, backward_strategy=None):
from .dygraph import BackwardStrategy
if isinstance(backward_strategy, BackwardStrategy):
self._ivar._run_backward(backward_strategy)
elif backward_strategy is not None:
raise TypeError(
"only BackwardStrategy type should be passed in backward")
else:
backward_strategy = BackwardStrategy()
backward_strategy.sort_sum_gradient = False
self._ivar._run_backward(backward_strategy)
def gradient(self):
new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
......
......@@ -80,6 +80,8 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient)
list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient)
list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext)
list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
......@@ -129,8 +131,12 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${G
py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS} SERIAL)
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
FLAGS_cudnn_deterministic=1 SERIAL)
py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS
FLAGS_cudnn_deterministic=1 SERIAL)
py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS
FLAGS_cudnn_deterministic=1 SERIAL)
py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS
FLAGS_cudnn_deterministic=1 SERIAL)
py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS
FLAGS_cudnn_deterministic=1 SERIAL)
......
......@@ -201,8 +201,21 @@ class TestImperative(unittest.TestCase):
ret = fluid.layers.sums(inputs)
loss = fluid.layers.reduce_sum(ret)
loss.backward()
with fluid.dygraph.guard():
inputs2 = []
for _ in range(10):
inputs2.append(fluid.dygraph.base.to_variable(x))
ret2 = fluid.layers.sums(inputs2)
loss2 = fluid.layers.reduce_sum(ret2)
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
loss2.backward(backward_strategy)
self.assertTrue(np.allclose(ret.numpy(), x * 10))
self.assertTrue(np.allclose(inputs[0].gradient(), x))
self.assertTrue(np.allclose(ret2.numpy(), x * 10))
a = inputs2[0].gradient()
self.assertTrue(np.allclose(inputs2[0].gradient(), x))
def test_layer(self):
with fluid.dygraph.guard():
......@@ -291,6 +304,17 @@ class TestImperative(unittest.TestCase):
x.backward()
dy_grad = l._x_for_debug.gradient()
with fluid.dygraph.guard():
var_inp2 = fluid.dygraph.base.to_variable(np_inp)
l2 = MyLayer("my_layer")
x2 = l2(var_inp2)[0]
self.assertIsNotNone(x2)
dy_out2 = x2.numpy()
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
x2.backward(backward_strategy)
dy_grad2 = l2._x_for_debug.gradient()
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[3], append_batch_size=False)
......@@ -307,6 +331,8 @@ class TestImperative(unittest.TestCase):
self.assertTrue(np.allclose(dy_out, static_out))
self.assertTrue(np.allclose(dy_grad, static_grad))
self.assertTrue(np.allclose(dy_out2, static_out))
self.assertTrue(np.allclose(dy_grad2, static_grad))
def test_mlp(self):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
......@@ -318,6 +344,16 @@ class TestImperative(unittest.TestCase):
out.backward()
dy_grad = mlp._fc1._w.gradient()
with fluid.dygraph.guard():
var_inp2 = fluid.dygraph.base.to_variable(np_inp)
mlp2 = MLP("mlp")
out2 = mlp2(var_inp2)
dy_out2 = out2.numpy()
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
out2.backward(backward_strategy)
dy_grad2 = mlp2._fc1._w.gradient()
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[2, 2], append_batch_size=False)
......@@ -335,6 +371,8 @@ class TestImperative(unittest.TestCase):
self.assertTrue(np.allclose(dy_out, static_out))
self.assertTrue(np.allclose(dy_grad, static_grad))
self.assertTrue(np.allclose(dy_out2, static_out))
self.assertTrue(np.allclose(dy_grad2, static_grad))
params = mlp.parameters(True)
self.assertEqual("mlp/MLP_0/FC_0.w_0", params[0].name)
......@@ -413,6 +451,19 @@ class TestImperative(unittest.TestCase):
dy_grad_h2h = simple_rnn._cell._h2h_w.gradient()
dy_grad_i2h = simple_rnn._cell._i2h_w.gradient()
with fluid.dygraph.guard():
var_inp2 = fluid.dygraph.base.to_variable(np_inp)
var_inp2 = fluid.layers.reshape(var_inp2, shape=[1, 4, 3])
simple_rnn2 = SimpleRNN("simple_rnn")
outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2)
dy_out2 = outs2[3].numpy()
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
outs2[3].backward(backward_strategy)
dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient()
dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient()
dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient()
with new_program_scope():
inp = fluid.layers.data(
name="inp", shape=[1, 4, 3], append_batch_size=False)
......@@ -427,10 +478,15 @@ class TestImperative(unittest.TestCase):
outs[3].name, param_grads[0][1].name,
param_grads[1][1].name, param_grads[2][1].name
])
self.assertTrue(np.allclose(dy_out, static_out))
self.assertTrue(np.allclose(dy_grad_h2o, static_grad_h2o))
self.assertTrue(np.allclose(dy_grad_h2h, static_grad_h2h))
self.assertTrue(np.allclose(dy_grad_i2h, static_grad_i2h))
self.assertTrue(np.allclose(dy_out2, static_out))
self.assertTrue(np.allclose(dy_grad_h2o2, static_grad_h2o))
self.assertTrue(np.allclose(dy_grad_h2h2, static_grad_h2h))
self.assertTrue(np.allclose(dy_grad_i2h2, static_grad_i2h))
if __name__ == '__main__':
......
......@@ -258,7 +258,35 @@ class TestDygraphDeepCF(unittest.TestCase):
dy_loss = loss.numpy()
sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss))
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
deepcf2 = DeepCF('deepcf', num_users, num_items, matrix)
adam2 = fluid.optimizer.AdamOptimizer(0.01)
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
for e in range(NUM_EPOCHES):
sys.stderr.write('epoch %d\n' % e)
for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
if slice + BATCH_SIZE >= users_np.shape[0]:
break
prediction2 = deepcf2(
to_variable(users_np[slice:slice + BATCH_SIZE]),
to_variable(items_np[slice:slice + BATCH_SIZE]))
loss2 = fluid.layers.reduce_sum(
fluid.layers.log_loss(prediction2,
to_variable(labels_np[
slice:slice + BATCH_SIZE])))
loss2.backward(backward_strategy)
adam2.minimize(loss2)
deepcf2.clear_gradients()
dy_loss2 = loss2.numpy()
sys.stderr.write('dynamic loss: %s %s\n' %
(slice, dy_loss2))
self.assertEqual(static_loss, dy_loss)
self.assertEqual(static_loss, dy_loss2)
if __name__ == '__main__':
......
......@@ -170,11 +170,59 @@ class TestDygraphGAN(unittest.TestCase):
dy_g_loss = g_loss.numpy()
dy_d_loss = d_loss.numpy()
dy_params2 = dict()
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
discriminator2 = Discriminator("d")
generator2 = Generator("g")
sgd2 = SGDOptimizer(learning_rate=1e-3)
d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
d_loss_real2 = fluid.layers.reduce_mean(
fluid.layers.sigmoid_cross_entropy_with_logits(
x=d_real2, label=to_variable(np.ones([2, 1], np.float32))))
d_fake2 = discriminator2(
generator2(to_variable(np.ones([2, 2], np.float32))))
d_loss_fake2 = fluid.layers.reduce_mean(
fluid.layers.sigmoid_cross_entropy_with_logits(
x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32))))
d_loss2 = d_loss_real2 + d_loss_fake2
d_loss2.backward(backward_strategy)
sgd2.minimize(d_loss2)
discriminator2.clear_gradients()
generator2.clear_gradients()
d_fake2 = discriminator2(
generator2(to_variable(np.ones([2, 2], np.float32))))
g_loss2 = fluid.layers.reduce_mean(
fluid.layers.sigmoid_cross_entropy_with_logits(
x=d_fake2, label=to_variable(np.ones([2, 1], np.float32))))
g_loss2.backward(backward_strategy)
sgd2.minimize(g_loss2)
for p in discriminator2.parameters():
dy_params2[p.name] = p.numpy()
for p in generator.parameters():
dy_params2[p.name] = p.numpy()
dy_g_loss2 = g_loss2.numpy()
dy_d_loss2 = d_loss2.numpy()
self.assertEqual(dy_g_loss, static_g_loss)
self.assertEqual(dy_d_loss, static_d_loss)
for k, v in six.iteritems(dy_params):
self.assertTrue(np.allclose(v, static_params[k]))
self.assertEqual(dy_g_loss2, static_g_loss)
self.assertEqual(dy_d_loss2, static_d_loss)
for k, v in six.iteritems(dy_params2):
self.assertTrue(np.allclose(v, static_params[k]))
if __name__ == '__main__':
unittest.main()
......@@ -132,8 +132,31 @@ class TestDygraphGNN(unittest.TestCase):
loss = fluid.layers.reduce_sum(loss)
adam = AdamOptimizer(learning_rate=1e-3)
adam.minimize(loss)
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
features2 = np.zeros([1, 100, 50], dtype=np.float32)
# Use selected rows when it's supported.
adj2 = np.zeros([1, 100, 100], dtype=np.float32)
labels2 = np.zeros([100, 1], dtype=np.int64)
model2 = GCN('test_gcn', 50)
logits2 = model2(to_variable(features2), to_variable(adj2))
logits2 = fluid.layers.reshape(logits2, logits2.shape[1:])
# In other example, it's nll with log_softmax. However, paddle's
# log_loss only supports binary classification now.
loss2 = fluid.layers.softmax_with_cross_entropy(
logits2, to_variable(labels2))
loss2 = fluid.layers.reduce_sum(loss2)
adam2 = AdamOptimizer(learning_rate=1e-3)
adam2.minimize(loss2)
self.assertEqual(static_loss, loss.numpy())
self.assertTrue(np.allclose(static_weight, model.gc.weight.numpy()))
self.assertEqual(static_loss, loss2.numpy())
self.assertTrue(np.allclose(static_weight, model2.gc.weight.numpy()))
sys.stderr.write('%s %s\n' % (static_loss, loss.numpy()))
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import contextlib
import unittest
import numpy as np
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from test_imperative_mnist import MNIST
class TestImperativeMnistSortGradient(unittest.TestCase):
def test_mnist_sort_gradient_float32(self):
seed = 90
epoch_num = 1
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
mnist2 = MNIST("mnist")
sgd2 = SGDOptimizer(learning_rate=1e-3)
train_reader2 = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
mnist2.train()
dy_param_init_value2 = {}
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader2()):
dy_x_data2 = np.array(
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data2 = np.array(
[x[1] for x in data]).astype('int64').reshape(128, 1)
img2 = to_variable(dy_x_data2)
label2 = to_variable(y_data2)
label2.stop_gradient = True
cost2 = mnist2(img2)
loss2 = fluid.layers.cross_entropy(cost2, label2)
avg_loss2 = fluid.layers.mean(loss2)
dy_out2 = avg_loss2.numpy()
if epoch == 0 and batch_id == 0:
for param in mnist2.parameters():
dy_param_init_value2[param.name] = param.numpy()
avg_loss2.backward(backward_strategy)
sgd2.minimize(avg_loss2)
mnist2.clear_gradients()
dy_param_value2 = {}
for param in mnist2.parameters():
dy_param_value2[param.name] = param.numpy()
if batch_id == 20:
break
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = mnist(img)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
sgd.minimize(avg_loss)
# initialize params and fetch them
static_param_init_value = {}
static_param_name_list = []
for param in mnist.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
static_x_data = np.array(
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape([128, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
out = exe.run(
fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_param_value = {}
static_out = out[0]
for i in range(1, len(out)):
static_param_value[static_param_name_list[i - 1]] = out[
i]
if batch_id == 20:
break
self.assertTrue(np.allclose(dy_x_data2.all(), static_x_data.all()))
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value2[key]))
self.assertTrue(np.allclose(static_out, dy_out2))
for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value2[key], atol=1e-5))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from test_imperative_ptb_rnn import PtbModel
import numpy as np
import six
class TestDygraphPtbRnnSortGradient(unittest.TestCase):
def test_ptb_rnn_sort_gradient_cpu_float32(self):
seed = 90
hidden_size = 10
vocab_size = 1000
num_layers = 1
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
sgd = SGDOptimizer(learning_rate=1e-3)
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
if i == 0:
for param in ptb_model.parameters():
dy_param_init[param.name] = param.numpy()
dy_loss.backward(backward_strategy)
sgd.minimize(dy_loss)
ptb_model.clear_gradients()
if i == batch_num - 1:
for param in ptb_model.parameters():
dy_param_updated[param.name] = param.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
sgd = SGDOptimizer(learning_rate=1e-3)
x = fluid.layers.data(
name="x", shape=[-1, num_steps, 1], dtype='int64')
y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
init_hidden = fluid.layers.data(
name="init_hidden", shape=[1], dtype='float32')
init_cell = fluid.layers.data(
name="init_cell", shape=[1], dtype='float32')
static_loss, static_last_hidden, static_last_cell = ptb_model(
x, y, init_hidden, init_cell)
sgd.minimize(static_loss)
static_param_updated = dict()
static_param_init = dict()
static_param_name_list = list()
for param in ptb_model.parameters():
static_param_name_list.append(param.name)
out = exe.run(framework.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init[static_param_name_list[i]] = out[i]
static_loss_value = None
static_last_cell_value = None
static_last_hidden_value = None
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, 1))
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
fetch_list = [static_loss, static_last_hidden, static_last_cell]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed={
"x": x_data,
"y": y_data,
"init_hidden": init_hidden_data,
"init_cell": init_cell_data
},
fetch_list=fetch_list)
static_loss_value = out[0]
static_last_hidden_value = out[1]
static_last_cell_value = out[2]
if i == batch_num - 1:
for k in range(3, len(out)):
static_param_updated[static_param_name_list[k -
3]] = out[k]
self.assertTrue(np.array_equal(static_loss_value, dy_loss.numpy()))
self.assertTrue(
np.array_equal(static_last_cell_value, last_cell.numpy()))
self.assertTrue(
np.array_equal(static_last_hidden_value, last_hidden.numpy()))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(np.array_equal(value, dy_param_updated[key]))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from test_imperative_resnet import ResNet
batch_size = 8
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": batch_size,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
},
"batch_size": batch_size,
"lr": 0.1,
"total_images": 1281164,
}
def optimizer_setting(params):
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
if "total_images" not in params:
total_images = 1281167
else:
total_images = params["total_images"]
batch_size = ls["batch_size"]
step = int(total_images / batch_size + 1)
bd = [step * e for e in ls["epochs"]]
base_lr = params["lr"]
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to dygraph mode
# optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
class TestDygraphResnetSortGradient(unittest.TestCase):
def test_resnet_sort_gradient_float32(self):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 20
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
resnet = ResNet("resnet")
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
import random
random.seed = seed
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
dy_param_init_value = {}
for param in resnet.parameters():
dy_param_init_value[param.name] = param.numpy()
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
label.stop_gradient = True
out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss.numpy()
if batch_id == 0:
for param in resnet.parameters():
if param.name not in dy_param_init_value:
dy_param_init_value[param.name] = param.numpy()
avg_loss.backward(backward_strategy)
dy_grad_value = {}
for param in resnet.parameters():
if param.trainable:
np_array = np.array(param._ivar._grad_ivar().value()
.get_tensor())
dy_grad_value[param.name + core.grad_var_suffix(
)] = np_array
optimizer.minimize(avg_loss)
resnet.clear_gradients()
dy_param_value = {}
for param in resnet.parameters():
dy_param_value[param.name] = param.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet("resnet")
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)
import random
random.seed = seed
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
img = fluid.layers.data(
name='pixel', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
optimizer.minimize(avg_loss)
# initialize params and fetch them
static_param_init_value = {}
static_param_name_list = []
static_grad_name_list = []
for param in resnet.parameters():
static_param_name_list.append(param.name)
for param in resnet.parameters():
if param.trainable:
static_grad_name_list.append(param.name +
core.grad_var_suffix())
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
static_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
[batch_size, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
fetch_list.extend(static_grad_name_list)
out = exe.run(fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_param_value = {}
static_grad_value = {}
static_out = out[0]
param_start_pos = 1
grad_start_pos = len(static_param_name_list) + param_start_pos
for i in range(param_start_pos,
len(static_param_name_list) + param_start_pos):
static_param_value[static_param_name_list[
i - param_start_pos]] = out[i]
for i in range(grad_start_pos,
len(static_grad_name_list) + grad_start_pos):
static_grad_value[static_grad_name_list[
i - grad_start_pos]] = out[i]
self.assertTrue(np.allclose(static_out, dy_out))
self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any()))
self.assertEqual(len(dy_grad_value), len(static_grad_value))
for key, value in six.iteritems(static_grad_value):
self.assertTrue(np.allclose(value, dy_grad_value[key]))
self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any()))
self.assertEqual(len(dy_param_value), len(static_param_value))
for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key]))
self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any()))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.fluid import Embedding, LayerNorm, FC, Layer
from paddle.fluid.dygraph import to_variable, guard
from test_imperative_base import new_program_scope
from paddle.fluid import core
from test_imperative_transformer import TransFormer, TrainTaskConfig, ModelHyperParams
import numpy as np
import six
np.set_printoptions(suppress=True)
def create_data(is_static=False):
if is_static:
return [
src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np,
trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np,
lbl_weight_np
]
else:
enc_inputs = [
to_variable(
src_word_np, name='src_word'), to_variable(
src_pos_np, name='src_pos'), to_variable(
src_slf_attn_bias_np, name='src_slf_attn_bias')
]
dec_inputs = [
to_variable(
trg_word_np, name='trg_word'), to_variable(
trg_pos_np, name='trg_pos'), to_variable(
trg_slf_attn_bias_np, name='trg_slf_attn_bias'),
to_variable(
trg_src_attn_bias_np, name='trg_src_attn_bias')
]
label = to_variable(lbl_word_np, name='lbl_word')
weight = to_variable(lbl_weight_np, name='lbl_weight')
return enc_inputs, dec_inputs, label, weight
def create_feed_dict_list(data, init=False):
if init:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names
else:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields
feed_dict_list = dict()
for i in range(len(data_input_names)):
feed_dict_list[data_input_names[i]] = data[i]
return feed_dict_list
def make_all_inputs(input_fields):
"""
Define the input data layers for the transformer model.
"""
inputs = []
for input_field in input_fields:
input_var = fluid.layers.data(
name=input_field,
shape=input_descs[input_field][0],
dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False)
inputs.append(input_var)
return inputs
# The placeholder for batch_size in compile time. Must be -1 currently to be
# consistent with some ops' infer-shape output in compile time, such as the
# sequence_expand op used in beamsearch decoder.
batch_size = -1
# The placeholder for squence length in compile time.
seq_len = ModelHyperParams.max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
# compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size, max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size, max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# The actual data shape of trg_word is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(batch_size * seq_len, 1), "int64"],
# This input is used to mask out the loss of paddding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(batch_size * seq_len, 1), "float32"],
# This input is used in beam-search decoder.
"init_score": [(batch_size, 1), "float32", 2],
# This input is used in beam-search decoder for the first gather
# (cell states updation)
"init_idx": [(batch_size, ), "int32"],
}
# Names of word embedding table which might be reused for weight sharing.
word_emb_param_names = (
"src_word_emb_table",
"trg_word_emb_table", )
# Names of position encoding table which will be initialized externally.
pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table", )
# separated inputs for different usages.
encoder_data_input_fields = (
"src_word",
"src_pos",
"src_slf_attn_bias", )
decoder_data_input_fields = (
"trg_word",
"trg_pos",
"trg_slf_attn_bias",
"trg_src_attn_bias",
"enc_output", )
label_data_input_fields = (
"lbl_word",
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"init_idx",
"trg_src_attn_bias", )
# if we use py_reader
use_py_reader = False
# if we run sync mode
sync = False
# how many batches we use
batch_num = 5
np.random.seed = 90
src_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(TrainTaskConfig.batch_size, seq_len, 1),
dtype='int64')
src_pos_np = np.random.randint(
1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
trg_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(TrainTaskConfig.batch_size, seq_len, 1),
dtype='int64')
trg_pos_np = np.random.randint(
1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
trg_src_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
lbl_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(TrainTaskConfig.batch_size * seq_len, 1),
dtype='int64')
lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len,
1).astype('float32')
class TestDygraphTransformerSortGradient(unittest.TestCase):
def test_transformer_sort_gradient_float32(self):
seed = 90
with guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
if sync:
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
with fluid.default_main_program()._lr_schedule_guard():
learning_rate = lr_decay * TrainTaskConfig.learning_rate
optimizer = fluid.optimizer.Adam(
learning_rate=learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
dy_param_init = dict()
dy_param_updated = dict()
for i in range(batch_num):
enc_inputs, dec_inputs, label, weights = create_data()
dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
if i == 0:
for param in transformer.parameters():
dy_param_init[param.name] = param.numpy()
dy_avg_cost.backward(backward_strategy)
optimizer.minimize(dy_avg_cost)
transformer.clear_gradients()
if i == batch_num - 1:
for param in transformer.parameters():
dy_param_updated[param.name] = param.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
all_inputs = make_all_inputs(data_input_names)
enc_inputs_len = len(encoder_data_input_fields)
dec_inputs_len = len(decoder_data_input_fields[:-1])
enc_inputs = all_inputs[0:enc_inputs_len]
dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
dec_inputs_len]
label = all_inputs[-2]
weights = all_inputs[-1]
static_param_updated = dict()
static_param_init = dict()
static_param_name_list = list()
static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
optimizer.minimize(static_avg_cost)
for param in transformer.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init[static_param_name_list[i]] = out[i]
static_sum_cost_value = None
static_avg_cost_value = None
static_predict_value = None
static_token_num_value = None
for i in range(batch_num):
feed_dict = create_feed_dict_list(create_data(True))
fetch_list = [
static_sum_cost, static_avg_cost, static_predict,
static_token_num
]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=fetch_list)
static_sum_cost_value = out[0]
static_avg_cost_value = out[1]
static_predict_value = out[2]
static_token_num_value = out[3]
if i == batch_num - 1:
for k in range(4, len(out)):
static_param_updated[static_param_name_list[k -
4]] = out[k]
self.assertTrue(
np.array_equal(static_avg_cost_value, dy_avg_cost.numpy()))
self.assertTrue(
np.array_equal(static_sum_cost_value, dy_sum_cost.numpy()))
self.assertTrue(
np.array_equal(static_predict_value, dy_predict.numpy()))
self.assertTrue(
np.array_equal(static_token_num_value, dy_token_num.numpy()))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(np.array_equal(value, dy_param_updated[key]))
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册