Commit 6ebf0f47 authored by zhongpu, committed by hong

support SelectedRows in dygraph, test=develop (#21078)

* support SelectedRows in dygraph, test=develop

* fix bug of _grad_ivar interface, test=develop

* add optest for supporting SelectedRows, test=develop

* fix bug for gradient_accumulator in GPU mode, test=develop

* fix error when adding SelectedRows to LoDTensor in sorted_gradient mode in dygraph, test=develop

* refine and simplify gradient accumulator code, test=develop

* add optest, test=develop

* add optest and simplify code, test=develop

* fix bug for test_imperative_selected_rows, test=develop

* add optest for Coverage, test=develop

* fix gradient interface and simplify code, test=develop

* update api for gradient, test=develop

* fix ShareDim's bug in DygraphExecutionContext class, test=develop

* add optest, test=develop
Parent 70eb3976
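For orientation, here is a minimal dygraph usage sketch of what this change enables (a hedged example assuming the fluid 1.6-era API exercised by the new tests below; the layer and variable names are illustrative): an Embedding created with is_sparse=True now produces a SelectedRows gradient that backward(), the gradient accumulators and the optimizer can all handle.

```python
# Minimal sketch (assumptions: fluid 1.6-era dygraph API as used in the tests
# below; names such as "emb" are illustrative). With is_sparse=True the
# embedding weight's gradient is a SelectedRows rather than a dense LoDTensor.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.optimizer import SGDOptimizer

with fluid.dygraph.guard(fluid.CPUPlace()):
    emb = fluid.dygraph.Embedding("emb", size=[20, 32], is_sparse=True)
    sgd = SGDOptimizer(learning_rate=0.001)
    ids = to_variable(np.array([[[1], [2]], [[2], [1]]]).astype('int64'))
    out = emb(ids)
    out.backward()        # accumulates a SelectedRows gradient for emb._w
    sgd.minimize(out)     # the sparse gradient is applied by the optimizer
    emb.clear_gradients()
```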
@@ -2,7 +2,7 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
- cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows var_type_traits layer)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer)
add_subdirectory(jit)
cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
...
@@ -131,11 +131,12 @@ class GradOpBaseMakerBase {
"VarBase grad of OP [%s] should not be null",
fw_op_base_->Type());
auto grad_var_base_tmp = var_base_temp->GradVarBase();
if (!is_input) {
auto* tensor = grad_var_base_tmp->MutableVar()
->GetMutable<framework::LoDTensor>();
tensor->Resize(
var_base_temp->Var().Get<framework::LoDTensor>().dims());
}
vec_temp.emplace_back(grad_var_base_tmp);
} else {
vec_temp.emplace_back(var_base_temp);
...
@@ -16,11 +16,13 @@
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
@@ -84,7 +86,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
auto data_type = src_tensor.type();
auto place = src_tensor.place();
- #define PADDLE_TENSOR_ADD_MACRO(cpp_type) \
#define PADDLE_TENSOR_ADD(cpp_type) \
if (data_type == framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
@@ -93,25 +95,155 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
return; \
}
- PADDLE_TENSOR_ADD_MACRO(float);
- PADDLE_TENSOR_ADD_MACRO(double);
- #undef PADDLE_TENSOR_ADD_MACRO
PADDLE_TENSOR_ADD(float);
PADDLE_TENSOR_ADD(double);
#undef PADDLE_TENSOR_ADD
PADDLE_THROW("Not supported data type %s for AddTo",
framework::DataTypeToString(data_type));
}
void SelectedRowsAddToTensor(const framework::Variable& src,
framework::Variable* dst) {
auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
auto& src_selected_rows = src.Get<framework::SelectedRows>();
auto place = dst_tensor->place();
auto data_type = src_selected_rows.value().type();
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
#define PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(dev_ctx_type, cpp_type) \
if (data_type == framework::DataTypeTrait<cpp_type>::DataType()) { \
paddle::platform::DeviceContext* dev_ctx = pool.Get(place); \
paddle::operators::math::SelectedRowsAddToTensor<dev_ctx_type, cpp_type> \
functor; \
functor(*(dynamic_cast<dev_ctx_type*>(dev_ctx)), src_selected_rows, \
dst_tensor); \
return; \
}
#ifdef PADDLE_WITH_CUDA
if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double);
} else {
#endif
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, double);
#ifdef PADDLE_WITH_CUDA
}
#endif
#undef PADDLE_SELECTED_ROWS_ADD_TO_TENSOR
PADDLE_THROW(platform::errors::InvalidArgument(
"Not supported data type %s for SelectedRowsAddToTensor",
framework::DataTypeToString(data_type)));
}
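As a rough illustration of the semantics of SelectedRowsAddToTensor above (not its implementation), each row of the sparse value is scatter-added into the dense destination at the index given by rows(); a small numpy sketch:

```python
# NumPy sketch of the SelectedRows-to-LoDTensor addition above (illustration
# only; the real work is done by operators::math::SelectedRowsAddToTensor).
import numpy as np

def selected_rows_add_to_tensor(rows, value, dense):
    # rows: row indices of the sparse gradient; value: [len(rows), width]
    for i, r in enumerate(rows):
        dense[r] += value[i]   # scatter-add row i of value into dense row r
    return dense

dense = np.zeros((4, 3))
selected_rows_add_to_tensor([1, 3, 1], np.ones((3, 3)), dense)
# dense row 1 accumulated twice, row 3 once; rows 0 and 2 stay zero
```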
// Note(chenweihang): when two SelectedRows need to be added, adding one to
// the other in place is not equivalent to merging both into a new, empty
// SelectedRows; the latter (merge, then add) is the correct approach.
std::shared_ptr<VarBase> SelectedRowsMerge(const framework::Variable& src1,
const framework::Variable& src2) {
auto& src_selected_rows1 = src1.Get<framework::SelectedRows>();
auto& src_selected_rows2 = src2.Get<framework::SelectedRows>();
auto place = src_selected_rows1.value().place();
auto data_type = src_selected_rows1.value().type();
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
std::vector<const framework::SelectedRows*> src_selected_rows;
src_selected_rows.emplace_back(&src_selected_rows1);
src_selected_rows.emplace_back(&src_selected_rows2);
auto dst_var = std::make_shared<VarBase>(false, "Temp");
auto* dst_selected_rows =
dst_var->MutableVar()->GetMutable<framework::SelectedRows>();
#define PADDLE_SELECTED_ROWS_ADD(dev_ctx_type, cpp_type) \
if (data_type == framework::DataTypeTrait<cpp_type>::DataType()) { \
paddle::platform::DeviceContext* dev_ctx = pool.Get(place); \
paddle::operators::math::scatter::MergeAdd<dev_ctx_type, cpp_type> \
merge_add; \
merge_add(*(dynamic_cast<dev_ctx_type*>(dev_ctx)), src_selected_rows, \
dst_selected_rows); \
return dst_var; \
}
#ifdef PADDLE_WITH_CUDA
if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float);
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double);
} else {
#endif
PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, float);
PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, double);
#ifdef PADDLE_WITH_CUDA
}
#endif
#undef PADDLE_SELECTED_ROWS_ADD
PADDLE_THROW(platform::errors::InvalidArgument(
"Not supported data type %s for SelectedRowsMerge",
framework::DataTypeToString(data_type)));
}
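A numpy sketch of the merge semantics the note above refers to (illustration only; the actual kernel is math::scatter::MergeAdd, and the sorted row order here is an assumption of the sketch): rows from both inputs are combined into a freshly built SelectedRows and values sharing a row index are summed, which is why adding one SelectedRows into the other in place would be wrong.

```python
# NumPy sketch of SelectedRowsMerge / MergeAdd (illustration only).
import numpy as np

def merge_add(rows1, value1, rows2, value2):
    acc = {}
    for rows, value in ((rows1, value1), (rows2, value2)):
        for i, r in enumerate(rows):
            acc[r] = acc.get(r, 0) + value[i]   # sum duplicate row indices
    merged_rows = sorted(acc)
    merged_value = np.stack([acc[r] for r in merged_rows])
    return merged_rows, merged_value

rows, value = merge_add([1, 3], np.ones((2, 3)), [3, 5], 2 * np.ones((2, 3)))
# rows == [1, 3, 5]; the two contributions to row 3 (1 and 2) are summed to 3
```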
void VarBaseAdd(std::shared_ptr<VarBase> var, VarBase* var_) {
auto& src = var->Var();
auto* dst = var_->MutableVar();
if (dst->IsType<framework::LoDTensor>()) {
if (src.IsType<framework::LoDTensor>()) {
TensorAdd(src, dst);
} else if (src.IsType<framework::SelectedRows>()) {
SelectedRowsAddToTensor(src, dst);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unexpected branch, output variable type is %s",
framework::ToTypeName(dst->Type())));
}
} else {
if (src.IsType<framework::LoDTensor>()) {
auto* src_mutable = var->MutableVar();
SelectedRowsAddToTensor(*dst, src_mutable);
*dst = std::move(*(var->MutableVar()));
var_->SetType(framework::proto::VarType::LOD_TENSOR);
} else if (src.IsType<framework::SelectedRows>()) {
std::shared_ptr<VarBase> temp = SelectedRowsMerge(src, *dst);
*dst = std::move(*(temp->MutableVar()));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unexpected branch, output variable type is %s",
framework::ToTypeName(dst->Type())));
}
}
}
platform::Place GetPlaceOfVarBase(const std::shared_ptr<VarBase>& var) {
platform::Place place;
if (var->Var().IsType<framework::LoDTensor>()) {
place = var->Var().Get<framework::LoDTensor>().place();
} else if (var->Var().IsType<framework::SelectedRows>()) {
place = var->Var().Get<framework::SelectedRows>().place();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"only support LoDTensor and SelectedRows in dygraph"));
}
return place;
}
void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
- auto place = var->Var().Get<framework::LoDTensor>().place();
platform::Place place = GetPlaceOfVarBase(var);
if (!var_->OverridedStopGradient()) {
VLOG(3) << "Sum Gradient for: " << var_->Name();
if (cur_cnt_ == 0) {
if (var->Var().IsType<framework::SelectedRows>()) {
var_->SetType(framework::proto::VarType::SELECTED_ROWS);
}
*dst_var = std::move(*(var->MutableVar()));
} else {
- TensorAdd(var->Var(), dst_var);
VarBaseAdd(var, var_);
}
} else {
if (!var_->Var().IsInitialized() ||
@@ -139,10 +271,15 @@ void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
void SortedGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
- auto place = var->Var().Get<framework::LoDTensor>().place();
platform::Place place = GetPlaceOfVarBase(var);
if (!var_->OverridedStopGradient()) {
if (ref_cnt_ == 1) {
if (var->Var().IsType<framework::SelectedRows>()) {
var_->SetType(framework::proto::VarType::SELECTED_ROWS);
*dst_var = std::move(*(var->MutableVar()));
} else {
*dst_var = std::move(*(var->MutableVar()));
}
} else {
if (tmp_grad_vars_.empty()) {
tmp_grad_vars_.reserve(ref_cnt_);
@@ -160,11 +297,47 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VarBase> var,
return p1.second > p2.second;
});
#ifdef PADDLE_WITH_CUDA
if (paddle::platform::is_gpu_place(place)) {
bool dst_varbase_is_initialized = false;
// accumulate selected rows firstly
for (size_t i = 0; i < tmp_grad_vars_.size(); ++i) {
if (tmp_grad_vars_[i]
.first->Var()
.IsType<framework::SelectedRows>()) {
if (!dst_varbase_is_initialized) {
dst_varbase_is_initialized = true;
var_->SetType(framework::proto::VarType::SELECTED_ROWS);
*dst_var = std::move(*(tmp_grad_vars_[i].first->MutableVar()));
} else {
VarBaseAdd(tmp_grad_vars_[i].first, var_);
}
}
}
// accumulate lod tensor
for (size_t i = 0; i < tmp_grad_vars_.size(); ++i) {
if (!dst_varbase_is_initialized) {
dst_varbase_is_initialized = true;
*dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
}
if (tmp_grad_vars_[i].first->Var().IsType<framework::LoDTensor>()) {
VarBaseAdd(tmp_grad_vars_[i].first, var_);
}
}
} else {
#endif
if (tmp_grad_vars_[0].first->Var().IsType<framework::SelectedRows>()) {
var_->SetType(framework::proto::VarType::SELECTED_ROWS);
*dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
} else {
*dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
}
for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) {
- TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var);
VarBaseAdd(tmp_grad_vars_[i].first, var_);
}
#ifdef PADDLE_WITH_CUDA
}
#endif
tmp_grad_vars_.clear();
}
} else {
...
@@ -16,6 +16,7 @@
#include <algorithm>
#include <queue>
#include <utility>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/imperative/prepared_operator.h"
@@ -205,23 +206,32 @@ void VarBase::AddGradOps(const std::weak_ptr<OpBase>& op) {
void VarBase::ClearGradient() {
if (grad_var_) {
if (grad_var_->var_.IsType<framework::SelectedRows>()) {
auto* grad_t = grad_var_->var_.GetMutable<framework::SelectedRows>();
if (grad_t->mutable_value()->IsInitialized()) {
grad_t->mutable_rows()->clear();
grad_t->mutable_value()->clear();
}
} else {
auto* grad_t = grad_var_->var_.GetMutable<framework::LoDTensor>();
if (grad_t->IsInitialized()) {
auto* dev_ctx =
platform::DeviceContextPool::Instance().Get(grad_t->place());
operators::math::set_constant(*dev_ctx, grad_t, 0.0);
}
}
}
}
std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
const bool blocking) const {
- PADDLE_ENFORCE_EQ(var_.IsInitialized() && var_.IsType<framework::LoDTensor>(),
- true,
- "Variable must be initialized and type of LoDTensor when "
- "getting numpy tensor");
PADDLE_ENFORCE_EQ(
var_.IsInitialized() && (var_.IsType<framework::LoDTensor>() ||
var_.IsType<framework::SelectedRows>()),
true, platform::errors::InvalidArgument(
"Variable is not initialized or Variable's type is not "
"LoDTensor or SelectedRows when getting numpy tensor"));
if (var_.IsType<framework::LoDTensor>()) {
auto& src_tensor = var_.Get<framework::LoDTensor>();
// TODO(Jiabin): change this after move unique_name generator to CXX
@@ -245,6 +255,30 @@ std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
}
return new_var;
} else {
auto& src_selected_rows = var_.Get<framework::SelectedRows>();
auto new_var = std::make_shared<VarBase>(
false, "Itmp" + std::to_string(copied_counter_++));
new_var->SetType(framework::proto::VarType::SELECTED_ROWS);
auto* dst_selected_rows =
new_var->var_.GetMutable<framework::SelectedRows>();
framework::TensorCopy(src_selected_rows.value(), dst_place,
dst_selected_rows->mutable_value());
if (blocking) {
platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
auto src_place = src_selected_rows.place();
if (!(src_place == dst_place)) {
platform::DeviceContextPool::Instance().Get(src_place)->Wait();
}
}
dst_selected_rows->set_height(src_selected_rows.height());
dst_selected_rows->set_rows(src_selected_rows.rows());
if (platform::is_gpu_place(dst_place)) {
VLOG(3) << "copy selected rows " << Name() << " from gpu";
}
return new_var;
}
}
// create OpBase from optype
OpBase::OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins,
...
@@ -453,6 +453,10 @@ class RuntimeInferVarTypeContext : public framework::InferVarTypeContext {
VLOG(2) << "SUPER UGLY FIX, remove this when move imperative mode in C++";
} else {
var_set_[name]->SetType(type);
if ((var_set_[name]->MutableVar()->IsInitialized() == true) &&
(var_set_[name]->MutableVar()->Type() != type)) {
var_set_[name]->MutableVar()->Clear();
}
}
}
@@ -766,9 +770,17 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
platform::errors::PreconditionNotMet(
"The type of %s and %s is not the same.", in, out));
if (in_var->IsType<framework::LoDTensor>()) {
auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
out_lod_tensor->Resize(in_lod_tensor.dims());
} else {
auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
out_sele_rows->set_rows(in_sele_rows.rows());
out_sele_rows->set_height(in_sele_rows.height());
}
}
void ShareAllLoD(const std::string& in,
...
@@ -5,7 +5,7 @@ else()
endif(WIN32)
- cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS gradient_accumulator memcpy)
cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows selected_rows_functor gradient_accumulator)
cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy)
cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split assign_op place)
cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy)
@@ -277,14 +277,19 @@ void BindImperative(py::module *m_ptr) {
.def("_grad_ivar",
[](const imperative::VarBase &self) {
auto &grad_var = self.GradVarBase();
if (grad_var && grad_var->Var().IsInitialized()) {
auto *tensor =
- grad_var->MutableVar()->GetMutable<framework::LoDTensor>();
- if (grad_var && grad_var->Var().IsInitialized() &&
- tensor->IsInitialized()) {
grad_var->MutableVar()->IsType<framework::LoDTensor>()
? grad_var->MutableVar()
->GetMutable<framework::LoDTensor>()
: grad_var->MutableVar()
->GetMutable<framework::SelectedRows>()
->mutable_value();
if (tensor->IsInitialized()) {
return grad_var;
} else {
return std::shared_ptr<imperative::VarBase>(nullptr);
}
}
return std::shared_ptr<imperative::VarBase>(nullptr);
},
py::return_value_policy::copy)
.def("_copy_to",
@@ -305,6 +310,9 @@ void BindImperative(py::module *m_ptr) {
if (self.Var().IsType<framework::LoDTensor>()) {
return framework::vectorize<int>(
self.Var().Get<framework::LoDTensor>().dims());
} else if (self.Var().IsType<framework::SelectedRows>()) {
return framework::vectorize<int>(
self.Var().Get<framework::SelectedRows>().value().dims());
} else {
VLOG(2) << "It is meaningless to get shape of variable type "
<< GetTypeName(self);
...
@@ -263,7 +263,11 @@ class GradClipByGlobalNorm(GradClipBase):
for p, g in para_and_grad:
if g is None:
continue
- power = layers.square(g)
merge_grad = g
if g._ivar.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
power = layers.square(merge_grad)
sum_t = layers.reduce_sum(power)
norm_arr.append(sum_t)
@@ -280,7 +284,7 @@ class GradClipByGlobalNorm(GradClipBase):
if g is None:
out.append((p, g))
continue
- new_grad = g * clip_scale
new_grad = layers.elementwise_mul(x=g, y=clip_scale)
out.append((p, new_grad))
...
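A hedged sketch of what the first changed branch above does for a sparse gradient (dygraph mode; the helper name grad_square_sum is illustrative): duplicate rows of the SelectedRows gradient are merged and the result is densified before the square/reduce_sum that feeds the global norm.

```python
# Sketch of the sparse-gradient branch in GradClipByGlobalNorm above
# (assumes dygraph mode; grad_square_sum is an illustrative helper name).
import paddle.fluid.core as core
from paddle.fluid import layers

def grad_square_sum(g):
    merge_grad = g
    if g._ivar.type == core.VarDesc.VarType.SELECTED_ROWS:
        # sum duplicate rows, then convert to a dense tensor
        merge_grad = layers.merge_selected_rows(g)
        merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
    return layers.reduce_sum(layers.square(merge_grad))
```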
@@ -902,7 +902,7 @@ class Variable(object):
Get the Gradient of Current Variable
Returns:
- ndarray: Numpy value of the gradient of current Variable
ndarray or tuple of ndarray: if Variable's type is LoDTensor, return numpy value of the gradient of current Variable, if Variable's type is SelectedRows, return tuple of ndarray, first element of tuple is numpy value of the gradient of current Variable, second element of tuple is numpy value of the rows of current Variable.
Examples:
.. code-block:: python
@@ -929,11 +929,11 @@ class Variable(object):
raise ValueError("%s has no grad, Please set Variable.stop_gradient=False, or " \
"check if this is the first and only variable need grad, if so, please set its pre-Variable's " \
"stop_gradient=False, to make sure it has gradient " % self.name)
- if not self._ivar._grad_ivar().value().get_tensor()._is_initialized():
- raise ValueError(
- "%s's Grad is Empty, Please check if it has no data in" %
- self.name)
new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
if self._ivar._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
return (np.array(new_ivar.value().get_selected_rows().get_tensor()),
np.array(new_ivar.value().get_selected_rows().rows()))
else:
return np.array(new_ivar.value().get_tensor())
@dygraph_only
...
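A hedged usage sketch of the new return convention of Variable.gradient() (assuming a sparse Embedding as in the new tests below; "emb" is an illustrative name): a SelectedRows gradient comes back as a (values, rows) tuple, while a dense gradient is still a single ndarray.

```python
# Sketch: reading a sparse parameter's gradient in dygraph mode (assumes an
# Embedding built with is_sparse=True; "emb" is an illustrative name).
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable

with fluid.dygraph.guard():
    emb = fluid.dygraph.Embedding("emb", size=[20, 32], is_sparse=True)
    out = emb(to_variable(np.array([[[1], [2]], [[2], [1]]]).astype('int64')))
    out.backward()
    values, rows = emb._w.gradient()  # SelectedRows grad -> (values, rows)
    # a parameter with a dense (LoDTensor) gradient still returns one ndarray
```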
@@ -529,9 +529,11 @@ class Optimizer(object):
if not param.trainable:
continue
if param._ivar._grad_ivar() is not None:
ivar_type = param._ivar._grad_ivar().type
# create gradient variable
grad_var = Variable(
block=loss.block,
type=ivar_type,
name=param._ivar._grad_name(),
stop_gradient=True,
ivar=param._ivar._grad_ivar())
...
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
import numpy as np
import six
from utils import DyGraphProgramDescTracerTestHelper
class SimpleNet(fluid.Layer):
def __init__(self,
name_scope,
hidden_size,
vocab_size,
num_steps=20,
init_scale=0.1,
is_sparse=False,
dtype='float32'):
super(SimpleNet, self).__init__(name_scope)
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
self.num_steps = num_steps
self.embedding = Embedding(
self.full_name(),
size=[vocab_size, hidden_size],
dtype=dtype,
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
self.softmax_bias = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.vocab_size],
dtype=dtype,
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
def forward(self, input, label):
x_emb = self.embedding(input)
projection = fluid.layers.matmul(
x_emb, fluid.layers.transpose(
self.embedding._w, perm=[1, 0]))
projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
projection = fluid.layers.reshape(
projection, shape=[-1, self.vocab_size])
loss = fluid.layers.softmax_with_cross_entropy(
logits=projection, label=label, soft_label=False)
loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss)
loss.permissions = True
return loss
class TestDygraphSimpleNet(unittest.TestCase):
def test_simple_net(self):
for is_sparse in [True, False]:
for dtype in ["float32", "float64"]:
self.simple_net_float32(is_sparse, dtype)
def simple_net_float32(self, is_sparse, dtype):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
seed = 90
hidden_size = 10
vocab_size = 1000
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
for is_sort_sum_gradient in [True, False]:
with fluid.dygraph.guard(place):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
simple_net = SimpleNet(
"simple_net",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_steps=num_steps,
init_scale=init_scale,
is_sparse=is_sparse,
dtype=dtype)
sgd = SGDOptimizer(learning_rate=1e-3)
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
helper = DyGraphProgramDescTracerTestHelper(self)
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = is_sort_sum_gradient
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, 1))
x = to_variable(x_data)
y = to_variable(y_data)
outs = simple_net(x, y)
dy_loss = outs
if i == 0:
for param in simple_net.parameters():
dy_param_init[param.name] = param.numpy()
dy_loss.backward(backward_strategy)
sgd.minimize(dy_loss)
simple_net.clear_gradients()
if i == batch_num - 1:
for param in simple_net.parameters():
dy_param_updated[param.name] = param.numpy()
dy_loss_value = dy_loss.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
simple_net = SimpleNet(
"simple_net",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_steps=num_steps,
is_sparse=is_sparse,
dtype=dtype)
exe = fluid.Executor(place)
sgd = SGDOptimizer(learning_rate=1e-3)
x = fluid.layers.data(
name="x", shape=[-1, num_steps, 1], dtype='int64')
y = fluid.layers.data(name="y", shape=[-1, 1], dtype=dtype)
static_loss = simple_net(x, y)
sgd.minimize(static_loss)
static_param_updated = dict()
static_param_init = dict()
static_param_name_list = list()
for param in simple_net.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init[static_param_name_list[i]] = out[i]
static_loss_value = None
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, 1))
fetch_list = [static_loss]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed={"x": x_data,
"y": y_data},
fetch_list=fetch_list)
static_loss_value = out[0]
if i == batch_num - 1:
for k in range(3, len(out)):
static_param_updated[static_param_name_list[
k - 1]] = out[k]
self.assertTrue(
np.array_equal(static_loss_value, dy_loss_value))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(
np.array_equal(value, dy_param_updated[key]))
if __name__ == '__main__':
unittest.main()
@@ -141,6 +141,7 @@ class PtbModel(fluid.Layer):
num_layers=2,
num_steps=20,
init_scale=0.1,
is_sparse=False,
dropout=None):
super(PtbModel, self).__init__(name_scope)
self.hidden_size = hidden_size
@@ -160,7 +161,7 @@ class PtbModel(fluid.Layer):
self.full_name(),
size=[vocab_size, hidden_size],
dtype='float32',
- is_sparse=False,
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
@@ -212,7 +213,11 @@ class PtbModel(fluid.Layer):
class TestDygraphPtbRnn(unittest.TestCase):
- def test_ptb_rnn_cpu_float32(self):
def test_ptb_rnn(self):
for is_sparse in [True, False]:
self.ptb_rnn_cpu_float32(is_sparse)
def ptb_rnn_cpu_float32(self, is_sparse):
seed = 90
hidden_size = 10
vocab_size = 1000
@@ -233,7 +238,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
- init_scale=init_scale)
init_scale=init_scale,
is_sparse=is_sparse)
sgd = SGDOptimizer(learning_rate=1e-3)
dy_param_updated = dict()
@@ -300,7 +306,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
- init_scale=init_scale)
init_scale=init_scale,
is_sparse=is_sparse)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
...
@@ -28,7 +28,11 @@ import six
class TestDygraphPtbRnnSortGradient(unittest.TestCase):
- def test_ptb_rnn_sort_gradient_cpu_float32(self):
def test_ptb_rnn_sort_gradient(self):
for is_sparse in [True, False]:
self.ptb_rnn_sort_gradient_cpu_float32(is_sparse)
def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse):
seed = 90
hidden_size = 10
vocab_size = 1000
@@ -50,7 +54,8 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
- init_scale=init_scale)
init_scale=init_scale,
is_sparse=is_sparse)
sgd = SGDOptimizer(learning_rate=1e-3)
dy_param_updated = dict()
@@ -97,7 +102,8 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
- init_scale=init_scale)
init_scale=init_scale,
is_sparse=is_sparse)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
...
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import Embedding
from paddle.fluid.optimizer import SGDOptimizer
import numpy as np
import paddle.fluid.core as core
class SimpleNet(fluid.Layer):
def __init__(self, name_scope, vocab_size, hidden_size, dtype):
super(SimpleNet, self).__init__(name_scope)
self.emb = fluid.dygraph.Embedding(
self.full_name(),
size=[vocab_size, hidden_size],
dtype=dtype,
param_attr='emb.w',
is_sparse=True)
def forward(self, input):
input_emb = self.emb(input)
return input_emb, self.emb
class TestSimpleNet(unittest.TestCase):
def test_selectedrows_gradient1(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
for dtype in ["float32", "float64"]:
for sort_sum_gradient in [True, False]:
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
adam = SGDOptimizer(learning_rate=0.001)
# grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
input_word = np.array(
[[[1], [2]], [[2], [1]]]).astype('int64')
input = to_variable(input_word)
simplenet = SimpleNet("SimpleNet", 20, 32, dtype)
input_emb, emb = simplenet(input)
try:
emb._w.gradient()
except ValueError as e:
pass
try:
input_emb.gradient()
except ValueError as e:
pass
input_emb.backward(backward_strategy)
adam.minimize(input_emb) # grad_clip=grad_clip
emb._w.gradient()
emb.clear_gradients()
try:
emb._w.gradient()
except ValueError as e:
pass
input_emb.clear_gradient()
try:
input_emb.gradient()
except ValueError as e:
pass
def test_selectedrows_gradient2(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
for sort_sum_gradient in [True, False]:
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
adam = SGDOptimizer(learning_rate=0.001)
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
5.0)
input_word = np.array(
[[[1], [2]], [[2], [1]]]).astype('int64')
input = to_variable(input_word)
simplenet = SimpleNet("SimpleNet", 20, 32, "float32")
input_emb, emb = simplenet(input)
try:
emb._w.gradient()
except ValueError as e:
pass
try:
input_emb.gradient()
except ValueError as e:
pass
input_emb.backward(backward_strategy)
adam.minimize(input_emb, grad_clip=grad_clip)
emb._w.gradient()
emb.clear_gradients()
try:
emb._w.gradient()
except ValueError as e:
pass
input_emb.clear_gradient()
try:
input_emb.gradient()
except ValueError as e:
pass
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
import numpy as np
import six
from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
from paddle.fluid.dygraph.jit import TracedLayer
class SimpleNet(fluid.Layer):
def __init__(self,
name_scope,
hidden_size,
vocab_size,
num_steps=20,
init_scale=0.1,
is_sparse=False,
dtype='float32'):
super(SimpleNet, self).__init__(name_scope)
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
self.num_steps = num_steps
self.embedding = Embedding(
self.full_name(),
size=[vocab_size, hidden_size],
dtype=dtype,
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
self.softmax_weight = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.hidden_size, self.hidden_size],
dtype=dtype,
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
self.softmax_bias = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.hidden_size],
dtype=dtype,
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
def forward(self, input, label):
x_emb = self.embedding(input)
fc = fluid.layers.matmul(x_emb, self.softmax_weight)
fc = fluid.layers.elementwise_add(fc, self.softmax_bias)
projection = fluid.layers.matmul(
fc, fluid.layers.transpose(
self.embedding._w, perm=[1, 0]))
projection = fluid.layers.reshape(
projection, shape=[-1, self.vocab_size])
loss = fluid.layers.softmax_with_cross_entropy(
logits=projection, label=label, soft_label=False)
loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss)
loss.permissions = True
return loss
class TestDygraphSimpleNet(unittest.TestCase):
def test_simple_net(self):
for is_sparse in [True, False]:
for dtype in ["float32", "float64"]:
self.simple_net_float(is_sparse, dtype)
def simple_net_float(self, is_sparse, dtype):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
seed = 90
hidden_size = 10
vocab_size = 1000
num_steps = 3
init_scale = 0.1
batch_size = 4
batch_num = 200
for is_sort_sum_gradient in [True, False]:
traced_layer = None
with fluid.dygraph.guard(place):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
simple_net = SimpleNet(
"simple_net",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_steps=num_steps,
init_scale=init_scale,
is_sparse=is_sparse,
dtype=dtype)
sgd = SGDOptimizer(learning_rate=1e-3)
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
helper = DyGraphProgramDescTracerTestHelper(self)
program = None
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = is_sort_sum_gradient
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, 1))
x = to_variable(x_data)
y = to_variable(y_data)
outs = simple_net(x, y)
dy_loss = outs
if i == 0:
for param in simple_net.parameters():
dy_param_init[param.name] = param.numpy()
dy_loss.backward(backward_strategy)
sgd.minimize(dy_loss)
simple_net.clear_gradients()
if i == batch_num - 1:
for param in simple_net.parameters():
dy_param_updated[param.name] = param.numpy()
dy_loss_value = dy_loss.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
simple_net = SimpleNet(
"simple_net",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_steps=num_steps,
is_sparse=is_sparse,
dtype=dtype)
exe = fluid.Executor(place)
sgd = SGDOptimizer(learning_rate=1e-3)
x = fluid.layers.data(
name="x", shape=[-1, num_steps, 1], dtype='int64')
y = fluid.layers.data(name="y", shape=[-1, 1], dtype=dtype)
static_loss = simple_net(x, y)
sgd.minimize(static_loss)
static_param_updated = dict()
static_param_init = dict()
static_param_name_list = list()
for param in simple_net.parameters():
static_param_name_list.append(param.name)
out = exe.run(framework.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init[static_param_name_list[i]] = out[i]
static_loss_value = None
for i in range(batch_num):
x_data = np.arange(12).reshape(4, 3).astype('int64')
y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, 1))
fetch_list = [static_loss]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed={"x": x_data,
"y": y_data},
fetch_list=fetch_list)
static_loss_value = out[0]
if i == batch_num - 1:
for k in range(3, len(out)):
static_param_updated[static_param_name_list[
k - 1]] = out[k]
self.assertTrue(
np.array_equal(static_loss_value, dy_loss_value))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(
np.array_equal(value, dy_param_updated[key]))
if __name__ == '__main__':
unittest.main()
@@ -586,6 +586,7 @@ class PrepareEncoderDecoderLayer(Layer):
src_emb_dim,
src_max_len,
dropout_rate,
is_sparse=False,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
@@ -596,6 +597,7 @@ class PrepareEncoderDecoderLayer(Layer):
self._input_emb = Embedding(
name_scope=self.full_name(),
size=[src_vocab_size, src_emb_dim],
is_sparse=is_sparse,
padding_idx=0,
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
@@ -608,6 +610,7 @@ class PrepareEncoderDecoderLayer(Layer):
self._pos_emb = Embedding(
name_scope=self.full_name(),
size=[self._src_max_len, src_emb_dim],
is_sparse=is_sparse,
param_attr=fluid.ParamAttr(
name=pos_enc_param_name,
initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
@@ -633,10 +636,23 @@ class PrepareEncoderDecoderLayer(Layer):
class WrapEncoderLayer(Layer):
- def __init__(self, name_cope, src_vocab_size, max_length, n_layer, n_head,
- d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
- attention_dropout, relu_dropout, preprocess_cmd,
- postprocess_cmd, weight_sharing):
def __init__(self,
name_cope,
src_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
is_sparse=False):
""" """
The wrapper assembles together all needed layers for the encoder. The wrapper assembles together all needed layers for the encoder.
""" """
...@@ -648,6 +664,7 @@ class WrapEncoderLayer(Layer): ...@@ -648,6 +664,7 @@ class WrapEncoderLayer(Layer):
d_model, d_model,
max_length, max_length,
prepostprocess_dropout, prepostprocess_dropout,
is_sparse=is_sparse,
word_emb_param_name=word_emb_param_names[0], word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0]) pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer( self._encoder = EncoderLayer(
...@@ -814,7 +831,8 @@ class WrapDecoderLayer(Layer): ...@@ -814,7 +831,8 @@ class WrapDecoderLayer(Layer):
postprocess_cmd, postprocess_cmd,
weight_sharing, weight_sharing,
caches=None, caches=None,
gather_idx=None): gather_idx=None,
is_sparse=False):
""" """
The wrapper assembles together all needed layers for the encoder. The wrapper assembles together all needed layers for the encoder.
""" """
...@@ -826,6 +844,7 @@ class WrapDecoderLayer(Layer): ...@@ -826,6 +844,7 @@ class WrapDecoderLayer(Layer):
d_model, d_model,
max_length, max_length,
prepostprocess_dropout, prepostprocess_dropout,
is_sparse=is_sparse,
word_emb_param_name=word_emb_param_names[1], word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1]) pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer( self._decoder_layer = DecoderLayer(
...@@ -893,7 +912,8 @@ class TransFormer(Layer): ...@@ -893,7 +912,8 @@ class TransFormer(Layer):
weight_sharing, weight_sharing,
label_smooth_eps, label_smooth_eps,
use_py_reader=False, use_py_reader=False,
is_test=False): is_test=False,
is_sparse=False):
super(TransFormer, self).__init__(name_scope) super(TransFormer, self).__init__(name_scope)
self._label_smooth_eps = label_smooth_eps self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size self._trg_vocab_size = trg_vocab_size
...@@ -902,15 +922,39 @@ class TransFormer(Layer): ...@@ -902,15 +922,39 @@ class TransFormer(Layer):
"Vocabularies in source and target should be same for weight sharing." "Vocabularies in source and target should be same for weight sharing."
) )
self._wrap_encoder_layer = WrapEncoderLayer( self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(), src_vocab_size, max_length, n_layer, n_head, self.full_name(),
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, src_vocab_size,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, max_length,
weight_sharing) n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
is_sparse=is_sparse)
self._wrap_decoder_layer = WrapDecoderLayer(
- self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
- d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
- attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
- weight_sharing)
self.full_name(),
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
is_sparse=is_sparse)
if weight_sharing:
self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
@@ -937,7 +981,11 @@ class TransFormer(Layer):
class TestDygraphTransformerSortGradient(unittest.TestCase):
- def test_transformer_sort_gradient_float32(self):
def test_transformer_sort_gradient(self):
for is_sparse in [True, False]:
self.transformer_sort_gradient_float32(is_sparse)
def transformer_sort_gradient_float32(self, is_sparse):
seed = 90
with guard():
@@ -964,7 +1012,8 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
- is_test=False)
is_test=False,
is_sparse=is_sparse)
if sync:
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
@@ -1045,7 +1094,8 @@ class TestDygraphTransformerSortGradient(unittest.TestCase):
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
- is_test=False)
is_test=False,
is_sparse=is_sparse)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
...