未验证 提交 93606c2c 编写于 作者: Z Zeng Jinle 提交者: GitHub

Merge pull request #13689 from sneaxiy/sparse_rmsprop

Fix sparse rmsprop
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
...@@ -199,23 +200,9 @@ struct SparseAdamFunctor { ...@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
row_numel_(row_numel), row_numel_(row_numel),
row_count_(row_count) {} row_count_(row_count) {}
inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
int64_t beg = 0, end = row_count_ - 1;
while (beg <= end) {
auto mid = ((beg + end) >> 1);
if (rows_[mid] == row)
return mid;
else if (rows_[mid] < row)
beg = mid + 1;
else
end = mid - 1;
}
return -1;
}
inline HOSTDEVICE void operator()(size_t i) const { inline HOSTDEVICE void operator()(size_t i) const {
int64_t row = i / row_numel_; auto row_idx =
auto row_idx = BinarySearchInRows(row); math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
// The following code is the same as dense // The following code is the same as dense
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cstdint> // for int64_t
#include <numeric>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
// Searches x[0 .. num-1] for an element equal to `val` by repeatedly halving
// a closed index interval.
//
// The halving logic only finds `val` reliably when `x` is ordered ascending
// with respect to operator<. Returns the index of a matching element, or -1
// when no probe matches (including when num <= 0, in which case the loop body
// never runs).
template <typename T>
HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
  // [lo, hi] is the closed range still under consideration.
  for (int64_t lo = 0, hi = num - 1; lo <= hi;) {
    const int64_t mid = (lo + hi) >> 1;
    const T &probe = x[mid];
    if (probe == val) {
      return mid;
    }
    if (probe < val) {
      // Everything at or below mid is too small.
      lo = mid + 1;
    } else {
      // Everything at or above mid is too large.
      hi = mid - 1;
    }
  }
  return -1;
}
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <map>
#include <set> #include <set>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle { namespace paddle {
...@@ -245,40 +247,42 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -245,40 +247,42 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output) { framework::SelectedRows* output) {
framework::SelectedRows& out = *output; framework::SelectedRows& out = *output;
auto input_rows = input.rows(); std::vector<int64_t> input_rows(input.rows());
std::vector<int64_t> merge_rows;
merge_rows.reserve(input_rows.size()); std::map<int64_t, std::vector<int64_t>> merge_row_map;
std::unordered_map<int64_t, size_t> rows_pos_map; for (size_t i = 0; i < input_rows.size(); ++i) {
rows_pos_map.reserve(input_rows.size()); merge_row_map[input_rows[i]].push_back(i);
size_t idx = 0u;
for (std::vector<int64_t>::iterator iter = input_rows.begin();
iter != input_rows.end(); ++iter) {
if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
rows_pos_map[*iter] = idx++;
merge_rows.emplace_back(*iter);
}
} }
auto input_width = input.value().dims()[1]; std::vector<int64_t> merge_rows(merge_row_map.size());
out.set_rows(merge_rows); size_t idx = 0;
int64_t input_width = input.value().dims()[1];
out.set_height(input.height()); out.set_height(input.height());
out.mutable_value()->mutable_data<T>(
T* out_data = out.mutable_value()->mutable_data<T>(
framework::make_ddim( framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}), {static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace()); context.GetPlace());
const T* in_data = input.value().data<T>();
math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), 0.0); for (auto& row_pair : merge_row_map) {
auto* out_ptr = out_data + idx * input_width;
auto* out_data = out.mutable_value()->data<T>(); auto& rows = row_pair.second;
auto* input_data = input.value().data<T>(); merge_rows[idx] = row_pair.first;
++idx;
for (size_t i = 0; i < input_rows.size(); i++) { // rows.size() is always larger than 0
size_t out_i = rows_pos_map[input_rows[i]]; std::memcpy(out_ptr, in_data + rows[0] * input_width,
for (int64_t j = 0; j < input_width; j++) { sizeof(T) * input_width);
out_data[out_i * input_width + j] += input_data[i * input_width + j];
for (size_t i = 1; i < rows.size(); ++i) {
auto* in_ptr = in_data + rows[i] * input_width;
for (int64_t j = 0; j < input_width; ++j) {
out_ptr[j] += in_ptr[j];
}
} }
} }
out.set_rows(merge_rows);
} }
}; };
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
...@@ -97,41 +98,39 @@ struct MergeAdd<platform::CPUDeviceContext, float> { ...@@ -97,41 +98,39 @@ struct MergeAdd<platform::CPUDeviceContext, float> {
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output) { framework::SelectedRows* output) {
framework::SelectedRows& out = *output; framework::SelectedRows& out = *output;
auto input_rows = input.rows(); std::vector<int64_t> input_rows(input.rows());
std::vector<int64_t> merge_rows;
merge_rows.reserve(input_rows.size()); std::map<int64_t, std::vector<int64_t>> merge_row_map;
std::unordered_map<int64_t, size_t> rows_pos_map; for (size_t i = 0; i < input_rows.size(); ++i) {
rows_pos_map.reserve(input_rows.size()); merge_row_map[input_rows[i]].push_back(i);
size_t idx = 0u;
for (std::vector<int64_t>::iterator iter = input_rows.begin();
iter != input_rows.end(); ++iter) {
if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
rows_pos_map[*iter] = idx++;
merge_rows.emplace_back(*iter);
}
} }
auto input_width = input.value().dims()[1]; std::vector<int64_t> merge_rows(merge_row_map.size());
out.set_rows(merge_rows); size_t idx = 0;
int64_t input_width = input.value().dims()[1];
out.set_height(input.height()); out.set_height(input.height());
out.mutable_value()->mutable_data<float>(
auto* out_data = out.mutable_value()->mutable_data<float>(
framework::make_ddim( framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}), {static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace()); context.GetPlace());
auto* in_data = input.value().data<float>();
math::SetConstant<platform::CPUDeviceContext, float> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<float>();
auto* input_data = input.value().data<float>();
auto blas = GetBlas<platform::CPUDeviceContext, float>(context); auto blas = GetBlas<platform::CPUDeviceContext, float>(context);
for (size_t i = 0; i < input_rows.size(); i++) { for (auto& row_pair : merge_row_map) {
size_t out_i = rows_pos_map[input_rows[i]]; auto* out_ptr = out_data + idx * input_width;
float* y = out_data + out_i * input_width; auto& rows = row_pair.second;
const float* x = input_data + i * input_width; merge_rows[idx] = row_pair.first;
blas.AXPY(input_width, 1., x, y); ++idx;
// rows.size() is always larger than 0
blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr);
for (size_t i = 1; i < rows.size(); ++i) {
blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr);
}
} }
out.set_rows(merge_rows);
} }
}; };
...@@ -148,41 +147,39 @@ struct MergeAdd<platform::CPUDeviceContext, double> { ...@@ -148,41 +147,39 @@ struct MergeAdd<platform::CPUDeviceContext, double> {
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output) { framework::SelectedRows* output) {
framework::SelectedRows& out = *output; framework::SelectedRows& out = *output;
auto input_rows = input.rows(); std::vector<int64_t> input_rows(input.rows());
std::vector<int64_t> merge_rows;
merge_rows.reserve(input_rows.size()); std::map<int64_t, std::vector<int64_t>> merge_row_map;
std::unordered_map<int64_t, size_t> rows_pos_map; for (size_t i = 0; i < input_rows.size(); ++i) {
rows_pos_map.reserve(input_rows.size()); merge_row_map[input_rows[i]].push_back(i);
size_t idx = 0u;
for (std::vector<int64_t>::iterator iter = input_rows.begin();
iter != input_rows.end(); ++iter) {
if (rows_pos_map.find(*iter) == rows_pos_map.end()) {
rows_pos_map[*iter] = idx++;
merge_rows.emplace_back(*iter);
}
} }
auto input_width = input.value().dims()[1]; std::vector<int64_t> merge_rows(merge_row_map.size());
out.set_rows(merge_rows); size_t idx = 0;
int64_t input_width = input.value().dims()[1];
out.set_height(input.height()); out.set_height(input.height());
out.mutable_value()->mutable_data<double>(
auto* out_data = out.mutable_value()->mutable_data<double>(
framework::make_ddim( framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}), {static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace()); context.GetPlace());
auto* in_data = input.value().data<double>();
math::SetConstant<platform::CPUDeviceContext, double> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<double>();
auto* input_data = input.value().data<double>();
auto blas = GetBlas<platform::CPUDeviceContext, double>(context); auto blas = GetBlas<platform::CPUDeviceContext, double>(context);
for (size_t i = 0; i < input_rows.size(); i++) { for (auto& row_pair : merge_row_map) {
size_t out_i = rows_pos_map[input_rows[i]]; auto* out_ptr = out_data + idx * input_width;
double* y = out_data + out_i * input_width; auto& rows = row_pair.second;
const double* x = input_data + i * input_width; merge_rows[idx] = row_pair.first;
blas.AXPY(input_width, 1., x, y); ++idx;
// rows.size() is always larger than 0
blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr);
for (size_t i = 1; i < rows.size(); ++i) {
blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr);
}
} }
out.set_rows(merge_rows);
} }
}; };
......
...@@ -13,72 +13,254 @@ See the License for the specific language governing permissions and ...@@ -13,72 +13,254 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>; using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
// Gradient accessor for a dense gradient buffer: element `idx` of the update
// reads element `idx` of the gradient directly.
template <typename T>
struct DenseRmspropGradFunctor {
  // Stores the raw pointer only; nothing in this struct copies or frees it.
  explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}

  // Returns grad_[idx]. No bounds checking is performed.
  HOSTDEVICE inline T operator()(int64_t idx) const {
    const T value = grad_[idx];
    return value;
  }

  const T *grad_;
};
// Gradient accessor for a row-sparse gradient.
//
// `rows_` lists the row ids that are present (row_count_ entries) and `grad_`
// stores row_numel_ values for each listed row, in the same order. A flat
// element index is split into (row, column); the row is looked up in `rows_`
// via math::BinarySearch, and rows that are not found contribute 0.
template <typename T>
struct SparseRmspropGradFunctor {
  inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows,
                                  int64_t row_numel, int64_t row_count)
      : grad_(grad),
        rows_(rows),
        row_numel_(row_numel),
        row_count_(row_count) {}

  // Returns the gradient value for flat index `idx`, or 0 when the row that
  // contains `idx` is absent from `rows_`.
  HOSTDEVICE inline T operator()(int64_t idx) const {
    const int64_t wanted_row = idx / row_numel_;
    const int64_t column = idx % row_numel_;
    // slot is the position of wanted_row inside rows_, or -1 if not found.
    const auto slot = math::BinarySearch(rows_, row_count_, wanted_row);
    return slot >= 0 ? grad_[slot * row_numel_ + column] : 0;
  }

  const T *grad_;
  const int64_t *rows_;
  int64_t row_numel_;
  int64_t row_count_;
};
// Element-wise RMSProp step without mean-gradient centering.
//
// For each index the functor reads the gradient through `grad_functor_`,
// then updates the mean-square accumulator, the momentum buffer and the
// parameter in place:
//   ms  = rho * ms + (1 - rho) * g^2
//   mom = momentum * mom + lr * g / sqrt(ms + epsilon)
//   p   = p - mom
// The learning rate is read from lr_[0] on every call.
template <typename T, typename GradFunctor>
struct UncenteredRmspropFunctor {
  UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho,
                           T epsilon, T momentum,
                           const GradFunctor &grad_functor)
      : param_(param),
        ms_(ms),
        mom_(mom),
        lr_(lr),
        rho_(rho),
        epsilon_(epsilon),
        momentum_(momentum),
        grad_functor_(grad_functor) {}

  // Applies the update to element `idx` of param_, ms_ and mom_.
  HOSTDEVICE inline void operator()(int64_t idx) const {
    const T grad = grad_functor_(idx);
    const T mean_square = rho_ * ms_[idx] + (1 - rho_) * grad * grad;
    const T velocity =
        momentum_ * mom_[idx] + lr_[0] * grad / sqrt(mean_square + epsilon_);
    param_[idx] -= velocity;
    ms_[idx] = mean_square;
    mom_[idx] = velocity;
  }

  T *param_;
  T *ms_;
  T *mom_;
  const T *lr_;
  T rho_;
  T epsilon_;
  T momentum_;
  GradFunctor grad_functor_;
};
// Element-wise RMSProp step with mean-gradient centering.
//
// For each index the functor reads the gradient through `grad_functor_`,
// then updates the mean-square accumulator, the running mean gradient, the
// momentum buffer and the parameter in place:
//   ms  = rho * ms + (1 - rho) * g^2
//   mg  = rho * mg + (1 - rho) * g
//   mom = momentum * mom + lr * g / sqrt(ms - mg^2 + epsilon)
//   p   = p - mom
// The learning rate is read from lr_[0] on every call.
template <typename T, typename GradFunctor>
struct CenteredRmspropFunctor {
  CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr,
                         T rho, T epsilon, T momentum,
                         const GradFunctor &grad_functor)
      : param_(param),
        ms_(ms),
        mom_(mom),
        mean_grad_(mean_grad),
        lr_(lr),
        rho_(rho),
        epsilon_(epsilon),
        momentum_(momentum),
        grad_functor_(grad_functor) {}

  // Applies the update to element `idx` of param_, ms_, mom_ and mean_grad_.
  HOSTDEVICE inline void operator()(int64_t idx) const {
    const T grad = grad_functor_(idx);
    const T mean_square = rho_ * ms_[idx] + (1 - rho_) * grad * grad;
    const T grad_mean = rho_ * mean_grad_[idx] + (1 - rho_) * grad;
    // The denominator uses the centered second moment: ms - mg^2 + epsilon.
    const T velocity =
        momentum_ * mom_[idx] +
        lr_[0] * grad / sqrt(mean_square - grad_mean * grad_mean + epsilon_);
    param_[idx] -= velocity;
    ms_[idx] = mean_square;
    mom_[idx] = velocity;
    mean_grad_[idx] = grad_mean;
  }

  T *param_;
  T *ms_;
  T *mom_;
  T *mean_grad_;
  const T *lr_;
  T rho_;
  T epsilon_;
  T momentum_;
  GradFunctor grad_functor_;
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class RmspropOpKernel : public framework::OpKernel<T> { class RmspropOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
const auto* param_var = ctx.InputVar("Param"); using LoDTensor = framework::LoDTensor;
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(), auto *grad_var = ctx.InputVar("Grad");
"The Var(%s)'s type should be LoDTensor, " auto *param_out = ctx.Output<LoDTensor>("ParamOut");
"but the received is %s", auto *moment_out = ctx.Output<LoDTensor>("MomentOut");
ctx.Inputs("Param").front(), param_var->Type().name()); auto *mean_square_out = ctx.Output<LoDTensor>("MeanSquareOut");
auto* param_out = ctx.Output<Tensor>("ParamOut"); auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto* moment_out = ctx.Output<Tensor>("MomentOut"); auto rho = static_cast<T>(ctx.Attr<float>("decay"));
auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut"); auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
auto grad = ctx.Input<Tensor>("Grad");
param_out->mutable_data<T>(ctx.GetPlace());
moment_out->mutable_data<T>(ctx.GetPlace());
mean_square_out->mutable_data<T>(ctx.GetPlace());
float epsilon = ctx.Attr<float>("epsilon");
float rho = ctx.Attr<float>("decay");
float momentum = ctx.Attr<float>("momentum");
bool centered = ctx.Attr<bool>("centered"); bool centered = ctx.Attr<bool>("centered");
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param")); auto &p_tensor = *ctx.Input<LoDTensor>("Param");
auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare")); auto &ms_tensor = *ctx.Input<LoDTensor>("MeanSquare");
auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate")); auto &lr_tensor = *ctx.Input<LoDTensor>("LearningRate");
auto g = EigenVector<T>::Flatten(*grad); auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
PADDLE_ENFORCE_EQ(&p_tensor, param_out,
auto p_out = EigenVector<T>::Flatten(*param_out); "Param and ParamOut must be the same Tensor");
auto mom_out = EigenVector<T>::Flatten(*moment_out); PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
auto ms_out = EigenVector<T>::Flatten(*mean_square_out); "Moment and MomentOut must be the same Tensor");
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
"MeanSquare and MeanSquareOut must be the same Tensor");
Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
auto &dev_ctx = ctx.template device_context<DeviceContext>();
ms_out.device(place) = rho * ms + (1 - rho) * g * g; size_t limit = static_cast<size_t>(ms_tensor.numel());
if (centered) {
auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad")); if (grad_var->IsType<LoDTensor>()) {
auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut"); auto &grad_tensor = grad_var->Get<LoDTensor>();
mean_grad_out->mutable_data<T>(ctx.GetPlace());
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out); if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value) {
auto &place =
mg_out.device(place) = rho * mg + (1 - rho) * g; *ctx.template device_context<DeviceContext>().eigen_device();
mom_out.device(place) = momentum * mom + auto lr_value = lr_tensor.data<T>()[0];
lr.broadcast(grad_dsize) * g /
(ms_out - mg_out.square() + epsilon).sqrt(); auto p = EigenVector<T>::Flatten(p_tensor);
auto ms = EigenVector<T>::Flatten(ms_tensor);
auto g = EigenVector<T>::Flatten(grad_tensor);
auto mom = EigenVector<T>::Flatten(mom_tensor);
auto p_out = EigenVector<T>::Flatten(*param_out);
auto mom_out = EigenVector<T>::Flatten(*moment_out);
auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto mg = EigenVector<T>::Flatten(mg_tensor);
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) =
momentum * mom +
lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
}
p_out.device(place) = p - mom_out;
} else {
DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()),
mean_grad_out->mutable_data<T>(ctx.GetPlace()),
lr_tensor.data<T>(), rho, epsilon, momentum, grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
}
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto &grad = grad_var->Get<framework::SelectedRows>();
auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
.Var()
->GetMutable<framework::SelectedRows>();
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(dev_ctx, grad, merged_grad);
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
const int64_t *rows;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = merged_grad->rows().CUDAData(ctx.GetPlace());
} else {
#endif
rows = merged_grad->rows().data();
#ifdef PADDLE_WITH_CUDA
}
#endif
auto &merged_tensor = merged_grad->value();
int64_t row_count = merged_grad->rows().size();
int64_t row_numel = merged_tensor.numel() / row_count;
SparseRmspropGradFunctor<T> grad_func(merged_tensor.data<T>(), rows,
row_numel, row_count);
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()),
mean_grad_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
}
} else { } else {
mom_out.device(place) = PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
momentum * mom +
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
} }
p_out.device(place) = p - mom_out;
} }
}; };
......
...@@ -19,33 +19,76 @@ import unittest ...@@ -19,33 +19,76 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.op import Operator from paddle.fluid.op import Operator
import paddle.fluid as fluid
def create_selected_rows_and_tensor(scope, place, height, row_num,
                                    embedding_size):
    """Create a random sparse gradient and its dense equivalent in `scope`.

    Fills the scope variable "@selected_rows@" with `row_num` random rows
    (row ids drawn from [0, height - 1], duplicates allowed) of width
    `embedding_size`, and fills the scope variable "grad" with a dense
    (height, embedding_size) float32 array in which rows sharing the same id
    have been summed.

    Args:
        scope: object providing `var(name)`, whose result exposes
            `get_selected_rows()` / `get_tensor()`.
        place: device placement forwarded unchanged to every `set` call.
        height: number of rows in the dense tensor.
        row_num: number of (possibly repeated) rows in the sparse gradient.
        embedding_size: number of columns in each row.

    Returns:
        A tuple `(tensor_val, sr_val)` of numpy float32 arrays: the dense
        (height, embedding_size) gradient and the (row_num, embedding_size)
        sparse row values.
    """
    sr = scope.var("@selected_rows@").get_selected_rows()
    tensor = scope.var("grad").get_tensor()

    # np.random.random_integers is deprecated; randint's upper bound is
    # exclusive, so high=height draws from the same range [0, height - 1].
    rows = np.random.randint(
        low=0, high=height, size=[row_num, ]).astype('int64')
    sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32')

    sr.set_height(height)
    sr.set_rows(rows)
    sr.get_tensor().set(sr_val, place)

    tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32')
    # Unbuffered add: repeated row ids accumulate instead of overwriting.
    np.add.at(tensor_val, rows, sr_val)
    tensor.set(tensor_val, place)
    return tensor_val, sr_val
class TestBase(unittest.TestCase): class TestBase(unittest.TestCase):
def setup(self, centered, epsilon=1e-6): def setup(self,
place,
is_sparse,
centered,
size,
row_num=None,
epsilon=1e-6):
np.random.seed(5) # fix seed np.random.seed(5) # fix seed
self.scope = fluid.global_scope()
self.place = place
self.param_name = "param" self.param_name = "param"
self.param = np.random.random((123, 321)).astype("float32") self.param = np.random.random(size).astype("float32")
self.mean_square_name = "mean_square" self.mean_square_name = "mean_square"
self.mean_square = np.random.random((123, 321)).astype("float32") self.mean_square = np.random.uniform(
low=1, high=2, size=size).astype("float32")
self.mean_grad_name = "mean_grad" self.mean_grad_name = "mean_grad"
self.mean_grad = np.random.random((123, 321)).astype("float32") self.mean_grad = np.random.random(size).astype("float32")
self.lr_name = "lr" self.lr_name = "lr"
self.learning_rate = np.array([0.01]).astype("float32") self.learning_rate = np.array([0.01]).astype("float32")
self.grad_name = "grad" self.grad_name = "grad"
self.grad = np.random.random((123, 321)).astype("float32")
self.is_sparse = is_sparse
if self.is_sparse:
self.grad_sr_name = "@selected_rows@"
self.grad, self.grad_sr = create_selected_rows_and_tensor(
self.scope, place, size[0], row_num, size[1])
else:
self.grad = np.random.random(size).astype("float32")
grad_tensor = self.scope.var(self.grad_name).get_tensor()
grad_tensor.set(self.grad, place)
self.moment_name = "moment" self.moment_name = "moment"
self.moment = np.zeros((123, 321)).astype("float32") self.moment = np.random.uniform(
low=0, high=1, size=size).astype("float32")
self.epsilon = epsilon self.epsilon = epsilon
self.decay = 0.9 self.decay = 0.9
self.momentum = 0.0 self.momentum = 0.1
self.centered = centered self.centered = centered
self.ms_out = self.decay * self.mean_square + (1 - self.decay self.ms_out = self.decay * self.mean_square + (1 - self.decay
...@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase): ...@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase):
self.param_out = self.param - self.moment_out self.param_out = self.param - self.moment_out
def check(self,
actual_t,
expect_t,
place,
out_name,
atol=1e-5,
equal_nan=False):
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+ str(expect_t) + "\n" + "But Got" + str(actual_t))
class TestRmspropOp(TestBase):
def check_with_place(self, place, centered, epsilon):
self.setup(centered, epsilon)
scope = core.Scope()
# create and initialize Param Variable # create and initialize Param Variable
param = scope.var(self.param_name).get_tensor() self.param_tensor = self.scope.var(self.param_name).get_tensor()
param.set(self.param, place) self.param_tensor.set(self.param, place)
mean_square = scope.var(self.mean_square_name).get_tensor() self.mean_square_tensor = self.scope.var(
mean_square.set(self.mean_square, place) self.mean_square_name).get_tensor()
self.mean_square_tensor.set(self.mean_square, place)
lr = scope.var(self.lr_name).get_tensor() lr = self.scope.var(self.lr_name).get_tensor()
lr.set(self.learning_rate, place) lr.set(self.learning_rate, place)
grad = scope.var(self.grad_name).get_tensor() self.moment_tensor = self.scope.var(self.moment_name).get_tensor()
grad.set(self.grad, place) self.moment_tensor.set(self.moment, place)
moment = scope.var(self.moment_name).get_tensor() if self.centered:
moment.set(self.moment, place) self.mean_grad_tensor = self.scope.var(
self.mean_grad_name).get_tensor()
self.mean_grad_tensor.set(self.mean_grad, place)
# create and run sgd operator def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol),
"Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+ str(expect_t) + "\n" + "But Got" + str(actual_t))
if self.centered:
mean_grad = scope.var(self.mean_grad_name).get_tensor() class TestRmspropOp(TestBase):
mean_grad.set(self.mean_grad, place) def check_with_place(self,
place,
rmsprop_op = Operator( is_sparse,
"rmsprop", centered,
Param=self.param_name, size,
Grad=self.grad_name, row_num=None,
MeanSquare=self.mean_square_name, epsilon=1e-6):
MeanGrad=self.mean_grad_name, self.setup(place, is_sparse, centered, size, row_num, epsilon)
Moment=self.moment_name, self.run_and_check()
LearningRate=self.lr_name,
ParamOut=self.param_name, def run_and_check(self):
MeanSquareOut=self.mean_square_name, grad_name = self.grad_sr_name if self.is_sparse else self.grad_name
MomentOut=self.moment_name,
MeanGradOut=self.mean_grad_name, kwargs = {
epsilon=self.epsilon, 'Param': self.param_name,
decay=self.decay, 'Grad': grad_name,
momentum=self.momentum, 'MeanSquare': self.mean_square_name,
centered=True) 'Moment': self.moment_name,
else: 'LearningRate': self.lr_name,
rmsprop_op = Operator( 'ParamOut': self.param_name,
"rmsprop", 'MeanSquareOut': self.mean_square_name,
Param=self.param_name, 'MomentOut': self.moment_name,
Grad=self.grad_name, 'epsilon': self.epsilon,
MeanSquare=self.mean_square_name, 'decay': self.decay,
Moment=self.moment_name, 'momentum': self.momentum,
LearningRate=self.lr_name, 'centered': self.centered
ParamOut=self.param_name, }
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=False)
rmsprop_op.run(scope, place)
atol = 1e-5
equal_nan = False
if self.centered: if self.centered:
atol = 1e-3 kwargs['MeanGrad'] = self.mean_grad_name
equal_nan = True kwargs['MeanGradOut'] = self.mean_grad_name
rmsprop_op = Operator('rmsprop', **kwargs)
atol = 1e-6
rmsprop_op.run(self.scope, self.place)
self.check( self.check(
np.array(mean_square), self.ms_out, place, self.mean_square_name) np.array(self.mean_square_tensor),
self.ms_out,
self.place,
self.mean_square_name,
atol=atol)
self.check( self.check(
np.array(moment), np.array(self.moment_tensor),
self.moment_out, self.moment_out,
place, self.place,
self.moment_name, self.moment_name,
atol=atol, atol=atol)
equal_nan=equal_nan)
self.check( self.check(
np.array(param), np.array(self.param_tensor),
self.param_out, self.param_out,
place, self.place,
self.param_name, self.param_name,
atol=atol, atol=atol)
equal_nan=equal_nan)
if self.centered: if self.centered:
self.check( self.check(
np.array(mean_grad), self.mg_out, place, self.mean_grad_name) np.array(self.mean_grad_tensor), self.mg_out, self.place,
self.mean_grad_name)
def test_rmsprop(self): def test_rmsprop(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
size = (128, 320)
for place in places: for place in places:
self.check_with_place(place, False, 1e-6) for centered in [False, True]:
self.check_with_place(place, False, 1e-10) with fluid.scope_guard(core.Scope()):
self.check_with_place(place, True, 1e-6) self.check_with_place(
self.check_with_place(place, True, 1e-10) place, is_sparse=False, centered=centered, size=size)
with fluid.scope_guard(core.Scope()):
self.check_with_place(
place,
is_sparse=True,
centered=centered,
row_num=512,
size=size)
with fluid.scope_guard(core.Scope()):
self.check_with_place(
place,
is_sparse=True,
centered=centered,
row_num=60,
size=size)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册