提交 b4a32eaf 编写于 作者: Q Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize-sum-seq-pooling-op

test=develop
...@@ -18,7 +18,7 @@ function(copy TARGET) ...@@ -18,7 +18,7 @@ function(copy TARGET)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS) set(multiValueArgs SRCS DSTS DEPS)
cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE) set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
...@@ -185,7 +185,8 @@ copy(cmake_cache ...@@ -185,7 +185,8 @@ copy(cmake_cache
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR}) DSTS ${FLUID_INSTALL_DIR})
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) # This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# paddle fluid version # paddle fluid version
execute_process( execute_process(
......
...@@ -127,6 +127,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None ...@@ -127,6 +127,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)) paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
......
...@@ -12,6 +12,5 @@ endif(NOT WIN32) ...@@ -12,6 +12,5 @@ endif(NOT WIN32)
if(WITH_INFERENCE) if(WITH_INFERENCE)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
add_subdirectory(inference) add_subdirectory(inference)
add_subdirectory(train)
endif() endif()
add_subdirectory(train)
...@@ -64,7 +64,8 @@ class OpHandleBase { ...@@ -64,7 +64,8 @@ class OpHandleBase {
virtual bool IsMultiDeviceTransfer() { return false; } virtual bool IsMultiDeviceTransfer() { return false; }
const platform::DeviceContext *DeviceContext(platform::Place place) { const platform::DeviceContext *DeviceContext(platform::Place place) {
return dev_ctxes_[place]; auto it = dev_ctxes_.find(place);
return it != dev_ctxes_.end() ? it->second : nullptr;
} }
void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
......
...@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { ...@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext"; VLOG(5) << "destroy ExecutorPrepareContext";
} }
template <typename RefCntMap>
static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
GarbageCollector<Tensor>* gc,
RefCntMap* ref_cnts) {
std::unordered_set<Tensor*> erase_tensors;
auto handler = [&](const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto it = ref_cnts->find(name);
if (it == ref_cnts->end()) continue;
if ((it->second)-- == 1) {
auto* var = scope.FindVar(name);
if (var != nullptr) {
VLOG(10) << "Erase tensor \'" << name << "\'";
if (var->IsType<LoDTensor>()) {
erase_tensors.insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>()) {
erase_tensors.insert(
var->GetMutable<SelectedRows>()->mutable_value());
}
}
}
}
}
};
handler(op->Inputs());
handler(op->Outputs());
if (!erase_tensors.empty()) {
gc->Add(erase_tensors);
}
}
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() { void Executor::Close() {
...@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
int64_t max_memory_size = GetEagerDeletionThreshold(); int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector<Tensor>> gc; std::unique_ptr<GarbageCollector<Tensor>> gc;
if (max_memory_size >= 0) { // WhileOp would set keep_kids to false
// WhileGradOp would need the scopes created in WhileOp
// Perhaps, we should not perform eager deletion in WhileOp
// The scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
if (max_memory_size >= 0 && !keep_kids) {
ctx->ResetReferenceCount(); ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
...@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
if (gc != nullptr) { if (gc != nullptr) {
std::vector<std::string> erase_vars; DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
for (auto& input : op->Inputs()) { &(ctx->cur_ref_cnts_));
for (auto& input_name : input.second) {
auto it = ctx->cur_ref_cnts_.find(input_name);
if (it == ctx->cur_ref_cnts_.end()) continue;
if (it->second == 1) { // should delete it
erase_vars.emplace_back(input_name);
ctx->cur_ref_cnts_.erase(input_name);
} else {
--(it->second);
}
}
}
for (auto& output : op->Outputs()) {
for (auto& output_name : output.second) {
auto it = ctx->cur_ref_cnts_.find(output_name);
if (it == ctx->cur_ref_cnts_.end()) continue;
if (it->second == 1) {
erase_vars.emplace_back(output_name);
ctx->cur_ref_cnts_.erase(output_name);
} else {
--(it->second);
}
}
}
if (!erase_vars.empty()) {
std::vector<framework::LoDTensor*> erase_tensors;
for (auto& name : erase_vars) {
auto* var = local_scope->FindVar(name);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
erase_tensors.push_back(tensor);
}
}
if (!erase_tensors.empty()) gc->Add(erase_tensors);
}
} }
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
......
...@@ -32,38 +32,32 @@ template <typename T> ...@@ -32,38 +32,32 @@ template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount( std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
const ProgramDesc& prog, size_t block_id) { const ProgramDesc& prog, size_t block_id) {
auto& block = prog.Block(block_id); auto& block = prog.Block(block_id);
std::unordered_set<std::string> ignored_vars;
std::unordered_map<std::string, T> ref_cnts; std::unordered_map<std::string, T> ref_cnts;
for (auto var_desc : block.AllVars()) { auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto* var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) continue;
auto type = var_desc->Proto()->type().type(); auto type = var_desc->Proto()->type().type();
if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) { if (type != proto::VarType::LOD_TENSOR &&
ignored_vars.insert(var_desc->Name()); // ignore persistable vars type != proto::VarType::SELECTED_ROWS) {
} continue;
} }
for (auto op_desc : block.AllOps()) { auto it = ref_cnts.find(name);
for (auto& input : op_desc->Inputs()) { if (it != ref_cnts.end()) {
for (auto& input_name : input.second) { ++it->second;
if (!ignored_vars.count(input_name)) { } else {
if (ref_cnts.count(input_name)) ref_cnts[name] = 1;
++ref_cnts[input_name];
else
ref_cnts[input_name] = 1;
} }
} }
} }
};
for (auto& output : op_desc->Outputs()) { for (auto op_desc : block.AllOps()) {
for (auto output_name : output.second) { update_ref_cnts(op_desc, op_desc->Inputs());
if (!ignored_vars.count(output_name)) { update_ref_cnts(op_desc, op_desc->Outputs());
if (ref_cnts.count(output_name))
++ref_cnts[output_name];
else
ref_cnts[output_name] = 1;
}
}
}
} }
return ref_cnts; return ref_cnts;
} }
......
...@@ -44,89 +44,6 @@ namespace ir { ...@@ -44,89 +44,6 @@ namespace ir {
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \ GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name) GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
// Returns a new tensor with the same dims as `vec` whose i-th float element
// is f(vec[i]). The result buffer is allocated on platform::CPUPlace(); the
// input is read through data<float>() and is not modified.
template <typename UnaryOperation>
LoDTensor tensor_apply(const LoDTensor& vec, UnaryOperation f) {
  LoDTensor result;
  result.Resize(vec.dims());

  const float* src = vec.data<float>();
  float* dst = result.mutable_data<float>(platform::CPUPlace());

  for (int64_t idx = 0; idx < vec.numel(); ++idx) {
    dst[idx] = f(src[idx]);
  }
  return result;
}
// Overwrites every float element of `vec` with f(element). The buffer is
// obtained through mutable_data<float>(platform::CPUPlace()).
void tensor_apply_inplace(LoDTensor* vec, float (*f)(float)) {
  float* values = vec->mutable_data<float>(platform::CPUPlace());
  for (int64_t idx = 0; idx < vec->numel(); ++idx) {
    values[idx] = f(values[idx]);
  }
}
// Element-wise binary operation on two float tensors of identical dims.
// Enforces vec_a.dims() == vec_b.dims(), then returns a new tensor (allocated
// on platform::CPUPlace()) whose i-th element is f(vec_a[i], vec_b[i]).
template <typename BinaryOperation>
LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
                               BinaryOperation f) {
  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());

  LoDTensor result;
  result.Resize(vec_a.dims());

  const float* lhs = vec_a.data<float>();
  const float* rhs = vec_b.data<float>();
  float* out = result.mutable_data<float>(platform::CPUPlace());

  for (int64_t idx = 0; idx < vec_a.numel(); ++idx) {
    out[idx] = f(lhs[idx], rhs[idx]);
  }
  return result;
}
// Applies `f` between a 2-D tensor `vec_a` of dims {H, W} and a column tensor
// `vec_b` of dims {H, 1}, broadcasting vec_b along the second dimension:
//   result[h][w] = f(vec_a[h][w], vec_b[h])
// Enforces that both inputs are 2-D, that their first dimensions match and
// that vec_b's second dimension is 1. The result has vec_a's dims and is
// allocated on platform::CPUPlace().
template <typename BinaryOperation>
LoDTensor tensor_apply_eltwise_broadcast(const LoDTensor& vec_a,
                                         const LoDTensor& vec_b,
                                         BinaryOperation f) {
  PADDLE_ENFORCE_EQ(vec_a.dims().size(), 2);
  PADDLE_ENFORCE_EQ(vec_b.dims().size(), 2);
  PADDLE_ENFORCE_EQ(vec_a.dims()[0], vec_b.dims()[0]);
  PADDLE_ENFORCE_EQ(vec_b.dims()[1], 1);

  LoDTensor result;
  result.Resize(vec_a.dims());

  const float* matrix = vec_a.data<float>();
  const float* column = vec_b.data<float>();
  float* out = result.mutable_data<float>(platform::CPUPlace());

  const size_t rows = vec_a.dims()[0];
  const size_t cols = vec_a.dims()[1];
  for (size_t r = 0; r < rows; ++r) {
    // Elements are visited in the order of a contiguous row-by-row layout.
    const size_t row_begin = r * cols;
    for (size_t c = 0; c < cols; ++c) {
      out[row_begin + c] = f(matrix[row_begin + c], column[r]);
    }
  }
  return result;
}
// Reshapes `tensor_to_reshape` in place to two dimensions {A, B * C * ...}:
// the first dimension is kept and all remaining dimensions are folded into
// the second one. A tensor with a single dimension {A} becomes {A, 1}.
// Enforces that the tensor has at least one dimension.
void make_tensor_2d(LoDTensor* tensor_to_reshape) {
  auto dims_count = tensor_to_reshape->dims().size();
  PADDLE_ENFORCE_GT(dims_count, 0);

  // Accumulate the folded extent in 64 bits so that the product of the
  // trailing dimensions is not truncated to (or overflowed in) a plain int.
  int64_t folded_size = 1;
  for (int i = 1; i < dims_count; i++) {
    folded_size *= tensor_to_reshape->dims()[i];
  }
  tensor_to_reshape->Resize(
      make_ddim({tensor_to_reshape->dims()[0], folded_size}));
}
// Scales `weights` by `tmp`, broadcasting `tmp` over everything but the first
// dimension of `weights`:
//   1. remember the original dims of `weights` ({A, B, C, ...});
//   2. view `weights` as {A, B * C * ...} and `tmp` as a 2-D tensor;
//   3. multiply every row of `weights` by the matching entry of `tmp`;
//   4. restore the original dims of `weights`.
// Note that `tmp` is left in its 2-D shape; it is not reshaped back.
void recompute_conv_weights(LoDTensor* weights, LoDTensor* tmp) {
  const auto original_dims = weights->dims();

  make_tensor_2d(weights);  // {A, B, C, ...} -> {A, B * C * ...}
  make_tensor_2d(tmp);      // expected to end up as {A, 1}

  *weights =
      tensor_apply_eltwise_broadcast(*weights, *tmp, std::multiplies<float>());

  weights->Resize(original_dims);
}
void recompute_bias_and_weights(const Scope* scope, void recompute_bias_and_weights(const Scope* scope,
ir::Node* conv_weight, // ir::Node* conv_weight, //
const ir::Node& bn_scale, // const ir::Node& bn_scale, //
...@@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope, ...@@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope,
const ir::Node& bn_variance, // const ir::Node& bn_variance, //
LoDTensor* eltwise_y_in_tensor, // LoDTensor* eltwise_y_in_tensor, //
float epsilon) { float epsilon) {
using EigenVectorArrayMap =
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixArrayMap = Eigen::Map<
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Re-compute bias of conv2d from BN // Re-compute bias of conv2d from BN
PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims()); PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims());
...@@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope, ...@@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope,
scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>(); scope->FindVar(bn_variance.Name())->GetMutable<LoDTensor>();
auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>(); auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable<LoDTensor>();
auto std_tensor = LoDTensor(); ConstEigenVectorArrayMap scale_array(scale_tensor->data<float>(),
std_tensor.Resize(bn_bias_tensor.dims()); scale_tensor->numel(), 1);
std_tensor = EigenVectorArrayMap variance_array(
tensor_apply(*variance_tensor, [&](float x) { return x + epsilon; }); variance_tensor->mutable_data<float>(platform::CPUPlace()),
variance_tensor->numel(), 1);
ConstEigenVectorArrayMap mean_array(mean_tensor->data<float>(),
mean_tensor->numel(), 1);
ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data<float>(),
bn_bias_tensor.numel(), 1);
using EigenVectorArrayMap = // variance will not be used anymore, so make it std_array and then tmp_array
Eigen::Map<Eigen::Array<float, Eigen::Dynamic, 1>>; variance_array += epsilon;
variance_array = variance_array.sqrt();
variance_array = scale_array / variance_array;
EigenVectorArrayMap eltwise_y_in_array(
eltwise_y_in_tensor->mutable_data<float>(platform::CPUPlace()),
eltwise_y_in_tensor->numel(), 1);
EigenVectorArrayMap std_vec( eltwise_y_in_array =
std_tensor.mutable_data<float>(platform::CPUPlace()), std_tensor.numel(), ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array;
1);
std_vec = std_vec.sqrt();
auto tmp_tensor =
tensor_apply_eltwise(*scale_tensor, std_tensor, std::divides<float>());
auto tensor_minus = tensor_apply_eltwise(*eltwise_y_in_tensor, *mean_tensor,
std::minus<float>());
auto tensor_mul =
tensor_apply_eltwise(tensor_minus, tmp_tensor, std::multiplies<float>());
*eltwise_y_in_tensor =
tensor_apply_eltwise(tensor_mul, bn_bias_tensor, std::plus<float>());
// Re-compute weight of conv2d from BN // Re-compute weight of conv2d from BN
auto* current_param = auto* weights = scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>();
scope->FindVar(conv_weight->Name())->GetMutable<LoDTensor>(); auto weights_shape = weights->dims();
recompute_conv_weights(current_param, &tmp_tensor); auto weights_shape_2d = flatten_to_2d(weights_shape, 1);
EigenMatrixArrayMap weights_array_2d(
weights->mutable_data<float>(platform::CPUPlace()), weights_shape_2d[0],
weights_shape_2d[1]);
weights_array_2d.colwise() *= variance_array;
} }
std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl( std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
......
...@@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() {
} }
} }
} }
// member_ must be destructed before gcs_ since the destructor of
// ReferenceCountOpHandle uses raw pointers to gcs_ internally.
member_.reset();
} }
} // namespace framework } // namespace framework
......
...@@ -75,7 +75,7 @@ class ParallelExecutor { ...@@ -75,7 +75,7 @@ class ParallelExecutor {
private: private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const; void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
ParallelExecutorPrivate *member_; std::unique_ptr<ParallelExecutorPrivate> member_;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// ref_cnts_ is only initialized when ParallelExecutor constructs, and then // ref_cnts_ is only initialized when ParallelExecutor constructs, and then
......
...@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() { ...@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
Scope::~Scope() { DropKids(); } Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
kids_.push_back(new Scope(this)); kids_.push_back(new Scope(this));
return *kids_.back(); return *kids_.back();
} }
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return VarInternal(name); return VarInternal(name);
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
...@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) { ...@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
} }
Variable* Scope::FindVar(const std::string& name) const { Variable* Scope::FindVar(const std::string& name) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return FindVarInternal(name); return FindVarInternal(name);
} }
// Locked wrapper around FindVarLocally(): holds mutex_ for the duration of
// the lookup and returns whatever FindVarLocally(name) returns.
Variable* Scope::FindLocalVar(const std::string& name) const {
  std::lock_guard<std::mutex> guard(mutex_);
  Variable* found = FindVarLocally(name);
  return found;
}
const Scope* Scope::FindScope(const Variable* var) const { const Scope* Scope::FindScope(const Variable* var) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return FindScopeInternal(var); return FindScopeInternal(var);
} }
void Scope::DropKids() { void Scope::DropKids() {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
for (Scope* s : kids_) delete s; for (Scope* s : kids_) delete s;
kids_.clear(); kids_.clear();
} }
bool Scope::HasKid(const Scope* scope) const { bool Scope::HasKid(const Scope* scope) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end(); return it != this->kids_.end();
} }
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
std::vector<std::string> known_vars; std::vector<std::string> known_vars;
known_vars.reserve(this->vars_.size()); known_vars.reserve(this->vars_.size());
for (auto& p : vars_) { for (auto& p : vars_) {
...@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const { ...@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
} }
void Scope::DeleteScope(Scope* scope) const { void Scope::DeleteScope(Scope* scope) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
...@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const { ...@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
} }
void Scope::EraseVars(const std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
...@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) { ...@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const { const std::string& new_name) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
} }
std::string Scope::Rename(const std::string& origin_name) const { std::string Scope::Rename(const std::string& origin_name) const {
std::unique_lock<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
return new_name; return new_name;
......
...@@ -63,6 +63,11 @@ class Scope { ...@@ -63,6 +63,11 @@ class Scope {
/// Caller doesn't own the returned Variable. /// Caller doesn't own the returned Variable.
Variable* FindVar(const std::string& name) const; Variable* FindVar(const std::string& name) const;
/// Find a variable in the current scope.
/// Return nullptr if the variable cannot be found.
/// Caller doesn't own the returned Variable.
Variable* FindLocalVar(const std::string& name) const;
const Scope* parent() const { return parent_; } const Scope* parent() const { return parent_; }
/// Find the scope or an ancestor scope that contains the given variable. /// Find the scope or an ancestor scope that contains the given variable.
......
...@@ -18,6 +18,7 @@ namespace paddle { ...@@ -18,6 +18,7 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
class AdadeltaOp : public framework::OperatorWithKernel { class AdadeltaOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel { ...@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel {
"Input(AvgSquaredGrad) of AdadeltaOp should not be null."); "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
"Input(AvgSquaredUpdate) of AdadeltaOp should not be null."); "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
ctx->GetInputsVarType("Grad").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of AdadeltaOp should not be null."); "Output(ParamOut) of AdadeltaOp should not be null.");
...@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel { ...@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("AvgSquaredGradOut", param_dim); ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
} }
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
auto input_data_type = auto input_data_type =
......
...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T> ...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class AdadeltaOpKernel : public framework::OpKernel<T> { class AdadeltaOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto avg_squared_grad_out_tensor = auto avg_squared_grad_out_tensor =
ctx.Output<framework::Tensor>("AvgSquaredGradOut"); ctx.Output<framework::Tensor>("AvgSquaredGradOut");
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -21,25 +22,31 @@ namespace operators { ...@@ -21,25 +22,31 @@ namespace operators {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct SparseAdagradFunctor { struct SparseAdagradFunctor {
void operator()(const DeviceContext& context, void operator()(const DeviceContext &context,
const framework::SelectedRows& grad, const framework::SelectedRows &grad,
const framework::Tensor& learning_rate, T epsilon, const framework::Tensor &learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param); framework::Tensor *moment, framework::Tensor *param);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class AdagradOpKernel : public framework::OpKernel<T> { class AdagradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto* param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); const auto *param_var = ctx.InputVar("Param");
auto* moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut"); PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
auto *param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto *moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
param_out_tensor->mutable_data<T>(ctx.GetPlace()); param_out_tensor->mutable_data<T>(ctx.GetPlace());
moment_out_tensor->mutable_data<T>(ctx.GetPlace()); moment_out_tensor->mutable_data<T>(ctx.GetPlace());
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon")); T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto* grad_var = ctx.InputVar("Grad"); auto *grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<framework::LoDTensor>()) { if (grad_var->IsType<framework::LoDTensor>()) {
auto param = framework::EigenVector<T>::Flatten( auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param")); *ctx.Input<framework::Tensor>("Param"));
...@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> { ...@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
*ctx.Input<framework::Tensor>("Grad")); *ctx.Input<framework::Tensor>("Grad"));
auto moment = framework::EigenVector<T>::Flatten( auto moment = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Moment")); *ctx.Input<framework::Tensor>("Moment"));
auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate"); auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor); auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor); auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
auto* place = ctx.template device_context<DeviceContext>().eigen_device(); auto *place = ctx.template device_context<DeviceContext>().eigen_device();
moment_out.device(*place) = moment + grad * grad; moment_out.device(*place) = moment + grad * grad;
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel()); Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
if (platform::is_cpu_place(ctx.GetPlace())) { if (platform::is_cpu_place(ctx.GetPlace())) {
auto* lr = learning_rate->data<T>(); auto *lr = learning_rate->data<T>();
param_out.device(*place) = param_out.device(*place) =
param - lr[0] * grad / (moment_out.sqrt() + epsilon); param - lr[0] * grad / (moment_out.sqrt() + epsilon);
} else { } else {
...@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> { ...@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> {
lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
} }
} else if (grad_var->IsType<framework::SelectedRows>()) { } else if (grad_var->IsType<framework::SelectedRows>()) {
auto* param_tensor = ctx.Input<framework::Tensor>("Param"); auto *param_tensor = ctx.Input<framework::Tensor>("Param");
PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
auto* moment_tensor = ctx.Input<framework::Tensor>("Moment"); auto *moment_tensor = ctx.Input<framework::Tensor>("Moment");
PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
SparseAdagradFunctor<DeviceContext, T> functor; SparseAdagradFunctor<DeviceContext, T> functor;
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
...@@ -199,23 +200,9 @@ struct SparseAdamFunctor { ...@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
row_numel_(row_numel), row_numel_(row_numel),
row_count_(row_count) {} row_count_(row_count) {}
inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
int64_t beg = 0, end = row_count_ - 1;
while (beg <= end) {
auto mid = ((beg + end) >> 1);
if (rows_[mid] == row)
return mid;
else if (rows_[mid] < row)
beg = mid + 1;
else
end = mid - 1;
}
return -1;
}
inline HOSTDEVICE void operator()(size_t i) const { inline HOSTDEVICE void operator()(size_t i) const {
int64_t row = i / row_numel_; auto row_idx =
auto row_idx = BinarySearchInRows(row); math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
// The following code is the same as dense // The following code is the same as dense
...@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T> ...@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T>
class AdamOpKernel : public framework::OpKernel<T> { class AdamOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
using paddle::framework::LoDTensor; using paddle::framework::LoDTensor;
using paddle::operators::detail::Ref; using paddle::operators::detail::Ref;
......
...@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel { ...@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel {
"Input(LearningRate) of AdamaxOp should not be null."); "Input(LearningRate) of AdamaxOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
"Input(Beta1Pow) of AdamaxOp should not be null."); "Input(Beta1Pow) of AdamaxOp should not be null.");
// Param and Grad must both be dense LoDTensors for this op. The message
// carries two placeholders so that both trailing arguments (the variable
// name and the received variable type) are consumed by the formatter.
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Param").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The Var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Grad").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The Var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of AdamaxOp should not be null."); "Output(ParamOut) of AdamaxOp should not be null.");
......
...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T> ...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class AdamaxOpKernel : public framework::OpKernel<T> { class AdamaxOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut"); auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut"); auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
......
...@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { ...@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("LearningRate"), ctx->HasInput("LearningRate"),
"Input(LearningRate) of DecayedAdagradOp should not be null."); "Input(LearningRate) of DecayedAdagradOp should not be null.");
// Param and Grad must both be dense LoDTensors for this op. The message
// carries two placeholders so that both trailing arguments (the variable
// name and the received variable type) are consumed by the formatter.
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Param").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The Var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Grad").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The Var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of DecayedAdagradOp should not be null."); "Output(ParamOut) of DecayedAdagradOp should not be null.");
......
...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T> ...@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class DecayedAdagradOpKernel : public framework::OpKernel<T> { class DecayedAdagradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut"); auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
......
...@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel { ...@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel {
"Input(Grad) of FTRL should not be null."); "Input(Grad) of FTRL should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"), PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of FTRL should not be null."); "Input(LearningRate) of FTRL should not be null.");
// Param and Grad must both be dense LoDTensors for this op. The message
// carries two placeholders so that both trailing arguments (the variable
// name and the received variable type) are consumed by the formatter.
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Param").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The Var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Grad").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The Var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of FTRL should not be null."); "Output(ParamOut) of FTRL should not be null.");
......
...@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T> ...@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T>
class FTRLOpKernel : public framework::OpKernel<T> { class FTRLOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto* param_out = ctx.Output<Tensor>("ParamOut"); auto* param_out = ctx.Output<Tensor>("ParamOut");
auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut"); auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut"); auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut");
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cstdint> // for int64_t
#include <numeric>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
// Looks up `val` in the array `x` of `num` elements using binary search.
//
// `x` must be sorted in ascending order with respect to operator<; otherwise
// the result is meaningless. Returns the index of an element equal to `val`
// (which one is unspecified when duplicates exist), or -1 when no element
// matches or when `num` <= 0.
template <typename T>
HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
  int64_t beg = 0, end = num - 1;
  while (beg <= end) {
    // beg + (end - beg) / 2 yields the same midpoint as (beg + end) / 2 but
    // cannot overflow the signed index type.
    const int64_t mid = beg + ((end - beg) >> 1);
    if (x[mid] == val) {
      return mid;
    }
    if (x[mid] < val) {
      beg = mid + 1;
    } else {
      end = mid - 1;
    }
  }
  return -1;
}
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <set> #include <set>
#include <unordered_map> #include <unordered_map>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle { namespace paddle {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
......
...@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/sequence_pooling.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -180,6 +182,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -180,6 +182,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
} }
auto lod = input.lod()[0]; auto lod = input.lod()[0];
auto& place = *context.eigen_device(); auto& place = *context.eigen_device();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
Tensor in_t = Tensor in_t =
input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1])); input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
...@@ -191,7 +194,14 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -191,7 +194,14 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
if (pooltype == "AVERAGE") { if (pooltype == "AVERAGE") {
out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}})); out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "SUM") { } else if (pooltype == "SUM") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})); if (h > 0) {
const T* in_data = in_t.data<T>();
T* out_data = out_t.mutable_data<T>(context.GetPlace());
blas.VCOPY(w, in_data, out_data);
for (int64_t r = 1; r != h; ++r) {
blas.AXPY(w, 1., in_data + r * w, out_data);
}
}
} else if (pooltype == "SQRT") { } else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) / out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h)); std::sqrt(static_cast<T>(h));
...@@ -223,6 +233,7 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> { ...@@ -223,6 +233,7 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
} }
auto lod = in_grad->lod()[0]; auto lod = in_grad->lod()[0];
auto& place = *context.eigen_device(); auto& place = *context.eigen_device();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]), auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
static_cast<int>(lod[i + 1])); static_cast<int>(lod[i + 1]));
...@@ -237,7 +248,11 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> { ...@@ -237,7 +248,11 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
if (pooltype == "AVERAGE") { if (pooltype == "AVERAGE") {
in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast); in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
} else if (pooltype == "SUM") { } else if (pooltype == "SUM") {
in_g_e.device(place) = (out_g_e).broadcast(bcast); const T* out_g_data = out_g_t.data<T>();
T* in_g_data = in_g_t.mutable_data<T>(context.GetPlace());
for (int r = 0; r != h; ++r) {
blas.VCOPY(w, out_g_data, in_g_data + r * w);
}
} else if (pooltype == "SQRT") { } else if (pooltype == "SQRT") {
in_g_e.device(place) = in_g_e.device(place) =
(out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast); (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
......
...@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel { ...@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel {
"Input(velocity) of Momentum should not be null."); "Input(velocity) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"), PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of Momentum should not be null."); "Input(LearningRate) of Momentum should not be null.");
// Param must be a dense LoDTensor. The message carries two placeholders so
// that both trailing arguments (the variable name and the received variable
// type) are consumed by the formatter.
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Param").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The Var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of Momentum should not be null."); "Output(ParamOut) of Momentum should not be null.");
......
...@@ -46,6 +46,17 @@ template <typename T> ...@@ -46,6 +46,17 @@ template <typename T>
class MomentumOpCUDAKernel : public framework::OpKernel<T> { class MomentumOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(), grad_var->Type().name());
auto param_out = ctx.Output<framework::Tensor>("ParamOut"); auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut"); auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
auto param = ctx.Input<framework::Tensor>("Param"); auto param = ctx.Input<framework::Tensor>("Param");
......
...@@ -23,6 +23,12 @@ template <typename T> ...@@ -23,6 +23,12 @@ template <typename T>
class MomentumOpKernel : public framework::OpKernel<T> { class MomentumOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
auto param_out = ctx.Output<framework::Tensor>("ParamOut"); auto param_out = ctx.Output<framework::Tensor>("ParamOut");
auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut"); auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
auto param = ctx.Input<framework::Tensor>("Param"); auto param = ctx.Input<framework::Tensor>("Param");
......
...@@ -31,8 +31,8 @@ class BlockingQueue { ...@@ -31,8 +31,8 @@ class BlockingQueue {
// is a workaround and a simplified version of framework::Channel as it // is a workaround and a simplified version of framework::Channel as it
// doesn't support GPU and it implements on buffered blocking queue. // doesn't support GPU and it implements on buffered blocking queue.
public: public:
explicit BlockingQueue(size_t capacity) explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
: capacity_(capacity), closed_(false) { : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
capacity_, 0, capacity_, 0,
"The capacity of a reader::BlockingQueue must be greater than 0."); "The capacity of a reader::BlockingQueue must be greater than 0.");
...@@ -72,7 +72,9 @@ class BlockingQueue { ...@@ -72,7 +72,9 @@ class BlockingQueue {
if (!queue_.empty()) { if (!queue_.empty()) {
PADDLE_ENFORCE_NOT_NULL(elem); PADDLE_ENFORCE_NOT_NULL(elem);
*elem = queue_.front(); *elem = queue_.front();
if (LIKELY(!speed_test_mode_)) {
queue_.pop_front(); queue_.pop_front();
}
send_cv_.notify_one(); send_cv_.notify_one();
return true; return true;
} else { } else {
...@@ -114,6 +116,7 @@ class BlockingQueue { ...@@ -114,6 +116,7 @@ class BlockingQueue {
private: private:
size_t capacity_; size_t capacity_;
bool speed_test_mode_;
bool closed_; bool closed_;
std::deque<T> queue_; std::deque<T> queue_;
......
...@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue { ...@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue {
private: private:
LoDTensorBlockingQueue(size_t capacity, LoDTensorBlockingQueue(size_t capacity,
const std::vector<framework::DDim>& dims) const std::vector<framework::DDim>& dims,
: queue_(capacity), dims_(dims) {} bool speed_test_mode = false)
: queue_(capacity, speed_test_mode), dims_(dims) {}
public: public:
bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) { bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
...@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue { ...@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue {
class LoDTensorBlockingQueueHolder { class LoDTensorBlockingQueueHolder {
public: public:
void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) { void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims,
bool speed_test_mode = false) {
PADDLE_ENFORCE( PADDLE_ENFORCE(
queue_ == nullptr, queue_ == nullptr,
"LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode));
} }
inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
......
...@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) { ...@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) {
q.Receive(&b); q.Receive(&b);
EXPECT_EQ(a.val_, b.val_); EXPECT_EQ(a.val_, b.val_);
} }
// Exercises the second constructor argument of BlockingQueue
// (speed_test_mode) and checks the behavior this test expects from it:
//  - false: every Receive() yields the next element in FIFO order and the
//    queue ends up empty;
//  - true: every Receive() yields the first element sent and the queue keeps
//    its full size.
TEST(BlockingQueue, speed_test_mode) {
  const size_t queue_size = 10;
  // Initialized so that a Receive() which fails to write is never compared
  // against an indeterminate value.
  size_t received = 0;

  BlockingQueue<size_t> q1(queue_size, false);
  for (size_t i = 0; i < queue_size; ++i) {
    q1.Send(i);
  }
  for (size_t i = 0; i < queue_size; ++i) {
    q1.Receive(&received);
    EXPECT_EQ(received, i);
  }
  // Compare against an unsigned zero to avoid a signed/unsigned comparison.
  EXPECT_EQ(q1.Size(), static_cast<size_t>(0));

  BlockingQueue<size_t> q2(queue_size, true);
  for (size_t i = 0; i < queue_size; ++i) {
    q2.Send(i);
  }
  for (size_t i = 0; i < queue_size; ++i) {
    q2.Receive(&received);
    EXPECT_EQ(received, static_cast<size_t>(0));
  }
  EXPECT_EQ(q2.Size(), queue_size);
}
...@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel {
"Input(Grad) of RmspropOp should not be null."); "Input(Grad) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment"), PADDLE_ENFORCE(ctx->HasInput("Moment"),
"Input(Moment) of RmspropOp should not be null."); "Input(Moment) of RmspropOp should not be null.");
// Param must be a dense LoDTensor. The message carries two placeholders so
// that both trailing arguments (the variable name and the received variable
// type) are consumed by the formatter.
PADDLE_ENFORCE(
    ctx->GetInputsVarType("Param").front() ==
        framework::proto::VarType::LOD_TENSOR,
    "The Var(%s)'s type should be LoDTensor, but the received is %s",
    ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(param_out) of RmspropOp should not be null."); "Output(param_out) of RmspropOp should not be null.");
......
...@@ -13,66 +13,254 @@ See the License for the specific language governing permissions and ...@@ -13,66 +13,254 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>; using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
// Gradient accessor for a dense gradient buffer: the value for flat element
// `idx` is read directly from the wrapped pointer.
template <typename T>
struct DenseRmspropGradFunctor {
  inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {}

  // Returns the gradient stored at flat position `idx`.
  HOSTDEVICE inline T operator()(int64_t idx) const {
    const T *elem = grad_ + idx;
    return *elem;
  }

  // Borrowed pointer to the gradient data; this struct never frees it.
  const T *grad_;
};
// Gradient accessor for a row-sparse gradient. `rows_` lists the `row_count_`
// row ids that are present and `grad_` stores `row_numel_` values for each of
// them, in the same order. The lookup uses math::BinarySearch, so `rows_` is
// expected to be sorted in ascending order.
template <typename T>
struct SparseRmspropGradFunctor {
  inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows,
                                  int64_t row_numel, int64_t row_count)
      : grad_(grad),
        rows_(rows),
        row_numel_(row_numel),
        row_count_(row_count) {}

  // Returns the gradient for flat element `idx` of the full parameter, or 0
  // when the row containing `idx` is not listed in `rows_`.
  HOSTDEVICE inline T operator()(int64_t idx) const {
    const int64_t row = idx / row_numel_;
    const int64_t col = idx % row_numel_;
    const auto slot = math::BinarySearch(rows_, row_count_, row);
    return slot >= 0 ? grad_[slot * row_numel_ + col] : 0;
  }

  const T *grad_;
  const int64_t *rows_;
  int64_t row_numel_;
  int64_t row_count_;
};
// Element-wise RMSProp step without gradient centering. All state is updated
// in place through the stored pointers:
//   ms    = rho * ms + (1 - rho) * g^2
//   mom   = momentum * mom + lr * g / sqrt(ms + epsilon)
//   param = param - mom
// where g is obtained from `grad_functor_` and lr is read from `lr_[0]`.
template <typename T, typename GradFunctor>
struct UncenteredRmspropFunctor {
  UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho,
                           T epsilon, T momentum,
                           const GradFunctor &grad_functor)
      : param_(param),
        ms_(ms),
        mom_(mom),
        lr_(lr),
        rho_(rho),
        epsilon_(epsilon),
        momentum_(momentum),
        grad_functor_(grad_functor) {}

  // Applies the update to the element at flat position `idx`.
  HOSTDEVICE inline void operator()(int64_t idx) const {
    const T grad = grad_functor_(idx);
    const T mean_square = rho_ * ms_[idx] + (1 - rho_) * grad * grad;
    const auto denom = sqrt(mean_square + epsilon_);
    const T velocity = momentum_ * mom_[idx] + lr_[0] * grad / denom;

    // Write back only after every old value has been read.
    param_[idx] = param_[idx] - velocity;
    ms_[idx] = mean_square;
    mom_[idx] = velocity;
  }

  T *param_;
  T *ms_;
  T *mom_;
  const T *lr_;
  T rho_;
  T epsilon_;
  T momentum_;
  GradFunctor grad_functor_;
};
// Element-wise RMSProp step with gradient centering. All state is updated in
// place through the stored pointers:
//   ms    = rho * ms + (1 - rho) * g^2
//   mg    = rho * mg + (1 - rho) * g
//   mom   = momentum * mom + lr * g / sqrt(ms - mg^2 + epsilon)
//   param = param - mom
// where g is obtained from `grad_functor_` and lr is read from `lr_[0]`.
template <typename T, typename GradFunctor>
struct CenteredRmspropFunctor {
  CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr,
                         T rho, T epsilon, T momentum,
                         const GradFunctor &grad_functor)
      : param_(param),
        ms_(ms),
        mom_(mom),
        mean_grad_(mean_grad),
        lr_(lr),
        rho_(rho),
        epsilon_(epsilon),
        momentum_(momentum),
        grad_functor_(grad_functor) {}

  // Applies the update to the element at flat position `idx`.
  HOSTDEVICE inline void operator()(int64_t idx) const {
    const T grad = grad_functor_(idx);
    const T mean_square = rho_ * ms_[idx] + (1 - rho_) * grad * grad;
    const T grad_mean = rho_ * mean_grad_[idx] + (1 - rho_) * grad;
    const auto denom = sqrt(mean_square - grad_mean * grad_mean + epsilon_);
    const T velocity = momentum_ * mom_[idx] + lr_[0] * grad / denom;

    // Write back only after every old value has been read.
    param_[idx] = param_[idx] - velocity;
    ms_[idx] = mean_square;
    mom_[idx] = velocity;
    mean_grad_[idx] = grad_mean;
  }

  T *param_;
  T *ms_;
  T *mom_;
  T *mean_grad_;
  const T *lr_;
  T rho_;
  T epsilon_;
  T momentum_;
  GradFunctor grad_functor_;
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class RmspropOpKernel : public framework::OpKernel<T> { class RmspropOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto* param_out = ctx.Output<Tensor>("ParamOut"); using LoDTensor = framework::LoDTensor;
auto* moment_out = ctx.Output<Tensor>("MomentOut"); auto *grad_var = ctx.InputVar("Grad");
auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut"); auto *param_out = ctx.Output<LoDTensor>("ParamOut");
auto *moment_out = ctx.Output<LoDTensor>("MomentOut");
auto *mean_square_out = ctx.Output<LoDTensor>("MeanSquareOut");
auto grad = ctx.Input<Tensor>("Grad"); auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto rho = static_cast<T>(ctx.Attr<float>("decay"));
auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
bool centered = ctx.Attr<bool>("centered");
param_out->mutable_data<T>(ctx.GetPlace()); auto &p_tensor = *ctx.Input<LoDTensor>("Param");
moment_out->mutable_data<T>(ctx.GetPlace()); auto &ms_tensor = *ctx.Input<LoDTensor>("MeanSquare");
mean_square_out->mutable_data<T>(ctx.GetPlace()); auto &lr_tensor = *ctx.Input<LoDTensor>("LearningRate");
auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
float epsilon = ctx.Attr<float>("epsilon"); PADDLE_ENFORCE_EQ(&p_tensor, param_out,
float rho = ctx.Attr<float>("decay"); "Param and ParamOut must be the same Tensor");
float momentum = ctx.Attr<float>("momentum"); PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
bool centered = ctx.Attr<bool>("centered"); "Moment and MomentOut must be the same Tensor");
PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
"MeanSquare and MeanSquareOut must be the same Tensor");
auto &dev_ctx = ctx.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(ms_tensor.numel());
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param")); if (grad_var->IsType<LoDTensor>()) {
auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare")); auto &grad_tensor = grad_var->Get<LoDTensor>();
auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
auto g = EigenVector<T>::Flatten(*grad); if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value) {
auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment")); auto &place =
*ctx.template device_context<DeviceContext>().eigen_device();
auto lr_value = lr_tensor.data<T>()[0];
auto p = EigenVector<T>::Flatten(p_tensor);
auto ms = EigenVector<T>::Flatten(ms_tensor);
auto g = EigenVector<T>::Flatten(grad_tensor);
auto mom = EigenVector<T>::Flatten(mom_tensor);
auto p_out = EigenVector<T>::Flatten(*param_out); auto p_out = EigenVector<T>::Flatten(*param_out);
auto mom_out = EigenVector<T>::Flatten(*moment_out); auto mom_out = EigenVector<T>::Flatten(*moment_out);
auto ms_out = EigenVector<T>::Flatten(*mean_square_out); auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
ms_out.device(place) = rho * ms + (1 - rho) * g * g; ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) { if (centered) {
auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad")); auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut"); auto mg = EigenVector<T>::Flatten(mg_tensor);
mean_grad_out->mutable_data<T>(ctx.GetPlace()); auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
// Compare the two addresses: PADDLE_ENFORCE would only test that &mg_tensor
// is non-null, which is always true, and never look at mean_grad_out.
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
                  "MeanGrad and MeanGradOut must be the same Tensor");
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out); auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g; mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) = momentum * mom +
lr.broadcast(grad_dsize) * g /
(ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) = mom_out.device(place) =
momentum * mom + momentum * mom +
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) =
momentum * mom + lr_value * g / (ms_out + epsilon).sqrt();
} }
p_out.device(place) = p - mom_out; p_out.device(place) = p - mom_out;
} else {
DenseRmspropGradFunctor<T> grad_func(grad_tensor.data<T>());
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
// Compare the two addresses: PADDLE_ENFORCE would only test that &mg_tensor
// is non-null, which is always true. The functor below reads and writes the
// running mean through mean_grad_out only, so the update must be in place.
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
                  "MeanGrad and MeanGradOut must be the same Tensor");
for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()),
mean_grad_out->mutable_data<T>(ctx.GetPlace()),
lr_tensor.data<T>(), rho, epsilon, momentum, grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
}
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto &grad = grad_var->Get<framework::SelectedRows>();
auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
.Var()
->GetMutable<framework::SelectedRows>();
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(dev_ctx, grad, merged_grad);
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
const int64_t *rows;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = merged_grad->rows().CUDAData(ctx.GetPlace());
} else {
#endif
rows = merged_grad->rows().data();
#ifdef PADDLE_WITH_CUDA
}
#endif
auto &merged_tensor = merged_grad->value();
int64_t row_count = merged_grad->rows().size();
int64_t row_numel = merged_tensor.numel() / row_count;
SparseRmspropGradFunctor<T> grad_func(merged_tensor.data<T>(), rows,
row_numel, row_count);
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()),
mean_grad_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
} else {
for_range(UncenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
moment_out->mutable_data<T>(ctx.GetPlace()), lr_tensor.data<T>(),
rho, epsilon, momentum, grad_func));
}
} else {
PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
}
} }
}; };
......
...@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"), PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of SGDOp should not be null."); "Input(Param) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"), PADDLE_ENFORCE(ctx->HasInput("Grad"),
...@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext &ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
return framework::OpKernelType(data_type, ctx.device_context()); return framework::OpKernelType(data_type, ctx.device_context());
} }
...@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel {
class SGDOpInferVarType : public framework::VarTypeInference { class SGDOpInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc& op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc* block) const override { framework::BlockDesc *block) const override {
auto input_var = op_desc.Input("Param")[0]; auto input_var_n = op_desc.Input("Param")[0];
for (auto& out_var : op_desc.Output("ParamOut")) { auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType();
if (block->FindRecursiveOrCreateVar(input_var).GetType() == PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
framework::proto::VarType::SELECTED_ROWS) { in_var_type == framework::proto::VarType::LOD_TENSOR,
block->FindRecursiveOrCreateVar(out_var).SetType( "The input Var's type should be LoDtensor or SelectedRows,"
framework::proto::VarType::SELECTED_ROWS); " but the received var(%s)'s type is %s",
} else { input_var_n, in_var_type);
block->FindRecursiveOrCreateVar(out_var).SetType(
framework::proto::VarType::LOD_TENSOR); for (auto &out_var_n : op_desc.Output("ParamOut")) {
auto &out_var = block->FindRecursiveOrCreateVar(out_var_n);
if (out_var.GetType() != in_var_type) {
out_var.SetType(in_var_type);
} }
} }
} }
......
...@@ -56,6 +56,12 @@ template <typename T> ...@@ -56,6 +56,12 @@ template <typename T>
class SGDOpCUDAKernel : public framework::OpKernel<T> { class SGDOpCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(), param_var->Type().name());
auto* param = ctx.Input<framework::Tensor>("Param"); auto* param = ctx.Input<framework::Tensor>("Param");
auto* param_out = ctx.Output<framework::Tensor>("ParamOut"); auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate"); auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
......
...@@ -198,9 +198,9 @@ class CudnnHolder { ...@@ -198,9 +198,9 @@ class CudnnHolder {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: place_(place), cudnn_holder_(nullptr) { : place_(place), cudnn_holder_(nullptr) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
compute_capability = GetCUDAComputeCapability(place_.device); compute_capability_ = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device); multi_process_ = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_)); PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place); eigen_stream_->Reinitialize(&stream_, place);
...@@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
cudnn_holder_.reset(new CudnnHolder(&stream_, place)); cudnn_holder_.reset(new CudnnHolder(&stream_, place));
} }
driver_version_ = GetCUDADriverVersion(place_.device);
runtime_version_ = GetCUDARuntimeVersion(place_.device);
LOG(INFO) << "device: " << place_.device
<< ", CUDA Capability: " << compute_capability_
<< ", Driver Version: " << driver_version_ / 1000 << "."
<< (driver_version_ % 100) / 10
<< ", Runtime Version: " << runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
callback_manager_.reset(new StreamCallbackManager(stream_)); callback_manager_.reset(new StreamCallbackManager(stream_));
} }
...@@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const { ...@@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const {
} }
int CUDADeviceContext::GetComputeCapability() const { int CUDADeviceContext::GetComputeCapability() const {
return compute_capability; return compute_capability_;
} }
int CUDADeviceContext::GetMaxPhysicalThreadCount() const { int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp; return multi_process_ * max_threads_per_mp_;
} }
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
......
...@@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext { ...@@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_; cudaStream_t stream_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
int compute_capability; int compute_capability_;
int multi_process; int runtime_version_;
int max_threads_per_mp; int driver_version_;
int multi_process_;
int max_threads_per_mp_;
mutable std::mutex mtx_; mutable std::mutex mtx_;
......
...@@ -130,6 +130,13 @@ struct EOFException : public std::exception { ...@@ -130,6 +130,13 @@ struct EOFException : public std::exception {
#define UNLIKELY(condition) (condition == 0) #define UNLIKELY(condition) (condition == 0)
#endif #endif
// LIKELY(condition) evaluates to a truthy value exactly when `condition` is
// true. On non-Windows builds it additionally passes the expectation "this is
// usually true" to the compiler through __builtin_expect.
#if !defined(_WIN32)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
// There is no equivalent intrinsic in MSVC, so only the truth test remains.
// The argument is parenthesized so that operators with lower precedence than
// `!=` (for example `a & b`) are evaluated before the comparison.
#define LIKELY(condition) ((condition) != 0)
#endif
template <typename... Args> template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
bool stat, const Args&... args) { bool stat, const Args&... args) {
......
...@@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) { ...@@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) {
return device_prop.major * 10 + device_prop.minor; return device_prop.major * 10 + device_prop.minor;
} }
// Returns the value reported by cudaRuntimeGetVersion.
// `id` is only checked against the device count; the version query itself is
// not given a device argument.
int GetCUDARuntimeVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  int runtime_version = 0;
  // The message names this wrapper so a failure can be traced to its caller.
  PADDLE_ENFORCE(cudaRuntimeGetVersion(&runtime_version),
                 "cudaRuntimeGetVersion failed in "
                 "paddle::platform::GetCUDARuntimeVersion");
  return runtime_version;
}
// Returns the value reported by cudaDriverGetVersion, after checking that
// `id` is less than the device count.
int GetCUDADriverVersion(int id) {
  int version = 0;
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  PADDLE_ENFORCE(
      cudaDriverGetVersion(&version),
      "cudaDriverGetVersion failed in paddle::platform::GetCUDADriverVersion");
  return version;
}
int GetCUDAMultiProcessors(int id) { int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count; int count;
......
...@@ -29,6 +29,12 @@ int GetCUDADeviceCount(); ...@@ -29,6 +29,12 @@ int GetCUDADeviceCount();
//! Get the compute capability of the ith GPU (format: major * 10 + minor) //! Get the compute capability of the ith GPU (format: major * 10 + minor)
int GetCUDAComputeCapability(int i); int GetCUDAComputeCapability(int i);
//! Get the runtime version of the ith GPU
int GetCUDARuntimeVersion(int id);
//! Get the driver version of the ith GPU
int GetCUDADriverVersion(int id);
//! Get the MultiProcessors of the ith GPU. //! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i); int GetCUDAMultiProcessors(int i);
......
...@@ -57,6 +57,10 @@ limitations under the License. */ ...@@ -57,6 +57,10 @@ limitations under the License. */
#include "pybind11/stl.h" #include "pybind11/stl.h"
DEFINE_bool(reader_queue_speed_test_mode, false,
"If set true, the queue.pop will only get data from queue but not "
"remove the data from queue for speed testing");
// disable auto conversion to list in Python // disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
...@@ -170,14 +174,14 @@ PYBIND11_PLUGIN(core) { ...@@ -170,14 +174,14 @@ PYBIND11_PLUGIN(core) {
A LoDTensor X can look like the example below. It contains 2 sequences. A LoDTensor X can look like the example below. It contains 2 sequences.
The first has length 2 and the second has length 3, as described by x.lod. The first has length 2 and the second has length 3, as described by x.lod.
The first tensor dimension 6=2+3 is calculated from LoD if it's available. The first tensor dimension 5=2+3 is calculated from LoD if it's available.
It means the total number of sequence element. In X, each element has 2 It means the total number of sequence element. In X, each element has 2
columns, hence [6, 2]. columns, hence [5, 2].
x.lod = [[2, 3]] x.lod = [[2, 3]]
x.data = [[1, 2], [3, 4], x.data = [[1, 2], [3, 4],
[5, 6], [7, 8], [9, 10], [11, 12]] [5, 6], [7, 8], [9, 10]]
x.shape = [6, 2] x.shape = [5, 2]
LoD can have multiple levels (for example, a paragraph can have multiple LoD can have multiple levels (for example, a paragraph can have multiple
sentences and a sentence can have multiple words). In the following sentences and a sentence can have multiple words). In the following
...@@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle.
return make_ddim(shape); return make_ddim(shape);
}); });
auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>(); auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
holder->InitOnce(capacity, dims); holder->InitOnce(capacity, dims,
FLAGS_reader_queue_speed_test_mode);
return holder->GetQueue(); return holder->GetQueue();
}, },
py::return_value_policy::copy); py::return_value_policy::copy);
......
...@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \ ...@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
-DWITH_MKL=OFF \ -DWITH_MKL=OFF \
-DWITH_MKLDNN=OFF -DWITH_MKLDNN=OFF
make -j8 make -j8
make -j8 inference_lib_dist make -j8 fluid_lib_dist
``` ```
### step 2. generate program desc ### step 2. generate program desc
......
...@@ -648,25 +648,25 @@ function gen_capi_package() { ...@@ -648,25 +648,25 @@ function gen_capi_package() {
fi fi
} }
function gen_fluid_inference_lib() { function gen_fluid_lib() {
mkdir -p ${PADDLE_ROOT}/build mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Generating fluid inference library ... Generating fluid library for train and inference ...
======================================== ========================================
EOF EOF
cmake .. -DWITH_DISTRIBUTE=OFF cmake .. -DWITH_DISTRIBUTE=OFF
make -j `nproc` inference_lib_dist make -j `nproc` fluid_lib_dist
fi fi
} }
function tar_fluid_inference_lib() { function tar_fluid_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Taring fluid inference library ... Taring fluid library for train and inference ...
======================================== ========================================
EOF EOF
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
...@@ -675,11 +675,11 @@ EOF ...@@ -675,11 +675,11 @@ EOF
fi fi
} }
function test_fluid_inference_lib() { function test_fluid_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Testing fluid inference library ... Testing fluid library for inference ...
======================================== ========================================
EOF EOF
cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
...@@ -731,9 +731,9 @@ function main() { ...@@ -731,9 +731,9 @@ function main() {
;; ;;
fluid_inference_lib) fluid_inference_lib)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
gen_fluid_inference_lib gen_fluid_lib
tar_fluid_inference_lib tar_fluid_lib
test_fluid_inference_lib test_fluid_lib
;; ;;
check_style) check_style)
check_style check_style
...@@ -744,8 +744,8 @@ function main() { ...@@ -744,8 +744,8 @@ function main() {
assert_api_not_changed ${PYTHON_ABI:-""} assert_api_not_changed ${PYTHON_ABI:-""}
run_test run_test
gen_capi_package gen_capi_package
gen_fluid_inference_lib gen_fluid_lib
test_fluid_inference_lib test_fluid_lib
assert_api_spec_approvals assert_api_spec_approvals
;; ;;
maccheck) maccheck)
......
...@@ -113,7 +113,8 @@ def __bootstrap__(): ...@@ -113,7 +113,8 @@ def __bootstrap__():
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
"dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb' 'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb',
'reader_queue_speed_test_mode'
] ]
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
......
...@@ -107,6 +107,7 @@ __all__ = [ ...@@ -107,6 +107,7 @@ __all__ = [
'log', 'log',
'crop', 'crop',
'rank_loss', 'rank_loss',
'margin_rank_loss',
'elu', 'elu',
'relu6', 'relu6',
'pow', 'pow',
...@@ -5827,6 +5828,54 @@ def rank_loss(label, left, right, name=None): ...@@ -5827,6 +5828,54 @@ def rank_loss(label, left, right, name=None):
return out return out
def margin_rank_loss(label, left, right, margin=0.1, name=None):
    r"""
    Margin Ranking Loss Layer for ranking problem,
    which compares left score and right score passed in.
    The ranking loss can be defined as following equation:

    .. math::

        rank\_loss &= max(0, -label * (left - right) + margin)

    Args:
       label (Variable): Indicates whether the left is ranked higher than the right or not.
       left (Variable): Ranking score for left.
       right (Variable): Ranking score for right.
       margin (float): Indicates the given margin.
       name (str|None): A name for this layer (optional). If set None, the layer
                       will be named automatically.

    Returns:
       Variable: The ranking loss.

    Raises:
       ValueError: Any of label, left, and right is not a Variable.

    Examples:
        .. code-block:: python

           label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
           left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
           right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
           out = fluid.layers.margin_rank_loss(label, left, right)
    """
    # locals() is captured here, while it still holds only the call arguments.
    helper = LayerHelper('margin_rank_loss', **locals())
    # Checked in the order label, left, right so the first offending argument
    # is the one reported.
    for arg_name, arg in (("label", label), ("left", left), ("right", right)):
        if not isinstance(arg, Variable):
            raise ValueError("The %s should be a Variable." % arg_name)
    out = helper.create_tmp_variable(left.dtype)
    # 'Activated' is a second output of the op; only `out` is returned.
    act = helper.create_tmp_variable(left.dtype)
    helper.append_op(
        type='margin_rank_loss',
        inputs={"Label": label,
                "X1": left,
                "X2": right},
        outputs={'Out': out,
                 'Activated': act},
        attrs={'margin': margin})
    return out
def pad2d(input, def pad2d(input,
paddings=[0, 0, 0, 0], paddings=[0, 0, 0, 0],
mode='constant', mode='constant',
...@@ -6290,6 +6339,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): ...@@ -6290,6 +6339,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
outputs={'Out': out}, outputs={'Out': out},
attrs={'win_size': win_size, attrs={'win_size': win_size,
'pad_value': pad_value}) 'pad_value': pad_value})
return out
def sequence_mask(x, maxlen=None, dtype='int64', name=None): def sequence_mask(x, maxlen=None, dtype='int64', name=None):
......
...@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer): ...@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer):
optimizer = fluid.optimizer.Adamax(learning_rate=0.2) optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
optimizer.minimize(cost) optimizer.minimize(cost)
Notes:
Currently, AdamaxOptimizer doesn't support sparse parameter optimization.
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
_inf_norm_acc_str = "inf_norm" _inf_norm_acc_str = "inf_norm"
...@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer): ...@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer):
optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2) optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
optimizer.minimize(cost) optimizer.minimize(cost)
Notes:
Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
...@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer): ...@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer):
optimizer = fluid.optimizer.Adadelta( optimizer = fluid.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
_, params_grads = optimizer.minimize(cost) _, params_grads = optimizer.minimize(cost)
Notes:
Currently, AdadeltaOptimizer doesn't support sparse parameter optimization.
""" """
_avg_squared_grad_acc_str = "_avg_squared_grad" _avg_squared_grad_acc_str = "_avg_squared_grad"
...@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer): ...@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer):
optimizer = fluid.optimizer.Ftrl(0.0001) optimizer = fluid.optimizer.Ftrl(0.0001)
_, params_grads = optimizer.minimize(cost) _, params_grads = optimizer.minimize(cost)
Notes:
Currently, FtrlOptimizer doesn't support sparse parameter optimization.
""" """
_squared_acc_str = "squared" _squared_acc_str = "squared"
......
...@@ -19,33 +19,76 @@ import unittest ...@@ -19,33 +19,76 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.op import Operator from paddle.fluid.op import Operator
import paddle.fluid as fluid
def create_selected_rows_and_tensor(scope, place, height, row_num,
                                    embedding_size):
    """Fill a random sparse gradient and its dense equivalent into `scope`.

    The variable "@selected_rows@" receives `row_num` random row indices in
    [0, height) together with a random float32 value matrix of shape
    [row_num, embedding_size]. The variable "grad" receives a
    [height, embedding_size] float32 array in which every value row has been
    added to the row its index points at, so repeated indices accumulate.

    Args:
        scope: object whose `var(name)` returns variables exposing
            `get_selected_rows()` / `get_tensor()`.
        place: passed through unchanged to the `set` calls.
        height (int): number of rows of the dense array.
        row_num (int): number of sparse rows to generate.
        embedding_size (int): number of columns.

    Returns:
        tuple: (dense array, sparse value matrix), both float32 numpy arrays.
    """
    sr = scope.var("@selected_rows@").get_selected_rows()
    tensor = scope.var("grad").get_tensor()

    # randint excludes `high`, so `height` here gives the same [0, height - 1]
    # range as the deprecated random_integers(low=0, high=height - 1).
    rows = np.random.randint(
        low=0, high=height, size=[row_num]).astype('int64')
    sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32')

    sr.set_height(height)
    sr.set_rows(rows)
    sr.get_tensor().set(sr_val, place)

    tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32')
    # Unbuffered add: rows that appear several times are summed, not
    # overwritten.
    np.add.at(tensor_val, rows, sr_val)
    tensor.set(tensor_val, place)
    return tensor_val, sr_val
class TestBase(unittest.TestCase): class TestBase(unittest.TestCase):
def setup(self, centered, epsilon=1e-6): def setup(self,
place,
is_sparse,
centered,
size,
row_num=None,
epsilon=1e-6):
np.random.seed(5) # fix seed np.random.seed(5) # fix seed
self.scope = fluid.global_scope()
self.place = place
self.param_name = "param" self.param_name = "param"
self.param = np.random.random((123, 321)).astype("float32") self.param = np.random.random(size).astype("float32")
self.mean_square_name = "mean_square" self.mean_square_name = "mean_square"
self.mean_square = np.random.random((123, 321)).astype("float32") self.mean_square = np.random.uniform(
low=1, high=2, size=size).astype("float32")
self.mean_grad_name = "mean_grad" self.mean_grad_name = "mean_grad"
self.mean_grad = np.random.random((123, 321)).astype("float32") self.mean_grad = np.random.random(size).astype("float32")
self.lr_name = "lr" self.lr_name = "lr"
self.learning_rate = np.array([0.01]).astype("float32") self.learning_rate = np.array([0.01]).astype("float32")
self.grad_name = "grad" self.grad_name = "grad"
self.grad = np.random.random((123, 321)).astype("float32")
self.is_sparse = is_sparse
if self.is_sparse:
self.grad_sr_name = "@selected_rows@"
self.grad, self.grad_sr = create_selected_rows_and_tensor(
self.scope, place, size[0], row_num, size[1])
else:
self.grad = np.random.random(size).astype("float32")
grad_tensor = self.scope.var(self.grad_name).get_tensor()
grad_tensor.set(self.grad, place)
self.moment_name = "moment" self.moment_name = "moment"
self.moment = np.zeros((123, 321)).astype("float32") self.moment = np.random.uniform(
low=0, high=1, size=size).astype("float32")
self.epsilon = epsilon self.epsilon = epsilon
self.decay = 0.9 self.decay = 0.9
self.momentum = 0.0 self.momentum = 0.1
self.centered = centered self.centered = centered
self.ms_out = self.decay * self.mean_square + (1 - self.decay self.ms_out = self.decay * self.mean_square + (1 - self.decay
...@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase): ...@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase):
self.param_out = self.param - self.moment_out self.param_out = self.param - self.moment_out
def check(self,
actual_t,
expect_t,
place,
out_name,
atol=1e-5,
equal_nan=False):
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+ str(expect_t) + "\n" + "But Got" + str(actual_t))
class TestRmspropOp(TestBase):
def check_with_place(self, place, centered, epsilon):
self.setup(centered, epsilon)
scope = core.Scope()
# create and initialize Param Variable # create and initialize Param Variable
param = scope.var(self.param_name).get_tensor() self.param_tensor = self.scope.var(self.param_name).get_tensor()
param.set(self.param, place) self.param_tensor.set(self.param, place)
mean_square = scope.var(self.mean_square_name).get_tensor() self.mean_square_tensor = self.scope.var(
mean_square.set(self.mean_square, place) self.mean_square_name).get_tensor()
self.mean_square_tensor.set(self.mean_square, place)
lr = scope.var(self.lr_name).get_tensor() lr = self.scope.var(self.lr_name).get_tensor()
lr.set(self.learning_rate, place) lr.set(self.learning_rate, place)
grad = scope.var(self.grad_name).get_tensor() self.moment_tensor = self.scope.var(self.moment_name).get_tensor()
grad.set(self.grad, place) self.moment_tensor.set(self.moment, place)
moment = scope.var(self.moment_name).get_tensor() if self.centered:
moment.set(self.moment, place) self.mean_grad_tensor = self.scope.var(
self.mean_grad_name).get_tensor()
self.mean_grad_tensor.set(self.mean_grad, place)
def check(self, actual_t, expect_t, place, out_name, atol=1e-5):
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol),
"Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+ str(expect_t) + "\n" + "But Got" + str(actual_t))
# create and run sgd operator
if self.centered: class TestRmspropOp(TestBase):
mean_grad = scope.var(self.mean_grad_name).get_tensor() def check_with_place(self,
mean_grad.set(self.mean_grad, place) place,
is_sparse,
rmsprop_op = Operator( centered,
"rmsprop", size,
Param=self.param_name, row_num=None,
Grad=self.grad_name, epsilon=1e-6):
MeanSquare=self.mean_square_name, self.setup(place, is_sparse, centered, size, row_num, epsilon)
MeanGrad=self.mean_grad_name, self.run_and_check()
Moment=self.moment_name,
LearningRate=self.lr_name, def run_and_check(self):
ParamOut=self.param_name, grad_name = self.grad_sr_name if self.is_sparse else self.grad_name
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name, kwargs = {
MeanGradOut=self.mean_grad_name, 'Param': self.param_name,
epsilon=self.epsilon, 'Grad': grad_name,
decay=self.decay, 'MeanSquare': self.mean_square_name,
momentum=self.momentum, 'Moment': self.moment_name,
centered=True) 'LearningRate': self.lr_name,
else: 'ParamOut': self.param_name,
rmsprop_op = Operator( 'MeanSquareOut': self.mean_square_name,
"rmsprop", 'MomentOut': self.moment_name,
Param=self.param_name, 'epsilon': self.epsilon,
Grad=self.grad_name, 'decay': self.decay,
MeanSquare=self.mean_square_name, 'momentum': self.momentum,
Moment=self.moment_name, 'centered': self.centered
LearningRate=self.lr_name, }
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=False)
rmsprop_op.run(scope, place)
atol = 1e-5
equal_nan = False
if self.centered: if self.centered:
atol = 1e-3 kwargs['MeanGrad'] = self.mean_grad_name
equal_nan = True kwargs['MeanGradOut'] = self.mean_grad_name
rmsprop_op = Operator('rmsprop', **kwargs)
atol = 1e-6
rmsprop_op.run(self.scope, self.place)
self.check( self.check(
np.array(mean_square), self.ms_out, place, self.mean_square_name) np.array(self.mean_square_tensor),
self.ms_out,
self.place,
self.mean_square_name,
atol=atol)
self.check( self.check(
np.array(moment), np.array(self.moment_tensor),
self.moment_out, self.moment_out,
place, self.place,
self.moment_name, self.moment_name,
atol=atol, atol=atol)
equal_nan=equal_nan)
self.check( self.check(
np.array(param), np.array(self.param_tensor),
self.param_out, self.param_out,
place, self.place,
self.param_name, self.param_name,
atol=atol, atol=atol)
equal_nan=equal_nan)
if self.centered: if self.centered:
self.check( self.check(
np.array(mean_grad), self.mg_out, place, self.mean_grad_name) np.array(self.mean_grad_tensor), self.mg_out, self.place,
self.mean_grad_name)
def test_rmsprop(self): def test_rmsprop(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
size = (128, 320)
for place in places: for place in places:
self.check_with_place(place, False, 1e-6) for centered in [False, True]:
self.check_with_place(place, False, 1e-10) with fluid.scope_guard(core.Scope()):
self.check_with_place(place, True, 1e-6) self.check_with_place(
self.check_with_place(place, True, 1e-10) place, is_sparse=False, centered=centered, size=size)
with fluid.scope_guard(core.Scope()):
self.check_with_place(
place,
is_sparse=True,
centered=centered,
row_num=512,
size=size)
with fluid.scope_guard(core.Scope()):
self.check_with_place(
place,
is_sparse=True,
centered=centered,
row_num=60,
size=size)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册