diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 212724a0c7fc06edf0599ddcaad33aba1fead879..6a37b5ca433a3baa1388dd4f720d782ca53e4e99 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) @@ -84,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'] paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9fbefabc841e3f6940860f60d959fee97495e4c9..d09b94a3fd32952985a37cf4246c7640d2db4f56 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -64,7 +64,8 @@ class OpHandleBase { virtual bool IsMultiDeviceTransfer() { return false; } const platform::DeviceContext *DeviceContext(platform::Place place) { - return dev_ctxes_[place]; + auto it = dev_ctxes_.find(place); + return it != dev_ctxes_.end() ? 
it->second : nullptr; } void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4d0106b7f838e51b8357798daa4b10d..b212666637a5289c9c6cd3585655deaeed8afd4b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } +template <typename RefCntMap> +static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, + GarbageCollector<Tensor>* gc, + RefCntMap* ref_cnts) { + std::unordered_set<Tensor*> erase_tensors; + + auto handler = [&](const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto it = ref_cnts->find(name); + if (it == ref_cnts->end()) continue; + if ((it->second)-- == 1) { + auto* var = scope.FindVar(name); + if (var != nullptr) { + VLOG(10) << "Erase tensor \'" << name << "\'"; + if (var->IsType<LoDTensor>()) { + erase_tensors.insert(var->GetMutable<LoDTensor>()); + } else if (var->IsType<SelectedRows>()) { + erase_tensors.insert( + var->GetMutable<SelectedRows>()->mutable_value()); + } + } + } + } + } + }; + + handler(op->Inputs()); + handler(op->Outputs()); + + if (!erase_tensors.empty()) { + gc->Add(erase_tensors); + } +} + Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -66,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable<FeedFetchList>(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable<std::vector<framework::Scope>>(); + var->GetMutable<std::vector<framework::Scope *>>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable<LoDRankTable>(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { @@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } int64_t max_memory_size = GetEagerDeletionThreshold(); - std::unique_ptr<GarbageCollector<Tensor>> gc; - if (max_memory_size >= 0) { + // WhileOp would set keep_kids to false + // WhileGradOp would need the scopes created in WhileOp + // Perhaps, we should not perform eager deletion in WhileOp + // The scopes and variables created by WhileOp would be deleted + // in WhileGradOp.
+ if (max_memory_size >= 0 && !keep_kids) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, op->Run(*local_scope, place_); if (gc != nullptr) { - std::vector erase_vars; - for (auto& input : op->Inputs()) { - for (auto& input_name : input.second) { - auto it = ctx->cur_ref_cnts_.find(input_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { // should delete it - erase_vars.emplace_back(input_name); - ctx->cur_ref_cnts_.erase(input_name); - } else { - --(it->second); - } - } - } - - for (auto& output : op->Outputs()) { - for (auto& output_name : output.second) { - auto it = ctx->cur_ref_cnts_.find(output_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { - erase_vars.emplace_back(output_name); - ctx->cur_ref_cnts_.erase(output_name); - } else { - --(it->second); - } - } - } - - if (!erase_vars.empty()) { - std::vector erase_tensors; - for (auto& name : erase_vars) { - auto* var = local_scope->FindVar(name); - if (var == nullptr) continue; - if (var->IsType()) { - auto* tensor = var->GetMutable(); - erase_tensors.push_back(tensor); - } - } - if (!erase_tensors.empty()) gc->Add(erase_tensors); - } + DeleteUnusedTensors(*local_scope, op.get(), gc.get(), + &(ctx->cur_ref_cnts_)); } if (FLAGS_benchmark) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index f0cc1338a8af50030a70a9797cbcd1b0567272b5..36b36d49c2728dbef93042158dffa26d8f56d529 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -32,38 +32,32 @@ template std::unordered_map GetNonPersistableReferenceCount( const ProgramDesc& prog, size_t block_id) { auto& block = prog.Block(block_id); - std::unordered_set ignored_vars; std::unordered_map ref_cnts; - for (auto var_desc : block.AllVars()) { - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) { - ignored_vars.insert(var_desc->Name()); // ignore persistable vars - } - } - - for (auto op_desc : block.AllOps()) { - for (auto& input : op_desc->Inputs()) { - for (auto& input_name : input.second) { - if (!ignored_vars.count(input_name)) { - if (ref_cnts.count(input_name)) - ++ref_cnts[input_name]; - else - ref_cnts[input_name] = 1; + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS) { + continue; } - } - } - for (auto& output : op_desc->Outputs()) { - for (auto output_name : output.second) { - if (!ignored_vars.count(output_name)) { - if (ref_cnts.count(output_name)) - ++ref_cnts[output_name]; - else - ref_cnts[output_name] = 1; + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; } } } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); } return ref_cnts; } diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 8e1f93c5ebd448903d70f9668539e077875836e4..3e9353f5cf67d8de62c5551f12ea786e49190549 100644 --- 
a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>()); + auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ba10687d65cfbbac89cfc76879c8b202ebd03229..2840d503f1454271afb309efdd435225ab077dc0 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable<FeedFetchList>(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable<std::vector<framework::Scope>>(); + var->GetMutable<std::vector<framework::Scope *>>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable<LoDRankTable>(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9f930065324f13f5aa79c214e820fb6fc2f3a166..14fcde2fe3b1c3acfc0994e9cd37a784c57826d7 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); + + // The profiler has a process-wide mutex, which causes a serious performance issue + // in concurrent scenarios. Here we use an `if` to fix this issue. + // Please do not remove the `if`; ask @Superjomn if there is any concern. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); + } VLOG(3) << place << " " << DebugStringEx(&scope); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f06bad6c78c05804e583f859906b88fb7b500372..e8adabd26540754d5b9206294eeeed79757220bf 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() { } } } + + // member_ must be destructed before gcs_ since the destructor of + // ReferenceCountOpHandle uses raw pointers of gcs_ inside.
+ member_.reset(); } } // namespace framework diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index fd386a5987f11ff64964e95eb7e9b83572dc790c..ef09b98b2aa91a9d729b94d15dbb676dde4092b6 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -75,7 +75,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - ParallelExecutorPrivate *member_; + std::unique_ptr member_; #ifdef PADDLE_WITH_CUDA // ref_cnts_ is only initialized when ParallelExecutor constructs, and then diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 1a727a2c8c759d010606d5b605823b7252b35c69..a4abd1b1283f08fb8431fbeea0cea17c8439fdd7 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return VarInternal(name); } Variable* Scope::Var(std::string* name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindVarInternal(name); } +Variable* Scope::FindLocalVar(const std::string& name) const { + std::lock_guard lock(mutex_); + return FindVarLocally(name); +} + const Scope* Scope::FindScope(const Variable* var) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindScopeInternal(var); } void Scope::DropKids() { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -101,7 +106,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - 
std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index e42fff1d79d92fb7ed61768a614d8cd98f6775a0..14f9f36812d690fc4a7440f2e7e6a85e9993a535 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -63,6 +63,11 @@ class Scope { /// Caller doesn't own the returned Variable. Variable* FindVar(const std::string& name) const; + /// Find a variable in the current scope. + /// Return nullptr if cannot find. + /// Caller doesn't own the returned Variable. + Variable* FindLocalVar(const std::string& name) const; + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1d7a2eb5b38255531880fe3d2e5321024caf0c6b..69bcbc0e5891f95af4de8dfd49a25648ca920ab1 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { @@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { + if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index a1e5b967a86d10f3439db662af54bb82888027b9..793ccfc79fe56707f226477b9d50b1d972ab6a59 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + TensorCopy(dst_tensor, *cpu_place, &dst_tensor); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], 
dst_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); Tensor slice_tensor = src_tensor.Slice(1, 2); @@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + // Copy the same tensor + TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index e33849ef502fb10b913e7e28cbd0abdb8b8ff9bb..9d3fb811191c207c75845ef8f8486e8beac7525a 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -59,6 +59,7 @@ class VarDesc { public: explicit VarDesc(const std::string &name) { desc_.set_name(name); + // TODO(paddle-dev): Why default to lodtensor. desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); } diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 067e0c2b8389f88639fd9b95bd680702517efee1..873e1b20a584df3ba90cf5c1a62a3879bf98ce5c 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,8 +38,12 @@ class Variable { template T* GetMutable() { - if (!IsType()) { + if (!holder_) { holder_.reset(new PlaceholderImpl(new T())); + } else { + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index c5c1d215f4a6affae0a3bdafacec40a2aee2ca19..003dcfd3dfe5ecfd563a686bb72b061aff602f73 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - std::string* s = v->GetMutable(); - *s = "hello"; - - const std::string& ss = v->Get(); - EXPECT_EQ("hello", ss); + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3bc6af5241c41bd805699121d614d431d46d863f..3095dee0f0106b2408663cd32bb4fb310111eda4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -25,9 +25,11 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { @@ -47,6 +49,9 @@ bool AnalysisPredictor::Init( } #endif + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " @@ -335,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() { } return true; } + +AnalysisPredictor::~AnalysisPredictor() { +#if !defined(_WIN32) + if (FLAGS_profile) { + platform::DisableProfiler(platform::EventSortingKey::kTotal, + "./profile.log"); + } +#endif + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } +} + std::unique_ptr 
AnalysisPredictor::Clone() { auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 0d01d7ac2b29ea6364b07af9bb3bdeb5ced6bd00..5a9f4d36959d4ee7ca16dec769d9d1283b8787cb 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor { template void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); + ~AnalysisPredictor(); private: contrib::AnalysisConfig config_; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 6682e0a81b20c82aa668a249d37986386d769c83..7cda9c5d8a8366bd097491f37f5352a10e4fb16c 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -23,9 +23,11 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init( } #endif + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 031109398d8e21ad95f19f65fdd814e1782889e6..df3e3fcd9c75f03f4d9b0a7c12788f06bfdefd7f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) -op_library(fusion_lstm_op DEPS cpu_lstm_compute) +op_library(fusion_lstm_op DEPS jit_kernel) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index 3e724f52e4fcb2818f961181a2830dfd653d2733..3455d1ee54e8e6e498d0b0e6932ec099af9c0b30 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/for_range.h" @@ -199,23 +200,9 @@ struct SparseAdamFunctor { row_numel_(row_numel), row_count_(row_count) {} - inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const { - int64_t beg = 0, end = row_count_ - 1; - while (beg <= end) { - auto mid = ((beg + end) >> 1); - if (rows_[mid] == row) - return mid; - else if (rows_[mid] < row) - beg = mid + 1; - else - end = mid - 1; - } - return -1; - } - inline HOSTDEVICE void operator()(size_t i) const { - int64_t row = i / row_numel_; - auto row_idx = BinarySearchInRows(row); + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] : 0; // The following code is the same as dense diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 2826b82117db113d4d8c10095e89f610ca895775..e04a68717b351ddb0be5a7e70aa9297e5eb0125f 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase { } }; +class FillConstantOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -102,4 +108,5 @@ Fill up a variable with specified constant value. namespace ops = paddle::operators; REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ops::FillConstantOpVarTypeInference); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index ae1f6d8e489039667d861a69acabf2c632ef2061..067e6a3e7cccc1f15ebdd984f3a2441339a989ab 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/cpu_lstm_compute.h" -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/sequence2batch.h" -#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -219,121 +217,55 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. 
template class FuisonLSTMKernel : public framework::OpKernel { public: -#define INIT_VEC_FUNC \ - std::function act_gate, act_cell, act_cand; \ - auto& act_gate_str = ctx.Attr("gate_activation"); \ - auto& act_cell_str = ctx.Attr("cell_activation"); \ - auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } else { \ - math::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } - -#define INIT_BASE_INPUT_OUTPUT \ - auto* x = ctx.Input("X"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - auto* cell_out = ctx.Output("Cell"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - bool use_peepholes = ctx.Attr("use_peepholes"); - -#define INIT_BASE_SIZES \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - const int D4 = wh_dims[1]; - -#define INIT_BASE_INPUT_DATAS \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wc_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } - -/// Compute LSTM +#define INIT_BASE_DEFINES \ + using DeviceContext = paddle::platform::CPUDeviceContext; \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D4 = wh_dims[1] + +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, const std::string&, \ + const std::string&, const std::string&>( \ + ctx.Attr("gate_activation"), \ + ctx.Attr("candidate_activation"), \ + ctx.Attr("cell_activation"), D, use_peepholes) + +// Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ wh_data, D4, static_cast(1), out, D4) -#define 
GET_Ct(ct_1, gates, ct) \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - act_cand(D, gates, gates); \ - blas.VMUL(D, gates, gates + D, gates + D); \ - blas.VMUL(D, ct_1, gates + D2, gates + D2); \ - blas.VADD(D, gates + D, gates + D2, ct) - -#define GET_Ht(ct, gates, ht) \ - /* H_t = act_cell(C_t) * ogated */ \ - act_cell(D, ct, gates + D2); \ - blas.VMUL(D, gates + D2, gates + D3, ht) - -#define GET_Ct_NOH0C0(gates, ct) \ - /* C_t = igated * cgated*/ \ - act_gate(D, gates + D, gates + D); \ - act_cand(D, gates, gates); \ - blas.VMUL(D, gates, gates + D, ct) - -#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \ - GET_Ct_NOH0C0(gates, ct); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \ - GET_Ct_NOH0C0(gates, ct); \ - /* get outgated, put W_oc * C_t on igated */ \ - blas.VMUL(D, wc_data + D2, ct, gates + D); \ - blas.VADD(D, gates + D, gates + D3, gates + D3); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt(gates, ct_1, ct, ht) \ - act_gate(D3, gates + D, gates + D); \ - GET_Ct(ct_1, gates, ct); \ - GET_Ht(ct, gates, ht) - -#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ - /* get fgated and igated*/ \ - blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ - blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \ - blas.VADD(D2, checked_cell_data, gates + D, gates + D); \ - act_gate(D2, gates + D, gates + D); \ - GET_Ct(ct_1, gates, ct); \ - /* get ogated*/ \ - blas.VMUL(D, wc_data + D2, ct, gates + D); \ - blas.VADD(D, gates + D, gates + D3, gates + D3); \ - act_gate(D, gates + D3, gates + D3); \ - GET_Ht(ct, gates, ht) - void SeqCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES - INIT_VEC_FUNC - INIT_BASE_INPUT_DATAS - + INIT_BASE_DEFINES; + INIT_OTHER_DEFINES; auto x_lod = x->lod(); const int total_T = x_dims[0]; const int N = x_lod[0].size() - 1; @@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel { gate_offset = -D; } -#define MOVE_ONE_STEP \ - prev_h_data = h_out_data; \ - prev_c_data = c_out_data; \ - xx_data = xx_data + xx_offset; \ - h_out_data = h_out_data + gate_offset; \ - c_out_data = c_out_data + gate_offset - -#define PROCESS_H0C0_DEFINES \ - int bid = is_reverse ? 
N - 1 - i : i; \ - int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \ - const T* prev_c_data = nullptr; \ - const T* prev_h_data = nullptr; \ - int tstart = 0 - -#define PROCESS_H0C0_PEEPHOLE \ - PROCESS_H0C0_DEFINES; \ - if (h0_data) { \ - prev_h_data = h0_data + bid * D; \ - prev_c_data = c0_data + bid * D; \ - } else { \ - COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \ - MOVE_ONE_STEP; \ - tstart = 1; \ - } - -#define PROCESS_H0C0 \ - PROCESS_H0C0_DEFINES; \ - if (h0_data) { \ - prev_h_data = h0_data + bid * D; \ - prev_c_data = c0_data + bid * D; \ - } else { \ - COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \ - MOVE_ONE_STEP; \ - tstart = 1; \ - } - - if (use_peepholes) { - for (int i = 0; i < N; ++i) { - PROCESS_H0C0_PEEPHOLE - for (int step = tstart; step < seq_len; ++step) { - GEMM_WH_ADDON(1, prev_h_data, xx_data); - COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); - MOVE_ONE_STEP; - } - } - } else { - // TODO(TJ): unly workaround, clean me - std::function compute_ctht; - if (platform::jit::MayIUse(platform::jit::avx) && - act_gate_str == "sigmoid" && act_cand_str == "tanh" && - act_cell_str == "tanh" && D == 8) { - compute_ctht = math::lstm_compute_ctht; + for (int i = 0; i < N; ++i) { + int bid = is_reverse ? N - 1 - i : i; + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; + const T* prev_c_data = nullptr; + const T* prev_h_data = nullptr; + int tstart = 0; + if (h0_data) { + prev_h_data = h0_data + bid * D; + prev_c_data = c0_data + bid * D; } else { - compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { - COMPUTE_CtHt(gates, ct_1, ct, ht); - }; + ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data); + tstart = 1; + // move one step + prev_h_data = h_out_data; + prev_c_data = c_out_data; + xx_data = xx_data + xx_offset; + h_out_data = h_out_data + gate_offset; + c_out_data = c_out_data + gate_offset; } - for (int i = 0; i < N; ++i) { - PROCESS_H0C0 - for (int step = tstart; step < seq_len; ++step) { - GEMM_WH_ADDON(1, prev_h_data, xx_data); - compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data); - MOVE_ONE_STEP; - } + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data, + checked_cell_data); + // move one step + prev_h_data = h_out_data; + prev_c_data = c_out_data; + xx_data = xx_data + xx_offset; + h_out_data = h_out_data + gate_offset; + c_out_data = c_out_data + gate_offset; } } -#undef PROCESS_H0C0_DEFINES -#undef PROCESS_H0C0_PEEPHOLE -#undef PROCESS_H0C0 -#undef MOVE_ONE_STEP } void BatchCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = platform::CPUDeviceContext; - INIT_BASE_INPUT_OUTPUT - INIT_BASE_SIZES + INIT_BASE_DEFINES; if (x->lod()[0].size() == 2) { xx->Resize({x_dims[0], D4}); SeqCompute(ctx); return; } - INIT_VEC_FUNC - INIT_BASE_INPUT_DATAS + INIT_OTHER_DEFINES; auto* reordered_h0 = ctx.Output("ReorderedH0"); auto* reordered_c0 = ctx.Output("ReorderedC0"); @@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_c_data = reordered_c0_data; size_t sz = sizeof(T) * D; for (int i = 0; i < max_bs; ++i) { - std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); - std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + blas.VCOPY(sz, h0_data + seq_order[i] * D, reordered_h0_data); + blas.VCOPY(sz, c0_data + seq_order[i] * D, reordered_c0_data); reordered_h0_data += D; reordered_c0_data += D; } @@ -498,13 +388,7 @@ class 
FuisonLSTMKernel : public framework::OpKernel { T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = batched_c_out_data; for (int i = 0; i < max_bs; ++i) { - GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); - if (use_peepholes) { - blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D); - blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3); - } - act_gate(D, cur_in_data + D3, cur_in_data + D3); - GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data); + ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data); cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; @@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_h_data = batched_h_out_data; prev_c_data = batched_c_out_data; } + + // compute kernel part const auto& batch_starts = batched_lod[0]; const int max_seq_len = batch_starts.size() - 1; const int offset = tstart * max_bs * D; batched_input_data = batched_input_data + offset * 4; batched_h_out_data = batched_h_out_data + offset; batched_c_out_data = batched_c_out_data + offset; - -#define DEFINE_CUR \ - T* cur_in_data = batched_input_data; \ - T* cur_prev_c_data = prev_c_data; \ - T* cur_c_out_data = batched_c_out_data; \ - T* cur_h_out_data = batched_h_out_data - -#define MOVE_ONE_BATCH \ - cur_in_data += D4; \ - cur_prev_c_data += D; \ - cur_c_out_data += D; \ - cur_h_out_data += D - -#define MOVE_ONE_STEP \ - prev_c_data = batched_c_out_data; \ - prev_h_data = batched_h_out_data; \ - batched_c_out_data = cur_c_out_data; \ - batched_h_out_data = cur_h_out_data; \ - batched_input_data = cur_in_data - - if (use_peepholes) { - for (int step = tstart; step < max_seq_len; ++step) { - const int cur_bs = batch_starts[step + 1] - batch_starts[step]; - GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); - DEFINE_CUR; - for (int i = 0; i < cur_bs; ++i) { - COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data); - MOVE_ONE_BATCH; - } - MOVE_ONE_STEP; - } - } else { - // TODO(TJ): unly workaround, clean me - std::function compute_ctht; - if (platform::jit::MayIUse(platform::jit::avx) && - act_gate_str == "sigmoid" && act_cand_str == "tanh" && - act_cell_str == "tanh" && D == 8) { - compute_ctht = math::lstm_compute_ctht; - } else { - compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) { - COMPUTE_CtHt(gates, ct_1, ct, ht); - }; - } - for (int step = tstart; step < max_seq_len; ++step) { - const int cur_bs = batch_starts[step + 1] - batch_starts[step]; - GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); - DEFINE_CUR; - for (int i = 0; i < cur_bs; ++i) { - compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data); - MOVE_ONE_BATCH; - } - MOVE_ONE_STEP; + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + T* cur_in_data = batched_input_data; + T* cur_prev_c_data = prev_c_data; + T* cur_c_out_data = batched_c_out_data; + T* cur_h_out_data = batched_h_out_data; + for (int i = 0; i < cur_bs; ++i) { + ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data, wp_data, checked_cell_data); + // move one batch + cur_in_data += D4; + cur_prev_c_data += D; + cur_c_out_data += D; + cur_h_out_data += D; } + // move one step + prev_c_data = batched_c_out_data; + prev_h_data = batched_h_out_data; + batched_c_out_data = cur_c_out_data; + batched_h_out_data = cur_h_out_data; + batched_input_data = cur_in_data; } -#undef 
MOVE_ONE_STEP -#undef MOVE_ONE_BATCH -#undef DEFINE_CUR math::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); @@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel { } } -#undef COMPUTE_CtHt_PEEPHOLE -#undef COMPUTE_CtHt -#undef GET_Ct_NOH0C0 -#undef COMPUTE_CtHt_NOH0C0 -#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 -#undef GET_Ht -#undef GET_Ct #undef GEMM_WH_ADDON -#undef INIT_BASE_INPUT_DATAS -#undef INIT_BASE_SIZES -#undef INIT_BASE_INPUT_OUTPUT -#undef INIT_VEC_FUNC +#undef INIT_OTHER_DEFINES +#undef INIT_BASE_DEFINES }; } // namespace operators diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 248c7793560db99c0af06421bf74808422016061..7b42efd623b31a703bf51d2d157130b3120b42a4 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) 1-dim tensor, contains a bool scalar. The output " "tensor of overflow operator."); AddComment(string::Sprintf(R"DOC( -Overflow operator. +Overflow %s operator. $$Out = any(X)$$ @@ -69,6 +69,8 @@ Out = Inf if any X contains Inf, Out = Nan if any X contains Nan, Out = 0 if no Inf/Nan detected. If X contains both Inf/Nan, it will return the first indicator it meeted. + +%s )DOC", GetName(), GetComments())); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index b0276f4080b3989047f9d181e9d37a18dcfad5fa..7365bfeeb8edf09a8ad5e1cb2c61300e86bdf518 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -45,8 +45,6 @@ math_library(im2col) if (NOT WIN32) # windows do not support avx functions yet. math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -# TODO(TJ): ugly workaround, clean me -cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info) endif (NOT WIN32) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) @@ -76,3 +74,7 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) +cc_library(jit_kernel + SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc + DEPS cpu_info cblas activation_functions) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h new file mode 100644 index 0000000000000000000000000000000000000000..262469beea7449eb5820b86de1ac4f790a833e79 --- /dev/null +++ b/paddle/fluid/operators/math/algorithm.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include // for int64_t +#include + +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { + int64_t beg = 0, end = num - 1; + while (beg <= end) { + auto mid = ((beg + end) >> 1); + if (x[mid] == val) + return mid; + else if (x[mid] < val) + beg = mid + 1; + else + end = mid - 1; + } + return -1; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.cc b/paddle/fluid/operators/math/cpu_lstm_compute.cc deleted file mode 100644 index e96d1879331974e0873e13f171414bcfa8c45953..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/cpu_lstm_compute.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/cpu_lstm_compute.h" - -namespace paddle { -namespace operators { -namespace math { -#ifdef __AVX__ -template <> -void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht) { - namespace act = detail::forward::avx; - // gates: W_ch, W_ih, W_fh, W_oh - __m256 c, i, f, o; - c = _mm256_loadu_ps(gates); - i = _mm256_loadu_ps(gates + 8); - f = _mm256_loadu_ps(gates + 16); - o = _mm256_loadu_ps(gates + 24); - - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i)); - i = _mm256_loadu_ps(ct_1); - f = _mm256_mul_ps(i, act::Sigmoid(f)); - f = _mm256_add_ps(c, f); - _mm256_storeu_ps(ct, f); - - /* H_t = act_cell(C_t) * ogated */ - o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o)); - _mm256_storeu_ps(ht, o); -} -#endif -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_lstm_compute.h b/paddle/fluid/operators/math/cpu_lstm_compute.h deleted file mode 100644 index 169a9e4b47f54851ad436428416eca879b78e186..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/cpu_lstm_compute.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/platform/cpu_info.h" -#ifdef __AVX__ -#include -#endif - -namespace paddle { -namespace operators { -namespace math { - -// TODO(TJ): ugly workaround, clean me -template -void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) { - // gates: W_ch, W_ih, W_fh, W_oh - vec_sigmoid(24, gates + 8, gates + 8); - vec_tanh(8, gates, gates); - const T *i = gates + 8, *f = gates + 16, *o = gates + 24; - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int d = 0; d < 8; ++d) { - // C_t = C_t-1 * fgated + cand_gated * igated - ct[d] = ct_1[d] * f[d] + gates[d] * i[d]; - // H_t = act_cell(C_t) * ogated - T tmp = ct[d] * 2; - tmp = static_cast(0) - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vec_exp(1, &tmp, &tmp); - tmp = static_cast(2) / (static_cast(1) + tmp) - static_cast(1); - ht[d] = tmp * o[d]; - } -} - -#ifdef __AVX__ -namespace detail { -namespace forward { -namespace avx { -__m256 Sigmoid(const __m256 a); -__m256 Tanh(const __m256 a); - -} // namespace avx -} // namespace forward -} // namespace detail - -template <> -void lstm_compute_ctht(float* gates, const float* ct_1, float* ct, - float* ht); - -#endif - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 6a059968b79189458349e466079cc7a663a8e5ff..0aed253c80fc28560716cbcfa70f74ef9c84f9b6 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -125,10 +125,8 @@ inline void vec_scal(const int n, const float a, } template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me vec_scal(n, a, x, y); } @@ -181,10 +179,10 @@ inline void vec_bias_sub(const int n, const float a, } template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { // TODO(TJ): enable me vec_bias_sub(n, a, x, y); } @@ -242,7 +240,7 @@ inline void vec_cross(const int n, const float* x, } template <> -inline void vec_cross( +inline void vec_cross( const int n, const float* x, const float* y, const float* z, float* out) { // TODO(TJ): enable me vec_cross(n, x, y, z, out); @@ -296,10 +294,10 @@ inline void vec_add_bias(const int n, const float a, } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { // TODO(TJ): enable me vec_add_bias(n, a, x, y); } @@ -390,9 +388,9 @@ inline void vec_sigmoid(const int n, const float* x, } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, + const float* x, + float* y) { // TODO(TJ): enable me vec_sigmoid(n, x, y); } @@ -454,9 +452,8 @@ inline void vec_relu(const int n, const float* x, } template <> -inline void vec_relu(const int n, - const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me vec_relu(n, x, y); } diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 3ce66f49ed8354c49e8af26ca6eb48fef654a40b..cd40f1b2f984126663a5711efac24fdf6d680b32 100644 --- 
a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); @@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) { TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, - ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } @@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) { TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, - ref_relu); + TestAndBench(sz, vec_relu, ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) { TestInplace(sz, vec_sigmoid, ref_sigmoid); TestInplace(sz, vec_sigmoid, ref_sigmoid); TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); @@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) { TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, - ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } @@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) { TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, - ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..68b708b345334bc63b5e2e88c308d20ca6378e6b --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +KernelPool& KernelPool::Instance() { + static thread_local KernelPool g_jit_kernels; + return g_jit_kernels; +} + +std::shared_ptr KernelPool::Get(const std::string& key) const { + if (kers_.find(key) == kers_.end()) { + return nullptr; + } + return kers_.at(key); +} + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b4dfda6db76fd4231be0acd1f90c98a2d62134b8 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include // for shared_ptr +#include +#include +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/macros.h" + +// Note: Only support on CPU yet. +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define AVX_FLOAT_BLOCK 8 +#define AVX2_FLOAT_BLOCK 8 +#define AVX512_FLOAT_BLOCK 16 + +typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; + +class Kernel { + public: + Kernel() = default; + virtual ~Kernel() = default; + int num_{0}; + int end_{0}; + int rest_{0}; + DISABLE_COPY_AND_ASSIGN(Kernel); +}; + +class KernelPool { + public: + static KernelPool &Instance(); + + template + std::shared_ptr Get(ARGS... 
args); + + std::shared_ptr Get(const std::string &key) const; + + private: + KernelPool() = default; + std::unordered_map> kers_; + + DISABLE_COPY_AND_ASSIGN(KernelPool); +}; + +template +class VMulKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VAddKernel : public Kernel { + public: + virtual void Compute(const T *x, const T *y, T *z) const = 0; +}; + +template +class VScalKernel : public Kernel { + public: + virtual void Compute(const T a, const T *x, T *y) const = 0; + virtual void Compute(const T a, T *x) const = 0; +}; + +template +class VAddBiasKernel : public Kernel { + public: + virtual void Compute(const T a, const T *x, T *y) const = 0; +}; + +template +class VActKernel : public Kernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VReluKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VIdentityKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VExpKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VSigmoidKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class VTanhKernel : public VActKernel { + public: + virtual void Compute(const T *x, T *y) const = 0; +}; + +template +class LSTMKernel : public Kernel { + public: + virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht, + /* below only used in peephole*/ + const T *wp_data = nullptr, + T *checked = nullptr) const = 0; + + // compute c1 and h1 without c0 or h0 + virtual void ComputeC1H1(T *gates, T *ct, T *ht, + /* below only used in peephole*/ + const T *wp_data = nullptr) const = 0; +}; + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f9ea533fccdd34a5ccf061d89ffe92687d65933 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -0,0 +1,391 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +/* VMUL JitKernel */ +template +class VMulKernelImpl : public VMulKernel { + public: + explicit VMulKernelImpl(int d) : VMulKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = x[i] * y[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsMul(this->num_, x, y, z); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VMulKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdMul(this->num_, x, y, z); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VMulKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_mul_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ + } + +// avx > for > mkl +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 +#undef INTRI8_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VADD JitKernel */ +template +class VAddKernelImpl : public VAddKernel { + public: + explicit VAddKernelImpl(int d) : VAddKernel() { this->num_ = d; } + void Compute(const T* x, const T* y, T* z) const override { + for (int i = 0; i < this->num_; ++i) { + z[i] = x[i] + y[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + platform::dynload::vsAdd(this->num_, x, y, z); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VAddKernelImpl::Compute( \ + const double* x, const double* y, double* z) const { \ + platform::dynload::vdAdd(this->num_, x, y, z); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddKernelImpl::Compute( \ + const float* x, const float* y, float* z) const { \ + __m256 tmpx, tmpy; \ + tmpx = _mm256_loadu_ps(x); \ + tmpy = _mm256_loadu_ps(y); \ + tmpx = _mm256_add_ps(tmpx, tmpy); \ + _mm256_storeu_ps(z, tmpx); \ + } +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VSCAL JitKernel */ +template +class VScalKernelImpl : public VScalKernel { + public: + explicit VScalKernelImpl(int d) : VScalKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = a * x[i]; + } + } + void Compute(const T a, T* x) const override { + for (int i = 0; i < this->num_; ++i) { + x[i] = a * x[i]; + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> 
\ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + platform::dynload::cblas_sscal(this->num_, a, x, 1); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VScalKernelImpl::Compute(const double a, double* x) \ + const { \ + platform::dynload::cblas_dscal(this->num_, a, x, 1); \ + } + +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(y, tmp); \ + } +#define INTRI8_INPLACE_FLOAT(isa) \ + template <> \ + void VScalKernelImpl::Compute(const float a, float* x) \ + const { \ + __m256 tmp; \ + __m256 scalar = _mm256_set1_ps(a); \ + tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(x, tmp); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI8_INPLACE_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI8_INPLACE_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI8_INPLACE_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI8_INPLACE_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +/* VAddBias JitKernel */ +template +class VAddBiasKernelImpl : public VAddBiasKernel { + public: + explicit VAddBiasKernelImpl(int d) : VAddBiasKernel() { this->num_ = d; } + void Compute(const T a, const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = x[i] + a; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VAddBiasKernelImpl::Compute( \ + const float a, const float* x, float* y) const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \ + tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT + +/* VRelu JitKernel */ +template +class VReluKernelImpl : public VReluKernel { + public: + explicit VReluKernelImpl(int d) : VReluKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = x[i] > 0 ? 
x[i] : 0; + } + } +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VReluKernelImpl::VReluKernelImpl(int d) \ + : VReluKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - AVX_FLOAT_BLOCK; \ + } \ + template <> \ + void VReluKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 zeros = _mm256_setzero_ps(); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \ + tmp0 = _mm256_max_ps(tmp0, zeros); \ + tmp1 = _mm256_max_ps(tmp1, zeros); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + this->rest_, tmp1); \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VReluKernelImpl::VReluKernelImpl(int d) \ + : VReluKernel() { \ + this->num_ = d; \ + this->end_ = d - d % AVX_FLOAT_BLOCK; \ + this->rest_ = d - AVX_FLOAT_BLOCK; \ + } \ + template <> \ + void VReluKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 zeros = _mm256_setzero_ps(); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + __m256 tmp = _mm256_loadu_ps(x + this->rest_); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + this->rest_, tmp); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +INTRI_GT8LT16_FLOAT(jit::avx2); +INTRI_GT16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +// TODO(TJ): refine avx512 +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +INTRI_GT8LT16_FLOAT(jit::avx512f); +INTRI_GT16_FLOAT(jit::avx512f); +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT + +/* An empty JitKernel */ +template +class VIdentityKernelImpl : public VIdentityKernel { + public: + explicit VIdentityKernelImpl(int d) : VIdentityKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override {} +}; + +REGISTER_JITKERNEL(vmul, VMulKernel); +REGISTER_JITKERNEL(vadd, VAddKernel); +REGISTER_JITKERNEL(vscal, VScalKernel); +REGISTER_JITKERNEL(vaddb, VAddBiasKernel); +REGISTER_JITKERNEL(vrelu, VReluKernel); +REGISTER_JITKERNEL(videntity, VIdentityKernel); + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc new file mode 100644 index 0000000000000000000000000000000000000000..b62e130c43743f542e2074868fc01598047d6b19 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -0,0 +1,400 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include // for exp +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { + +#ifdef __AVX__ +namespace detail { +__m256 Exp(__m256 a); +} // namespace detail +#endif + +namespace jitkernel { +namespace jit = platform::jit; + +/* VExp JitKernel */ +template +class VExpKernelImpl : public VExpKernel { + public: + explicit VExpKernelImpl(int d) : VExpKernel() { this->num_ = d; } + void Compute(const T* x, T* y) const override { + for (int i = 0; i < this->num_; ++i) { + y[i] = std::exp(x[i]); + } + } +}; + +#ifdef PADDLE_WITH_MKLML +#define MKL_FLOAT(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + platform::dynload::vsExp(this->num_, x, y); \ + } + +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VExpKernelImpl::Compute(const double* x, double* y) \ + const { \ + platform::dynload::vdExp(this->num_, x, y); \ + } +FOR_EACH_ISA(MKL_FLOAT, kLT8); +FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); +FOR_EACH_ISA(MKL_FLOAT, kGT16); +FOR_EACH_ISA_BLOCK(MKL_DOUBLE); +#endif + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + _mm256_storeu_ps(y, detail::Exp(tmp)); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VExpKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = detail::Exp(tmp0); \ + tmp1 = detail::Exp(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +#endif +// TODO(TJ): eq16 test and complete avx512 + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef MKL_FLOAT +#undef MKL_DOUBLE + +REGISTER_JITKERNEL(vexp, VExpKernel); + +/* VSigmoid JitKernel */ +template +class VSigmoidKernelImpl : public VSigmoidKernel { + public: + explicit VSigmoidKernelImpl(int d) : VSigmoidKernel() { + this->num_ = d; + vexp_ = KernelPool::Instance().template Get>(d); + } + void Compute(const T* x, T* y) const override { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < this->num_; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); + y[i] = static_cast(0) - y[i]; + } + vexp_->Compute(y, y); + for (int i = 0; i < this->num_; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } + } + + private: + std::shared_ptr> vexp_; +}; + +#define INTRI_SIGMOID(tmp, min, max) \ + tmp = _mm256_max_ps(tmp, min); \ + tmp = _mm256_min_ps(tmp, max); \ + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \ + tmp = detail::Exp(tmp); \ + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_SIGMOID(tmp0, min, max); \ + INTRI_SIGMOID(tmp1, min, max); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y, tmp); \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VSigmoidKernelImpl::VSigmoidKernelImpl(int d) \ + : VSigmoidKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vexp_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + } \ + template <> \ + void VSigmoidKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_SIGMOID(tmp, min, max); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + const float min_ = SIGMOID_THRESHOLD_MIN; \ + const float max_ = SIGMOID_THRESHOLD_MAX; \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? 
max_ : x[i]); \ + y[i] = 0.f - y[i]; \ + } \ + vexp_->Compute(y + this->end_, y + this->end_); \ + for (int i = this->end_; i < this->num_; ++i) { \ + y[i] = 1.f / (1.f + y[i]); \ + } \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +// INTRI_GT8LT16_FLOAT(jit::avx2); +// INTRI_GT16_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +// INTRI_GT8LT16_FLOAT(jit::avx512f); +// INTRI_GT16_FLOAT(jit::avx512f); +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT +#undef INTRI_VSIGMOID + +REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); + +/* VTanh JitKernel */ +template +class VTanhKernelImpl : public VTanhKernel { + public: + explicit VTanhKernelImpl(int d) : VTanhKernel() { + this->num_ = d; + vscal_ = KernelPool::Instance().template Get>(d); + vsigmoid_ = KernelPool::Instance().template Get>(d); + vaddbias_ = KernelPool::Instance().template Get>(d); + } + void Compute(const T* x, T* y) const override { + vscal_->Compute(static_cast(2), x, y); + vsigmoid_->Compute(y, y); + vscal_->Compute(static_cast(2), y); + vaddbias_->Compute(static_cast(-1), y, y); + } + + private: + std::shared_ptr> vscal_; + std::shared_ptr> vsigmoid_; + std::shared_ptr> vaddbias_; +}; + +#define INTRI_VTANH(tmp) \ + tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \ + tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \ + tmp = detail::Exp(tmp); \ + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ + tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ + tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) + +#define INTRI8_FLOAT(isa) \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y, tmp); \ + } + +#define INTRI16_FLOAT(isa) \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_VTANH(tmp0); \ + INTRI_VTANH(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ + } + +#define INTRI_GT8LT16_FLOAT(isa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->end_ = AVX_FLOAT_BLOCK; \ + this->rest_ = d - this->end_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void VTanhKernelImpl::Compute(const float* x, \ + float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y, tmp); \ + x += AVX_FLOAT_BLOCK; \ + y += AVX_FLOAT_BLOCK; \ + vscal_->Compute(2.f, x, y); \ + vsigmoid_->Compute(y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ + } + +#define INTRI_GT16_FLOAT(isa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void 
VTanhKernelImpl::Compute(const float* x, float* y) \ + const { \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_VTANH(tmp); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + x += this->end_; \ + y += this->end_; \ + vscal_->Compute(2.f, x, y); \ + vsigmoid_->Compute(y, y); \ + vscal_->Compute(2.f, y); \ + vaddbias_->Compute(-1.f, y, y); \ + } + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +INTRI16_FLOAT(jit::avx); +INTRI_GT8LT16_FLOAT(jit::avx); +INTRI_GT16_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +INTRI16_FLOAT(jit::avx2); +// maybe use avx at gt8lt16 and gt16 +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +INTRI16_FLOAT(jit::avx512f); +// maybe use avx at gt8lt16 and gt16 +#endif + +#undef INTRI8_FLOAT +#undef INTRI16_FLOAT +#undef INTRI_GT8LT16_FLOAT +#undef INTRI_GT16_FLOAT +#undef INTRI_VTANH + +REGISTER_JITKERNEL(vtanh, VTanhKernel); + +#undef JITKERNEL_NEW_ACT_IMPL + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_lstm.cc b/paddle/fluid/operators/math/jit_kernel_lstm.cc new file mode 100644 index 0000000000000000000000000000000000000000..42a2b96fd945c516f8c26ca51ecb452345a9a86f --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_lstm.cc @@ -0,0 +1,308 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +#ifdef __AVX__ +namespace detail { +__m256 Exp(__m256 a); +} // namespace detail +#endif + +namespace jitkernel { +namespace jit = platform::jit; + +#ifdef __AVX__ +typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type; + +class AVXAct { + public: + virtual ~AVXAct() = default; + virtual __m256 Compute(__m256 x) const = 0; +}; + +template +class AVXActImpl : public AVXAct { + public: + __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); } +}; + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + __m256 ones = _mm256_set1_ps(1.0f); + x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); + x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); + x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); + x = detail::Exp(x); + x = _mm256_add_ps(ones, x); + return _mm256_div_ps(ones, x); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + __m256 ones = _mm256_set1_ps(1.0f); + x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); + x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); + x = detail::Exp(x); + x = _mm256_add_ps(ones, x); + x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); + return _mm256_sub_ps(x, ones); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + return _mm256_max_ps(x, _mm256_setzero_ps()); +} + +template <> +__m256 AVXActImpl::Compute(__m256 x) const { + return x; +} +#endif + +template +static std::shared_ptr> GetActKernel( + const std::string& type, int n) { + if (type == "sigmoid") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "relu") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "tanh") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } else if (type == "identity" || type == "") { + return std::dynamic_pointer_cast>( + KernelPool::Instance().template Get>(n)); + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +/* LSTM JitKernel */ +template +class LSTMKernelImpl : public LSTMKernel { + public: + explicit LSTMKernelImpl(const std::string& act_gate, + const std::string& act_cand, + const std::string& act_cell, int d) + : LSTMKernel() { + d_ = d; + d2_ = d * 2; + d3_ = d * 3; + act_gate_d3_ = GetActKernel(act_gate, d3_); + act_gate_d_ = GetActKernel(act_gate, d); + act_cand_d_ = GetActKernel(act_cand, d); + act_cell_d_ = GetActKernel(act_cell, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + vadd_d_ = KernelPool::Instance().template Get>(d); +#ifdef __AVX__ + auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr { + if (type == "sigmoid") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "relu") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "tanh") { + return std::unique_ptr(new AVXActImpl()); + } else if (type == "identity" || type == "") { + return std::unique_ptr(new AVXActImpl()); + } + PADDLE_THROW("Not support type: %s", type); + }; + avx_act_gate_ = GetAVXAct(act_gate); + avx_act_cand_ = GetAVXAct(act_cand); + avx_act_cell_ = GetAVXAct(act_cell); +#endif + } + + void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, + T* checked) const override { + // gates: 
W_ch, W_ih, W_fh, W_oh + act_gate_d3_->Compute(gates + d_, gates + d_); + + /* C_t = C_t-1 * fgated + cand_gated * igated */ + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, gates + d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vadd_d_->Compute(gates + d_, gates + d2_, ct); + + /* H_t = act_cell(C_t) * ogated */ + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { + /* C_t = igated * cgated*/ + act_gate_d_->Compute(gates + d_, gates + d_); + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, ct); + /* H_t = act_cell(C_t) * ogated */ + act_gate_d_->Compute(gates + d3_, gates + d3_); + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + private: + int d_, d2_, d3_; + std::shared_ptr> act_gate_d3_, act_gate_d_, act_cand_d_, + act_cell_d_; + std::shared_ptr> vmul_d_; + std::shared_ptr> vadd_d_; +#ifdef __AVX__ + std::unique_ptr avx_act_gate_, avx_act_cand_, avx_act_cell_; +#endif +}; + +#define INTRI8_FLOAT(isa) \ + template <> \ + void LSTMKernelImpl::ComputeCtHt( \ + float* gates, const float* ct_1, float* ct, float* ht, \ + const float* wp_data, float* checked) const { \ + /* gates: W_ch, W_ih, W_fh, W_oh */ \ + __m256 c, i, f, o; \ + c = _mm256_loadu_ps(gates); \ + i = _mm256_loadu_ps(gates + 8); \ + f = _mm256_loadu_ps(gates + 16); \ + o = _mm256_loadu_ps(gates + 24); \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ + i = _mm256_loadu_ps(ct_1); \ + f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ + f = _mm256_add_ps(c, f); \ + _mm256_storeu_ps(ct, f); \ + /* H_t = act_cell(C_t) * ogated */ \ + o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ + _mm256_storeu_ps(ht, o); \ + } + +// TODO(TJ): optimize keq16 + +#ifdef __AVX__ +INTRI8_FLOAT(jit::avx); +#endif +#ifdef __AVX2__ +INTRI8_FLOAT(jit::avx2); +#endif +#ifdef __AVX512F__ +INTRI8_FLOAT(jit::avx512f); +#endif + +/* Peephole JitKernel */ +template +class PeepholeKernelImpl : public LSTMKernel { + public: + explicit PeepholeKernelImpl(const std::string& act_gate, + const std::string& act_cand, + const std::string& act_cell, int d) + : LSTMKernel() { + d_ = d; + d2_ = d * 2; + d3_ = d * 3; + act_gate_d_ = GetActKernel(act_gate, d); + act_cand_d_ = GetActKernel(act_cand, d); + act_cell_d_ = GetActKernel(act_cell, d); + vmul_d_ = KernelPool::Instance().template Get>(d); + vadd_d_ = KernelPool::Instance().template Get>(d); + vadd_d2_ = KernelPool::Instance().template Get>(d2_); + act_gate_d2_ = GetActKernel(act_gate, d2_); + } + + void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, + T* checked) const override { + /* get fgated and igated*/ + vmul_d_->Compute(wp_data, ct_1, checked); + vmul_d_->Compute(wp_data + d_, ct_1, checked + d_); + vadd_d2_->Compute(checked, gates + d_, gates + d_); + act_gate_d2_->Compute(gates + d_, gates + d_); + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, gates + d_); + vmul_d_->Compute(ct_1, gates + d2_, gates + d2_); + vadd_d_->Compute(gates + d_, gates + d2_, ct); + /* get ogated*/ + vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + act_gate_d_->Compute(gates + d3_, gates + d3_); + /* H_t = act_cell(C_t) * ogated */ + 
act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { + /* C_t = igated * cgated*/ + act_gate_d_->Compute(gates + d_, gates + d_); + act_cand_d_->Compute(gates, gates); + vmul_d_->Compute(gates, gates + d_, ct); + /* get outgated, put W_oc * C_t on igated */ + vmul_d_->Compute(wp_data + d2_, ct, gates + d_); + vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); + /* H_t = act_cell(C_t) * ogated */ + act_gate_d_->Compute(gates + d3_, gates + d3_); + act_cell_d_->Compute(ct, gates + d2_); + vmul_d_->Compute(gates + d2_, gates + d3_, ht); + } + + private: + int d_, d2_, d3_; + std::shared_ptr> act_gate_d2_, act_gate_d_, act_cand_d_, + act_cell_d_; + std::shared_ptr> vmul_d_; + std::shared_ptr> vadd_d_, vadd_d2_; +}; + +#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const std::string&, \ + const std::string&, const std::string&, int, bool>( \ + const std::string& act_gate, const std::string& act_cand, \ + const std::string& act_cell, int d, bool use_peephole) + +#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \ + #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \ + (use_peephole ? "p" : "n") + +#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \ + if (use_peephole) { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>( \ + act_gate, act_cand, act_cell, d)); \ + } else { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(act_gate, act_cand, \ + act_cell, d)); \ + } + +REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, + JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); + +#undef INTRI8_FLOAT +#undef JITKERNEL_DECLARE_LSTM +#undef JITKERNEL_KEY_LSTM +#undef JITKERNEL_NEW_LSTM_IMPL +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h new file mode 100644 index 0000000000000000000000000000000000000000..d8e55f2673560ff6afa34376b73275b57a8ceea1 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+#include <string>
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+#define SEARCH_BLOCK(macro_, ker, dtype, isa)                \
+  if (d < AVX_FLOAT_BLOCK) {                                 \
+    macro_(ker, dtype, isa, kLT8);                           \
+  } else if (d == AVX_FLOAT_BLOCK) {                         \
+    macro_(ker, dtype, isa, kEQ8);                           \
+  } else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
+    macro_(ker, dtype, isa, kGT8LT16);                       \
+  } else if (d == AVX512_FLOAT_BLOCK) {                      \
+    macro_(ker, dtype, isa, kEQ16);                          \
+  } else {                                                   \
+    macro_(ker, dtype, isa, kGT16);                          \
+  }
+
+#define SEARCH_ISA_BLOCK(macro_, ker, dtype)          \
+  if (jit::MayIUse(jit::avx512f)) {                   \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f);   \
+  } else if (jit::MayIUse(jit::avx2)) {               \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx2);      \
+  } else if (jit::MayIUse(jit::avx)) {                \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::avx);       \
+  } else {                                            \
+    SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any);   \
+  }
+
+#define JITKERNEL_DECLARE(ker_class, ker_dtype)              \
+  template <>                                                \
+  std::shared_ptr<const ker_class<ker_dtype>>                \
+  KernelPool::Get<const ker_class<ker_dtype>, int>(int d)
+
+#define JITKERNEL_KEY(ker_key, dtype_key) \
+  #ker_key #dtype_key + std::to_string(d)
+
+#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k)      \
+  p = std::dynamic_pointer_cast<ker<dtype>>(        \
+      std::make_shared<ker##Impl<dtype, isa, k>>(d))
+
+#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key,  \
+                             marco_declare, macro_key, macro_impl)      \
+  marco_declare(ker_class, ker_dtype) {                                 \
+    std::string key = macro_key(ker_key, dtype_key);                    \
+    if (kers_.find(key) == kers_.end()) {                               \
+      std::shared_ptr<const ker_class<ker_dtype>> p;                    \
+      SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype);               \
+      kers_.insert({key, std::dynamic_pointer_cast<const Kernel>(p)});  \
+      return p;                                                         \
+    }                                                                   \
+    return std::dynamic_pointer_cast<const ker_class<ker_dtype>>(       \
+        kers_.at(key));                                                 \
+  }
+
+#define REGISTER_JITKERNEL(ker_key, ker_class)                            \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE,   \
+                       JITKERNEL_KEY, JITKERNEL_NEW_IMPL);                \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE,  \
+                       JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
+
+#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \
+                                macro_impl)                                   \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
+                       macro_impl);                                           \
+  JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare,          \
+                       macro_key, macro_impl)
+
+#define FOR_EACH_ISA(macro_, block)  \
+  macro_(jit::avx512f, block);       \
+  macro_(jit::avx2, block);          \
+  macro_(jit::avx, block);           \
+  macro_(jit::isa_any, block)
+
+#define FOR_EACH_BLOCK(macro_, isa)  \
+  macro_(isa, kLT8);                 \
+  macro_(isa, kEQ8);                 \
+  macro_(isa, kGT8LT16);             \
+  macro_(isa, kEQ16);                \
+  macro_(isa, kGT16)
+
+#define FOR_EACH_ISA_BLOCK(macro_)       \
+  FOR_EACH_BLOCK(macro_, jit::avx512f);  \
+  FOR_EACH_BLOCK(macro_, jit::avx2);     \
+  FOR_EACH_BLOCK(macro_, jit::avx);      \
+  FOR_EACH_BLOCK(macro_, jit::isa_any)
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26590171bbeaa385ac09b04e5faf483924176598
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -0,0 +1,749 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include // for exp +#include // for memcpy +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef __AVX__ +#include +#endif + +constexpr int repeat = 20000; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +template +void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), + const T upper = static_cast(20.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +void vrelu_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0.f ? x[i] : 0.f; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vrelu_intri8(const int n, const float* x, float* y) { + __m256 tmp = _mm256_loadu_ps(x); + tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); + _mm256_storeu_ps(y, tmp); +} +#endif + +TEST(JitKernel, vrelu) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -10.f, 1.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vrelu_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vrelu_intri8(d, x_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vaddbias_ref(const int n, const float a, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + a; + } +} + +TEST(JitKernel, vaddbias) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float a = 2.f; + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vaddbias_ref(d, a, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, 
x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vexp_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +#ifdef PADDLE_WITH_MKLML +void vexp_mkl(const int n, const float* x, float* y) { + paddle::platform::dynload::vsExp(n, x, y); +} +#endif + +TEST(JitKernel, vexp) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vexp_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vexp_mkl(d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +inline float _sigmoid(float x) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + float tmp = (x < min) ? min : ((x > max) ? max : x); + return 1.f / (1.f + std::exp(-tmp)); +} + +void vsigmoid_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = _sigmoid(x[i]); + } +} + +void vsigmoid_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VExpKernel>& vexp, + const int n, const float* x, float* y) { + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); + y[i] = 0.f - y[i]; + } + vexp->Compute(y, y); + for (int i = 0; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +} + +TEST(JitKernel, vsigmoid) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vexp = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vsigmoid_better(vexp, d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vsigmoid_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; } + +void vtanh_ref(const int n, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = _tanh(x[i]); + } +} + +void vtanh_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VScalKernel>& vscal, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddBiasKernel>& + vaddbias, + const int n, const float* x, float* y) { + vscal->Compute(2.f, x, y); + vsigmoid->Compute(y, y); + vscal->Compute(2.f, y); + vaddbias->Compute(-1.f, y, y); +} + +TEST(JitKernel, vtanh) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { + std::vector x(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data(), -2.f, 2.f); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const auto& vscal = + jit::KernelPool::Instance().template Get>(d); + const auto& vsigmoid = + jit::KernelPool::Instance().template Get>(d); + const auto& vaddbias = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vtanh_better(vscal, vsigmoid, vaddbias, d, x_data, zref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vtanh_ref(d, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void lstm_ctht_ref( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid_3d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, + 
const std::shared_ptr< + const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, + const int d, float* gates, const float* ct_1, float* ct, float* ht) { + vsigmoid_3d->Compute(gates + d, gates + d); + vtanh_d->Compute(gates, gates); + const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; + const float min = SIGMOID_THRESHOLD_MIN; + const float max = SIGMOID_THRESHOLD_MAX; + for (int k = 0; k < d; ++k) { + // C_t = C_t-1 * fgated + cand_gated * igated + ct[k] = ct_1[k] * f[k] + gates[k] * i[k]; + // H_t = act_cell(C_t) * ogated + float tmp = ct[k] * 2; + tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); + vexp_1->Compute(&tmp, &tmp); + tmp = 2.f / (1.f + tmp) - 1.f; + ht[k] = tmp * o[k]; + } +} + +void lstm_ctht_better( + const std::shared_ptr< + const paddle::operators::math::jitkernel::VSigmoidKernel>& + vsigmoid_3d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VMulKernel>& vmul_d, + const std::shared_ptr< + const paddle::operators::math::jitkernel::VAddKernel>& vadd_d, + const int d, float* gates, const float* ct_1, float* ct, float* ht) { + int d2 = d * 2; + vsigmoid_3d->Compute(gates + d, gates + d); + vtanh_d->Compute(gates, gates); + vmul_d->Compute(gates, gates + d, gates + d); + vmul_d->Compute(ct_1, gates + d2, gates + d2); + vadd_d->Compute(gates + d, gates + d2, ct); + /* H_t = act_cell(C_t) * ogated */ + vtanh_d->Compute(ct, gates + d2); + vmul_d->Compute(gates + d2, gates + d * 3, ht); +} + +TEST(JitKernel, lstm) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 32, 64, 100}) { + int d4 = d * 4; + int d3 = d * 3; + std::vector x(d4), xref(d4); + std::vector ct_1(d), ct_tgt(d), ht_tgt(d); + std::vector ct_ref(d), ht_ref(d); + RandomVec(d4, x.data(), -2.f, 2.f); + RandomVec(d, ct_1.data(), -2.f, 2.f); + memcpy(xref.data(), x.data(), sizeof(float) * d4); + std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const auto& ker = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, d, false); + // below kernels are used to compute refer + const auto& vsigmoid_3d = + jit::KernelPool::Instance().template Get>( + d3); + const auto& vtanh_d = + jit::KernelPool::Instance().template Get>(d); + const auto& vexp_1 = + jit::KernelPool::Instance().template Get>(1); + const auto& vmul_d = + jit::KernelPool::Instance().template Get>(d); + const auto& vadd_d = + jit::KernelPool::Instance().template Get>(d); + + float* x_data = x.data(); + float* xref_data = xref.data(); + const float* ct_1_data = ct_1.data(); + float* ct_tgt_data = ct_tgt.data(); + float* ht_tgt_data = ht_tgt.data(); + float* ct_ref_data = ct_ref.data(); + float* ht_ref_data = ht_ref.data(); + // compute once to check correctness + lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, + ct_ref_data, ht_ref_data); + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); + EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3); + } + + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + lstm_ctht_better(vsigmoid_3d, vtanh_d, vmul_d, vadd_d, d, xref_data, + ct_1_data, ct_ref_data, ht_ref_data); + } + auto tmkle = GetCurrentUS(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + lstm_ctht_ref(vsigmoid_3d, 
vtanh_d, vexp_1, d, xref_data, ct_1_data, + ct_ref_data, ht_ref_data); + } + auto trefe = GetCurrentUS(); + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + } + auto ttgte = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat; + } +} + +void vscal_ref(const int n, const float a, const float* x, float* y) { + for (int i = 0; i < n; ++i) { + y[i] = a * x[i]; + } +} +void vscal_inp_ref(const int n, const float a, float* x) { + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} +#if defined __AVX__ || defined __AVX2__ +void vscal_intri8(const int n, const float a, const float* x, float* y) { + __m256 tmp; + __m256 scalar = _mm256_set1_ps(a); + tmp = _mm256_loadu_ps(x); + tmp = _mm256_mul_ps(tmp, scalar); + _mm256_storeu_ps(y, tmp); +} +void vscal_inp_intri8(const int n, const float a, float* x) { + __m256 tmp; + __m256 scalar = _mm256_set1_ps(a); + tmp = _mm256_loadu_ps(x); + tmp = _mm256_mul_ps(tmp, scalar); + _mm256_storeu_ps(x, tmp); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vscal_inp_mkl(const int n, const float a, float* x) { + paddle::platform::dynload::cblas_sscal(n, a, x, 1); +} +#endif + +TEST(JitKernel, vscal) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + std::memcpy(y.data(), x.data(), sizeof(float) * d); + float a = 2.f; + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_ref(d, a, x_data, zref_data); + } + auto trefe = GetCurrentUS(); + auto trefs1 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_ref(d, a, y_data); + } + auto trefe1 = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_mkl(d, a, y_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_intri8(d, a, x_data, zref_data); + } + auto si1 = GetCurrentUS(); + auto si2 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vscal_inp_intri8(d, a, y_data); + } + auto si3 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat + << " us, inplace: " << (si3 - si2) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, x_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + auto ttgts1 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(a, y_data); + } + auto ttgte1 = GetCurrentUS(); + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, inplace takes: " << (trefe1 - trefs1) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat + << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vmul_ref(const int n, const float* x, const float* y, float* z) { + 
for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vmul_intri8(const int n, const float* x, const float* y, float* z) { + __m256 tmpx, tmpy; + tmpx = _mm256_loadu_ps(x); + tmpy = _mm256_loadu_ps(y); + tmpx = _mm256_mul_ps(tmpx, tmpy); + _mm256_storeu_ps(z, tmpx); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vmul_mkl(const int n, const float* x, const float* y, float* z) { + paddle::platform::dynload::vsMul(n, x, y, z); +} +#endif + +TEST(JitKernel, vmul) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_ref(d, x_data, y_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_mkl(d, x_data, y_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vmul_intri8(d, x_data, y_data, zref_data); + } + auto si1 = GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +void vadd_ref(const int n, const float* x, const float* y, float* z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +#if defined __AVX__ || defined __AVX2__ +void vadd_intri8(const int n, const float* x, const float* y, float* z) { + __m256 tmpx, tmpy; + tmpx = _mm256_loadu_ps(x); + tmpy = _mm256_loadu_ps(y); + tmpx = _mm256_add_ps(tmpx, tmpy); + _mm256_storeu_ps(z, tmpx); +} +#endif + +#ifdef PADDLE_WITH_MKLML +void vadd_mkl(const int n, const float* x, const float* y, float* z) { + paddle::platform::dynload::vsAdd(n, x, y, z); +} +#endif + +TEST(JitKernel, vadd) { + namespace jit = paddle::operators::math::jitkernel; + for (int d : {7, 8, 15, 16, 30, 256, 512}) { + std::vector x(d), y(d); + std::vector zref(d), ztgt(d); + RandomVec(d, x.data()); + RandomVec(d, y.data()); + const auto& ker = + jit::KernelPool::Instance().template Get>(d); + const float* x_data = x.data(); + const float* y_data = y.data(); + float* ztgt_data = ztgt.data(); + float* zref_data = zref.data(); + auto trefs = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_ref(d, x_data, y_data, zref_data); + } + auto trefe = GetCurrentUS(); + +#ifdef PADDLE_WITH_MKLML + auto tmkls = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_mkl(d, x_data, y_data, zref_data); + } + auto tmkle = GetCurrentUS(); +#endif + +#if defined __AVX__ || defined __AVX2__ + if (d == 8) { + auto si0 = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + vadd_intri8(d, x_data, y_data, zref_data); + } + auto si1 = 
GetCurrentUS(); + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + } +#endif + + auto ttgts = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ker->Compute(x_data, y_data, ztgt_data); + } + auto ttgte = GetCurrentUS(); + + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat +#ifdef PADDLE_WITH_MKLML + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " +#else + << " us, " +#endif + << "tgt takes: " << (ttgte - ttgts) / repeat; + for (int i = 0; i < d; ++i) { + EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); + } + } +} + +TEST(JitKernel, pool) { + namespace jit = paddle::operators::math::jitkernel; + const int frame_size = 4; + std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const auto& plstm1 = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, false); + const auto& plstm2 = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, false); + const auto& peephole = + jit::KernelPool::Instance() + .template Get, const std::string&, + const std::string&, const std::string&>( + act_gate, act_cand, act_cell, frame_size, true); + EXPECT_TRUE(plstm1 != peephole); + + const auto& pvmul_f = + jit::KernelPool::Instance().template Get>(4); + EXPECT_TRUE(std::dynamic_pointer_cast(plstm2) != + std::dynamic_pointer_cast(pvmul_f)); + + const auto& pvmul_d = + jit::KernelPool::Instance().template Get>(4); + EXPECT_TRUE(std::dynamic_pointer_cast(pvmul_f) != + std::dynamic_pointer_cast(pvmul_d)); + + const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulf4"); + EXPECT_EQ(pvmul_f, pvmul_from_key); + const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulf5"); + EXPECT_TRUE(pvmul_from_key2 == nullptr); +} diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 6810c24227ed074d4ba747399348aa12f55467a0..08f57dd45ad76946cbcafb98a3414003ed9d67a9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { @@ -245,40 +247,42 @@ struct MergeAdd { const framework::SelectedRows& input, framework::SelectedRows* output) { framework::SelectedRows& out = *output; - auto input_rows = input.rows(); - std::vector merge_rows; - merge_rows.reserve(input_rows.size()); - std::unordered_map rows_pos_map; - rows_pos_map.reserve(input_rows.size()); - size_t idx = 0u; - for (std::vector::iterator iter = input_rows.begin(); - iter != input_rows.end(); ++iter) { - if (rows_pos_map.find(*iter) == rows_pos_map.end()) { - rows_pos_map[*iter] = idx++; - merge_rows.emplace_back(*iter); - } + std::vector input_rows(input.rows()); + + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); } - auto input_width = input.value().dims()[1]; - out.set_rows(merge_rows); + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; out.set_height(input.height()); - out.mutable_value()->mutable_data( + + T* out_data = out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); - - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); - - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_pos_map[input_rows[i]]; - for (int64_t j = 0; j < input_width; j++) { - out_data[out_i * input_width + j] += input_data[i * input_width + j]; + const T* in_data = input.value().data(); + + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + std::memcpy(out_ptr, in_data + rows[0] * input_width, + sizeof(T) * input_width); + + for (size_t i = 1; i < rows.size(); ++i) { + auto* in_ptr = in_data + rows[i] * input_width; + for (int64_t j = 0; j < input_width; ++j) { + out_ptr[j] += in_ptr[j]; + } } } + + out.set_rows(merge_rows); } }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 3d99c9b3f2303c941a150f77516703b8103c6c25..900be86f91c6658a5265189a6745316c6471209e 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" @@ -97,41 +98,39 @@ struct MergeAdd { const framework::SelectedRows& input, framework::SelectedRows* output) { framework::SelectedRows& out = *output; - auto input_rows = input.rows(); - std::vector merge_rows; - merge_rows.reserve(input_rows.size()); - std::unordered_map rows_pos_map; - rows_pos_map.reserve(input_rows.size()); - size_t idx = 0u; - for (std::vector::iterator iter = input_rows.begin(); - iter != input_rows.end(); ++iter) { - if (rows_pos_map.find(*iter) == rows_pos_map.end()) { - rows_pos_map[*iter] = idx++; - merge_rows.emplace_back(*iter); - } + std::vector input_rows(input.rows()); + + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); } - auto input_width = input.value().dims()[1]; - out.set_rows(merge_rows); + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; out.set_height(input.height()); - out.mutable_value()->mutable_data( + + auto* out_data = out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); - - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* in_data = input.value().data(); auto blas = GetBlas(context); - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_pos_map[input_rows[i]]; - float* y = out_data + out_i * input_width; - const float* x = input_data + i * input_width; - blas.AXPY(input_width, 1., x, y); + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr); + + for (size_t i = 1; i < rows.size(); ++i) { + blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr); + } } + + out.set_rows(merge_rows); } }; @@ -148,41 +147,39 @@ struct MergeAdd { const framework::SelectedRows& input, framework::SelectedRows* output) { framework::SelectedRows& out = *output; - auto input_rows = input.rows(); - std::vector merge_rows; - merge_rows.reserve(input_rows.size()); - std::unordered_map rows_pos_map; - rows_pos_map.reserve(input_rows.size()); - size_t idx = 0u; - for (std::vector::iterator iter = input_rows.begin(); - iter != input_rows.end(); ++iter) { - if (rows_pos_map.find(*iter) == rows_pos_map.end()) { - rows_pos_map[*iter] = idx++; - merge_rows.emplace_back(*iter); - } + std::vector input_rows(input.rows()); + + std::map> merge_row_map; + for (size_t i = 0; i < input_rows.size(); ++i) { + merge_row_map[input_rows[i]].push_back(i); } - auto input_width = input.value().dims()[1]; - out.set_rows(merge_rows); + std::vector merge_rows(merge_row_map.size()); + size_t idx = 0; + int64_t input_width = input.value().dims()[1]; out.set_height(input.height()); - out.mutable_value()->mutable_data( + + auto* out_data = out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - - math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); - - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* in_data = input.value().data(); auto blas = GetBlas(context); - for (size_t i 
= 0; i < input_rows.size(); i++) { - size_t out_i = rows_pos_map[input_rows[i]]; - double* y = out_data + out_i * input_width; - const double* x = input_data + i * input_width; - blas.AXPY(input_width, 1., x, y); + for (auto& row_pair : merge_row_map) { + auto* out_ptr = out_data + idx * input_width; + auto& rows = row_pair.second; + merge_rows[idx] = row_pair.first; + ++idx; + // rows.size() is always larger than 0 + blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr); + + for (size_t i = 1; i < rows.size(); ++i) { + blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr); + } } + + out.set_rows(merge_rows); } }; diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 97c36a83fc5eff421725d05f66fca05f5169d1bb..ab25628d45699dbcfc1fc5792958bae9e42e72a3 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { } }; +class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + framework::BlockDesc *sub_block = + boost::get(op_desc.GetAttr(kParallelBlock)); + for (auto &out_vars : op_desc.Outputs()) { + for (auto &out_var : out_vars.second) { + auto &var = block->FindRecursiveOrCreateVar(out_var); + auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); + if (sub_var.GetType() != var.GetType()) { + var.SetType(sub_var.GetType()); + } + } + } + } +}; + } // namespace operators } // namespace paddle @@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, paddle::operators::ParallelDoOpProtoMaker, paddle::operators::ParallelDoGradOpDescMaker); REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference); + paddle::operators::ParallelDoGradOpShapeInference, + paddle::operators::ParallelDoGradOpVarTypeInference); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 28cc91a5ed5d74994e5b960a0a4dd3c6a5e6cdcc..51b980acb5a08d431d96a3a92479dec09119c27e 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -31,8 +31,8 @@ class BlockingQueue { // is a workaround and a simplified version of framework::Channel as it // doesn't support GPU and it implements on buffered blocking queue. 
public: - explicit BlockingQueue(size_t capacity) - : capacity_(capacity), closed_(false) { + explicit BlockingQueue(size_t capacity, bool speed_test_mode = false) + : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) { PADDLE_ENFORCE_GT( capacity_, 0, "The capacity of a reader::BlockingQueue must be greater than 0."); @@ -72,7 +72,9 @@ class BlockingQueue { if (!queue_.empty()) { PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); - queue_.pop_front(); + if (LIKELY(!speed_test_mode_)) { + queue_.pop_front(); + } send_cv_.notify_one(); return true; } else { @@ -114,6 +116,7 @@ class BlockingQueue { private: size_t capacity_; + bool speed_test_mode_; bool closed_; std::deque queue_; diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 4f7cfc24ec035349f3c85e84d876ad9b5b5493a6..3f041ff7e4e32b407729a22aab25d3aab199fee0 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -33,8 +33,9 @@ class LoDTensorBlockingQueue { private: LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims) - : queue_(capacity), dims_(dims) {} + const std::vector& dims, + bool speed_test_mode = false) + : queue_(capacity, speed_test_mode), dims_(dims) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -69,11 +70,12 @@ class LoDTensorBlockingQueue { class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims) { + void InitOnce(size_t capacity, const std::vector& dims, + bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 7d1b381d56c8cdc1e79e594b18c1a1ed59ab5284..bd7ac64b2fce2452744e4756b149ee7f291d38aa 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) { q.Receive(&b); EXPECT_EQ(a.val_, b.val_); } + +TEST(BlockingQueue, speed_test_mode) { + size_t queue_size = 10; + BlockingQueue q1(queue_size, false); + for (size_t i = 0; i < queue_size; ++i) { + q1.Send(i); + } + size_t b; + for (size_t i = 0; i < queue_size; ++i) { + q1.Receive(&b); + EXPECT_EQ(b, i); + } + EXPECT_EQ(q1.Size(), 0); + + BlockingQueue q2(queue_size, true); + for (size_t i = 0; i < queue_size; ++i) { + q2.Send(i); + } + for (size_t i = 0; i < queue_size; ++i) { + q2.Receive(&b); + EXPECT_EQ(b, 0); + } + EXPECT_EQ(q2.Size(), queue_size); +} diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index d72f85f2c44db2fa887732cfc05e1376a6a79e4a..500d86fec33830fc2cfb0412f1f2c7780d08eb02 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. 3. 
Input(Shape) has a higher priority than Attr(shape) if it is provided, while -Attr(shape) still should be set correctly to gurantee shape inference in +Attr(shape) still should be set correctly to guarantee shape inference in compile-time. )DOC"); @@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp { : ReshapeOp(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - ReshapeOp::InferShape(ctx); PADDLE_ENFORCE(ctx->HasOutput("XShape"), "Output(XShape) of ReshapeOp should not be null."); const auto &x_dims = ctx->GetInputDim("X"); @@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp { } ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); } }; diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index a04d1bd2ca5128714d10155a9679d921519cfe07..797cd45fdcdbd5c3567d1676f37e148304ee6e2d 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -13,72 +13,254 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/algorithm.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; template using EigenVector = framework::EigenVector; +template +struct DenseRmspropGradFunctor { + inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; } + + const T *grad_; +}; + +template +struct SparseRmspropGradFunctor { + inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows, + int64_t row_numel, int64_t row_count) + : grad_(grad), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { + auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_); + return row_idx >= 0 ?
grad_[row_idx * row_numel_ + idx % row_numel_] : 0; + } + + const T *grad_; + const int64_t *rows_; + int64_t row_numel_; + int64_t row_count_; +}; + +template +struct UncenteredRmspropFunctor { + UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho, + T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + } + + T *param_; + T *ms_; + T *mom_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + +template +struct CenteredRmspropFunctor { + CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr, + T rho, T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + mean_grad_(mean_grad), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g; + T mom_out = momentum_ * mom_[idx] + + lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + mean_grad_[idx] = mg_out; + } + + T *param_; + T *ms_; + T *mom_; + T *mean_grad_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + template class RmspropOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE(param_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); - - auto* param_out = ctx.Output("ParamOut"); - auto* moment_out = ctx.Output("MomentOut"); - auto* mean_square_out = ctx.Output("MeanSquareOut"); - - auto grad = ctx.Input("Grad"); - - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); - - float epsilon = ctx.Attr("epsilon"); - float rho = ctx.Attr("decay"); - float momentum = ctx.Attr("momentum"); + void Compute(const framework::ExecutionContext &ctx) const override { + using LoDTensor = framework::LoDTensor; + auto *grad_var = ctx.InputVar("Grad"); + auto *param_out = ctx.Output("ParamOut"); + auto *moment_out = ctx.Output("MomentOut"); + auto *mean_square_out = ctx.Output("MeanSquareOut"); + + auto epsilon = static_cast(ctx.Attr("epsilon")); + auto rho = static_cast(ctx.Attr("decay")); + auto momentum = static_cast(ctx.Attr("momentum")); bool centered = ctx.Attr("centered"); - auto p = EigenVector::Flatten(*ctx.Input("Param")); - auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); - auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); - auto g = EigenVector::Flatten(*grad); - auto mom = EigenVector::Flatten(*ctx.Input("Moment")); - - auto p_out = EigenVector::Flatten(*param_out); - auto mom_out = EigenVector::Flatten(*moment_out); - auto ms_out = EigenVector::Flatten(*mean_square_out); - auto& place = *ctx.template device_context().eigen_device(); - 
- Eigen::DSizes grad_dsize(static_cast(grad->numel())); - - ms_out.device(place) = rho * ms + (1 - rho) * g * g; - if (centered) { - auto mg = EigenVector::Flatten(*ctx.Input("MeanGrad")); - auto* mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - auto mg_out = EigenVector::Flatten(*mean_grad_out); - - mg_out.device(place) = rho * mg + (1 - rho) * g; - mom_out.device(place) = momentum * mom + - lr.broadcast(grad_dsize) * g / - (ms_out - mg_out.square() + epsilon).sqrt(); + auto &p_tensor = *ctx.Input("Param"); + auto &ms_tensor = *ctx.Input("MeanSquare"); + auto &lr_tensor = *ctx.Input("LearningRate"); + auto &mom_tensor = *ctx.Input("Moment"); + + PADDLE_ENFORCE_EQ(&p_tensor, param_out, + "Param and ParamOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&mom_tensor, moment_out, + "Moment and MomentOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out, + "MeanSquare and MeanSquareOut must be the same Tensor"); + + auto &dev_ctx = ctx.template device_context(); + size_t limit = static_cast(ms_tensor.numel()); + + if (grad_var->IsType()) { + auto &grad_tensor = grad_var->Get(); + + if (std::is_same::value) { + auto &place = + *ctx.template device_context().eigen_device(); + auto lr_value = lr_tensor.data()[0]; + + auto p = EigenVector::Flatten(p_tensor); + auto ms = EigenVector::Flatten(ms_tensor); + auto g = EigenVector::Flatten(grad_tensor); + auto mom = EigenVector::Flatten(mom_tensor); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = EigenVector::Flatten(*mean_square_out); + + ms_out.device(place) = rho * ms + (1 - rho) * g * g; + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto mg = EigenVector::Flatten(mg_tensor); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + auto mg_out = EigenVector::Flatten(*mean_grad_out); + + mg_out.device(place) = rho * mg + (1 - rho) * g; + mom_out.device(place) = + momentum * mom + + lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt(); + } else { + mom_out.device(place) = + momentum * mom + lr_value * g / (ms_out + epsilon).sqrt(); + } + p_out.device(place) = p - mom_out; + } else { + DenseRmspropGradFunctor grad_func(grad_tensor.data()); + platform::ForRange for_range(dev_ctx, limit); + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), + lr_tensor.data(), rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } + } + } else if (grad_var->IsType()) { + auto &grad = grad_var->Get(); + auto *merged_grad = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + + math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, merged_grad); + + platform::ForRange for_range(dev_ctx, limit); + const int64_t *rows; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = 
merged_grad->rows().CUDAData(ctx.GetPlace()); + } else { +#endif + rows = merged_grad->rows().data(); +#ifdef PADDLE_WITH_CUDA + } +#endif + auto &merged_tensor = merged_grad->value(); + int64_t row_count = merged_grad->rows().size(); + int64_t row_numel = merged_tensor.numel() / row_count; + SparseRmspropGradFunctor grad_func(merged_tensor.data(), rows, + row_numel, row_count); + + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } } else { - mom_out.device(place) = - momentum * mom + - lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient"); } - p_out.device(place) = p - mom_out; } }; diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 397a3182953e3f1afaeadeff6d53a4f22fb95d26..3234b60861da3d0c6a8434eb11fd0488a95e171f 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, paddle::framework::DefaultGradOpDescMaker); template using Kernel = op::SeqConcatKernel; -REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel); +REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, + Kernel); + REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, op::SeqConcatGradShapeInferer); template using GradKernel = op::SeqConcatGradKernel; REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel, - GradKernel); + GradKernel, GradKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_unpad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3a0762b9a4d3e080d5d6d10b249e0bd81980b95 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace paddle { +namespace operators { + +class SequenceUnpadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Length"), + "Input(Length) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceUnpadOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The rank of Input(X) can't be less than 2."); + + auto len_dims = ctx->GetInputDim("Length"); + PADDLE_ENFORCE(len_dims.size() == 2 && len_dims[1] == 1, + "The shape of Input(Length) should be [batch_size, 1]."); + PADDLE_ENFORCE( + len_dims[0] == x_dims[0], + "Input(X) and Input(Length) should have the same first dimension."); + + int64_t out_dim_0 = -1; + if (ctx->IsRuntime()) { + out_dim_0 = x_dims[0] * x_dims[1]; + } + + std::vector out_dims_vec{out_dim_0}; + if (x_dims.size() == 2) { + out_dims_vec.push_back(1); + } else { + for (size_t i = 2; i < x_dims.size(); ++i) { + out_dims_vec.push_back(x_dims[i]); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor) Input tensor which " + "contains the padded sequences with equal length."); + AddInput("Length", + "(LoDTensor) The input tensor which specifies the actual length of " + "sequences after unpadding."); + AddOutput( + "Out", + "(LoDTensor) The output tensor which contains unpadded sequences."); + AddComment(R"DOC( + Sequence Unpad Operator + + This operator removes the padding data in the input sequences and converts + them into sequences with actual length as output, identified by lod + information.
+ + Example: + + Given input tensor Input(X): + X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], + + in which there are 3 sequences padded to length 5, and the actual length + specified by Input(Length): + + Length.data = [[2], [3], [4]], + + after unpadding, Output(Out) will be: + + Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + Out.lod = [[0, 2, 5, 9]] + + )DOC"); + } +}; + +class SequenceUnpadGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceUnpadGradOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequenceUnpadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, + ops::SequenceUnpadOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_unpad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..75248372237ec2cb23122f6b16e64f6ce750ebf9 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/sequence_unpad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_unpad_grad, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_unpad_op.h b/paddle/fluid/operators/sequence_unpad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ebe3118b985bdfd41ca55e8c572047aa87502ff4 --- /dev/null +++ b/paddle/fluid/operators/sequence_unpad_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class SequenceUnpadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x_t = ctx.Input("X"); + auto* len_t = ctx.Input("Length"); + auto* out_t = ctx.Output("Out"); + out_t->mutable_data(ctx.GetPlace()); + + const int64_t* seq_len_ptr = nullptr; + if (platform::is_gpu_place(ctx.GetPlace())) { + LoDTensor seq_len_cpu; + seq_len_cpu.Resize(len_t->dims()); + seq_len_ptr = seq_len_cpu.mutable_data(platform::CPUPlace()); + framework::TensorCopy(*len_t, platform::CPUPlace(), + ctx.template device_context(), + &seq_len_cpu); + } else { + seq_len_ptr = len_t->data(); + } + + size_t batch_size = x_t->dims()[0]; + std::vector out_lod0(batch_size + 1, 0); + for (size_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; + } + + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out_t->set_lod(out_lod); + + std::vector out_dims_vec{static_cast(out_lod0.back())}; + if (x_t->dims().size() == 2) { + out_dims_vec.push_back(1); + } else { + for (size_t i = 2; i < x_t->dims().size(); ++i) { + out_dims_vec.push_back(x_t->dims()[i]); + } + } + out_t->Resize(framework::make_ddim(out_dims_vec)); + + int64_t padded_length = x_t->dims()[1]; + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *x_t, out_t, + padded_length, 0, false, math::kBatchLengthWidth); + } +}; + +template +class SequenceUnpadGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + if (d_x) { + const auto* d_out = ctx.Input(framework::GradVarName("Out")); + const auto* x_t = ctx.Input("X"); + d_x->mutable_data(ctx.GetPlace()); + + int padded_length = x_t->dims()[1]; + + LoDTensor zero_pads; + zero_pads.Resize({1, 1}); 
+ zero_pads.mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, &zero_pads, static_cast(0)); + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *d_out, d_x, zero_pads, + padded_length, 0, false, math::kBatchLengthWidth); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 2880c09263f10e9c624e11b77188171f48d9db28..b5f472d20f40fa182a4aa55ff384b0954e4ba9e3 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) { return cpu.has(Cpu::tAVX); case avx2: return cpu.has(Cpu::tAVX2); - case avx512_common: + case avx512f: return cpu.has(Cpu::tAVX512F); case avx512_core: return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 30c8fbcfce92a8b06a175ddf198cde572f72b2a4..6810a1651a14cdb2080af846b21cad242b70bf35 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -43,7 +43,7 @@ typedef enum { sse42, avx, avx2, - avx512_common, + avx512f, avx512_core, avx512_core_vnni, avx512_mic, diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f04395a8ac00f33501008aa12f22773ddda9b138..a251bfcd9914422cb6300adbbcdef3dfa79f441c 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -130,6 +130,13 @@ struct EOFException : public std::exception { #define UNLIKELY(condition) (condition == 0) #endif +#if !defined(_WIN32) +#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) +#else +// there is no equivalent intrinsics in msvc. +#define LIKELY(condition) (condition != 0) +#endif + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 4c99f4be321160caf0ee2f89a655bdfb933408e3..ab91ca5345047f3053eb8771e6a265d2a3011f85 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { platform::SetNumThreads(FLAGS_paddle_num_threads); #endif - if (platform::jit::MayIUse(platform::jit::avx512_common)) { + if (platform::jit::MayIUse(platform::jit::avx512f)) { #ifndef __AVX512F__ LOG(WARNING) << "AVX512F is available, Please re-compile on local machine"; #endif diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 612f3bc0e7940663f84a55b2c4395a7b5119d5bb..a35147da90e87af85308431fd7dbe965bb1fd1d7 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -370,8 +370,8 @@ void ParseEvents(const std::vector>& events, std::vector> merged_events_list; if (merge_thread) { std::vector merged_events; - for (int i = 0; i < events.size(); ++i) { - for (int j = 0; j < events[i].size(); ++j) { + for (size_t i = 0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { merged_events.push_back(events[i][j]); } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1709546fae288e84ce2b6fef4bfd1bd68da84b1d..339a7c98c6a2bba2cd46790cecc169ef447c63ce 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -57,6 +57,10 @@ limitations under the License. 
*/ #include "pybind11/stl.h" +DEFINE_bool(reader_queue_speed_test_mode, false, + "If set true, the queue.pop will only get data from queue but not " + "remove the data from queue for speed testing"); + // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); @@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle. return make_ddim(shape); }); auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims); + holder->InitOnce(capacity, dims, + FLAGS_reader_queue_speed_test_mode); return holder->GetQueue(); }, py::return_value_policy::copy); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7bbdf7de89cc932e0023952e3c8e102f92b06855..41678918b8bb54078091f892ce7a519dfc8a0014 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -113,7 +113,8 @@ def __bootstrap__(): 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb' + 'dist_threadpool_size', 'cpu_deterministic', 'eager_delete_tensor_gb', + 'reader_queue_speed_test_mode' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 43aa4a9e7c4ffe96a8f1346cd1bc3e4f309ba309..224781e6596e2b2bc6d3084f4780a33ed2903118 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -56,6 +56,7 @@ __all__ = [ 'sequence_expand', 'sequence_expand_as', 'sequence_pad', + 'sequence_unpad', 'lstm_unit', 'reduce_sum', 'reduce_mean', @@ -64,6 +65,7 @@ __all__ = [ 'reduce_prod', 'sequence_first_step', 'sequence_last_step', + 'sequence_slice', 'dropout', 'split', 'ctc_greedy_decoder', @@ -1902,6 +1904,76 @@ def sequence_last_step(input): return sequence_pool(input=input, pool_type="last") +def sequence_slice(input, offset, length, name=None): + """ + **Sequence Slice Layer** + + The layer crops a subsequence from given sequence with given start + offset and subsequence length. + + It only supports sequence data (LoDTensor with lod_level equal to 1). + + .. code-block:: text + + - Case: + + Given the input Variable **input**: + + input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]], + input.lod = [[3, 2]], + input.dims = (5, 2), + + with offset.data = [[0], [1]] and length.data = [[2], [1]], + + the output Variable will be + + out.data = [[a1, a2], [b1, b2], [e1, e2]], + out.lod = [[2, 1]], + out.dims = (3, 2). + + NOTE: The first dimension size of **input**, **offset** and **length** + should be equal. The **offset** should start from 0. + + Args: + input(Variable): The input Variable which consists of the complete + sequences. + offset(Variable): The offset to slice each sequence. + length(Variable): The length of each subsequence. + name(str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The output subsequences. + + Examples: + + .. 
code-block:: python + + import numpy as np + seqs = fluid.layers.data(name='x', shape=[10, 5], + dtype='float32', lod_level=1) + offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32")) + length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32")) + subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, + length=length) + """ + helper = LayerHelper("sequence_slice", **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + + offset.stop_gradient = True + length.stop_gradient = True + + helper.append_op( + type="sequence_slice", + inputs={"X": input, + "Offset": offset, + "Length": length}, + outputs={"Out": out}) + + return out + + @templatedoc() def pool2d(input, pool_size=-1, @@ -2793,7 +2865,7 @@ def sequence_expand_as(x, y, name=None): @templatedoc() -def sequence_pad(x, pad_value, maxlen=None): +def sequence_pad(x, pad_value, maxlen=None, name=None): """ ${comment} @@ -2807,7 +2879,9 @@ def sequence_pad(x, pad_value, maxlen=None): None or any positive int. When it is None, all sequences will be padded up to the length of the longest one among them; when it a certain positive value, it must be greater than the length of the - longest original sequence." + longest original sequence. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: Variable: The padded sequence batch and the original lengths before @@ -2844,6 +2918,66 @@ def sequence_pad(x, pad_value, maxlen=None): return out, length + +def sequence_unpad(x, length, name=None): + """ + **Sequence Unpad Layer** + + This layer removes the padding data in the input sequences and converts + them into sequences with actual length as output, identified by lod + information. + + .. code-block:: text + + Example: + + Given input Variable **x**: + x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], + [ 6.0, 7.0, 8.0, 9.0, 10.0], + [11.0, 12.0, 13.0, 14.0, 15.0]], + + in which there are 3 sequences padded to length 5, and the actual length + specified by input Variable **length**: + + length.data = [[2], [3], [4]], + + after unpadding, the output Variable will be: + + out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] + out.lod = [[2, 3, 4]] + + Args: + x(Variable): Input Variable which contains the padded sequences with + equal length. + length(Variable): The Variable that specifies the actual length of + sequences after unpadding. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The Variable which contains the unpadded sequences. + + Examples: + ..
code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32') + len = fluid.layers.data(name='length', shape=[1], dtype='int64') + out = fluid.layers.sequence_unpad(x=x, length=len) + """ + + helper = LayerHelper('sequence_unpad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + + length.stop_gradient = True + + helper.append_op( + type='sequence_unpad', + inputs={'X': x, + 'Length': length}, + outputs={'Out': out}) + return out + + def beam_search(pre_ids, pre_scores, ids, diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index 6456d1b53a129db04ace7ff4413a3d76e922ccde..fac5e037a46715d146e354825f09ee8ccc4f3d70 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -81,7 +81,10 @@ def get_optimizer(): return optimizer -def train_network(batch_size, is_distributed=False, is_sparse=False): +def train_network(batch_size, + is_distributed=False, + is_sparse=False, + is_self_contained_lr=False): # query q = fluid.layers.data( name="query_ids", shape=[1], dtype="int64", lod_level=1) @@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') @@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') @@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False): param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr), + learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__"), is_sparse=is_sparse) ## vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') @@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): def get_model(self, batch_size=2): # Train program avg_cost, acc, predict = \ - train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"]))) + train_network(batch_size, + bool(int(os.environ["IS_DISTRIBUTED"])), + bool(int(os.environ["IS_SPARSE"])), + bool(int(os.environ["IS_SELF_CONTAINED_LR"]))) inference_program = fluid.default_main_program().clone() diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py index e971f29db42a7c1a2394505a8ece3d2fd6b347e9..11095f23591edc41a82962149a52096fa17cfb93 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', 
"IS_SPARSE": '0'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=1e-5, @@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=100, @@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=1e-5, @@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): self._enforce_place = "CPU" def test_simnet_bow(self): - need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } self.check_with_place( "dist_simnet_bow.py", delta=100, @@ -75,5 +91,59 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): need_envs=need_envs) +class TestDistSimnetBow2x2LookupTableSync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2LookupTableAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '1' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def test_simnet_bow(self): + need_envs = { + "IS_DISTRIBUTED": '1', + "IS_SPARSE": '1', + 'IS_SELF_CONTAINED_LR': '0' + } + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1d8d0b55f0c5d7cffa01a100847bdf48b6d7023d..dc70477ebe1cfbffd207ebb4bbf9d9f39893d79e 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -194,6 +194,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1)) print(str(program)) + def test_sequence_unpad(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[10, 5], dtype='float32') + length = layers.data(name='length', shape=[1], dtype='int64') + self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) + print(str(program)) + def test_lstm_unit(self): program = Program() with program_guard(program): @@ -406,6 +414,19 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_sequence_slice(self): + program = Program() + with program_guard(program): + import numpy as np + seqs = layers.data( + name='x', 
shape=[10, 5], dtype='float32', lod_level=1) + offset = layers.assign(input=np.array([[0, 1]]).astype('int32')) + length = layers.assign(input=np.array([[2, 1]]).astype('int32')) + out = layers.sequence_slice( + input=seqs, offset=offset, length=length) + self.assertIsNotNone(out) + print(str(program)) + def test_lod_reset(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 70848e4e2239e2be160bb0c1a28a5aecd01a87dc..eb12bc741767340a3e7e3580a8b95065d4267693 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -19,33 +19,76 @@ import unittest import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator +import paddle.fluid as fluid + + +def create_selected_rows_and_tensor(scope, place, height, row_num, + embedding_size): + sr = scope.var("@selected_rows@").get_selected_rows() + tensor = scope.var("grad").get_tensor() + + rows = np.random.random_integers( + low=0, high=height - 1, size=[row_num, ]).astype('int64') + sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32') + + sr.set_height(height) + sr.set_rows(rows) + sr.get_tensor().set(sr_val, place) + + tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32') + for i in range(row_num): + row = rows[i] + tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :] + + tensor.set(tensor_val, place) + return tensor_val, sr_val class TestBase(unittest.TestCase): - def setup(self, centered, epsilon=1e-6): + def setup(self, + place, + is_sparse, + centered, + size, + row_num=None, + epsilon=1e-6): np.random.seed(5) # fix seed + self.scope = fluid.global_scope() + self.place = place + self.param_name = "param" - self.param = np.random.random((123, 321)).astype("float32") + self.param = np.random.random(size).astype("float32") self.mean_square_name = "mean_square" - self.mean_square = np.random.random((123, 321)).astype("float32") + self.mean_square = np.random.uniform( + low=1, high=2, size=size).astype("float32") self.mean_grad_name = "mean_grad" - self.mean_grad = np.random.random((123, 321)).astype("float32") + self.mean_grad = np.random.random(size).astype("float32") self.lr_name = "lr" self.learning_rate = np.array([0.01]).astype("float32") self.grad_name = "grad" - self.grad = np.random.random((123, 321)).astype("float32") + + self.is_sparse = is_sparse + if self.is_sparse: + self.grad_sr_name = "@selected_rows@" + self.grad, self.grad_sr = create_selected_rows_and_tensor( + self.scope, place, size[0], row_num, size[1]) + else: + self.grad = np.random.random(size).astype("float32") + grad_tensor = self.scope.var(self.grad_name).get_tensor() + grad_tensor.set(self.grad, place) self.moment_name = "moment" - self.moment = np.zeros((123, 321)).astype("float32") + self.moment = np.random.uniform( + low=0, high=1, size=size).astype("float32") self.epsilon = epsilon self.decay = 0.9 - self.momentum = 0.0 + self.momentum = 0.1 self.centered = centered self.ms_out = self.decay * self.mean_square + (1 - self.decay @@ -61,118 +104,122 @@ class TestBase(unittest.TestCase): self.param_out = self.param - self.moment_out - def check(self, - actual_t, - expect_t, - place, - out_name, - atol=1e-5, - equal_nan=False): - self.assertTrue( - np.allclose( - actual_t, expect_t, atol=atol, equal_nan=equal_nan), - "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " - + str(expect_t) + 
"\n" + "But Got" + str(actual_t)) - - -class TestRmspropOp(TestBase): - def check_with_place(self, place, centered, epsilon): - self.setup(centered, epsilon) - scope = core.Scope() - # create and initialize Param Variable - param = scope.var(self.param_name).get_tensor() - param.set(self.param, place) + self.param_tensor = self.scope.var(self.param_name).get_tensor() + self.param_tensor.set(self.param, place) - mean_square = scope.var(self.mean_square_name).get_tensor() - mean_square.set(self.mean_square, place) + self.mean_square_tensor = self.scope.var( + self.mean_square_name).get_tensor() + self.mean_square_tensor.set(self.mean_square, place) - lr = scope.var(self.lr_name).get_tensor() + lr = self.scope.var(self.lr_name).get_tensor() lr.set(self.learning_rate, place) - grad = scope.var(self.grad_name).get_tensor() - grad.set(self.grad, place) + self.moment_tensor = self.scope.var(self.moment_name).get_tensor() + self.moment_tensor.set(self.moment, place) - moment = scope.var(self.moment_name).get_tensor() - moment.set(self.moment, place) + if self.centered: + self.mean_grad_tensor = self.scope.var( + self.mean_grad_name).get_tensor() + self.mean_grad_tensor.set(self.mean_grad, place) - # create and run sgd operator + def check(self, actual_t, expect_t, place, out_name, atol=1e-5): + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + + str(expect_t) + "\n" + "But Got" + str(actual_t)) - if self.centered: - mean_grad = scope.var(self.mean_grad_name).get_tensor() - mean_grad.set(self.mean_grad, place) - - rmsprop_op = Operator( - "rmsprop", - Param=self.param_name, - Grad=self.grad_name, - MeanSquare=self.mean_square_name, - MeanGrad=self.mean_grad_name, - Moment=self.moment_name, - LearningRate=self.lr_name, - ParamOut=self.param_name, - MeanSquareOut=self.mean_square_name, - MomentOut=self.moment_name, - MeanGradOut=self.mean_grad_name, - epsilon=self.epsilon, - decay=self.decay, - momentum=self.momentum, - centered=True) - else: - rmsprop_op = Operator( - "rmsprop", - Param=self.param_name, - Grad=self.grad_name, - MeanSquare=self.mean_square_name, - Moment=self.moment_name, - LearningRate=self.lr_name, - ParamOut=self.param_name, - MeanSquareOut=self.mean_square_name, - MomentOut=self.moment_name, - epsilon=self.epsilon, - decay=self.decay, - momentum=self.momentum, - centered=False) - - rmsprop_op.run(scope, place) - - atol = 1e-5 - equal_nan = False + +class TestRmspropOp(TestBase): + def check_with_place(self, + place, + is_sparse, + centered, + size, + row_num=None, + epsilon=1e-6): + self.setup(place, is_sparse, centered, size, row_num, epsilon) + self.run_and_check() + + def run_and_check(self): + grad_name = self.grad_sr_name if self.is_sparse else self.grad_name + + kwargs = { + 'Param': self.param_name, + 'Grad': grad_name, + 'MeanSquare': self.mean_square_name, + 'Moment': self.moment_name, + 'LearningRate': self.lr_name, + 'ParamOut': self.param_name, + 'MeanSquareOut': self.mean_square_name, + 'MomentOut': self.moment_name, + 'epsilon': self.epsilon, + 'decay': self.decay, + 'momentum': self.momentum, + 'centered': self.centered + } if self.centered: - atol = 1e-3 - equal_nan = True + kwargs['MeanGrad'] = self.mean_grad_name + kwargs['MeanGradOut'] = self.mean_grad_name + + rmsprop_op = Operator('rmsprop', **kwargs) + atol = 1e-6 + + rmsprop_op.run(self.scope, self.place) self.check( - np.array(mean_square), self.ms_out, place, self.mean_square_name) + np.array(self.mean_square_tensor), 
+ self.ms_out, + self.place, + self.mean_square_name, + atol=atol) self.check( - np.array(moment), + np.array(self.moment_tensor), self.moment_out, - place, + self.place, self.moment_name, - atol=atol, - equal_nan=equal_nan) + atol=atol) self.check( - np.array(param), + np.array(self.param_tensor), self.param_out, - place, + self.place, self.param_name, - atol=atol, - equal_nan=equal_nan) + atol=atol) if self.centered: self.check( - np.array(mean_grad), self.mg_out, place, self.mean_grad_name) + np.array(self.mean_grad_tensor), self.mg_out, self.place, + self.mean_grad_name) def test_rmsprop(self): places = [core.CPUPlace()] if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) + + size = (128, 320) for place in places: - self.check_with_place(place, False, 1e-6) - self.check_with_place(place, False, 1e-10) - self.check_with_place(place, True, 1e-6) - self.check_with_place(place, True, 1e-10) + for centered in [False, True]: + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, is_sparse=False, centered=centered, size=size) + + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, + is_sparse=True, + centered=centered, + row_num=512, + size=size) + + with fluid.scope_guard(core.Scope()): + self.check_with_place( + place, + is_sparse=True, + centered=centered, + row_num=60, + size=size) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..673b0ea180464b8b8f6f5c6e76d5c5c80f347d25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py @@ -0,0 +1,75 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+import six
+import numpy as np
+from op_test import OpTest
+
+
+class TestSequenceUnpadOp(OpTest):
+    def init(self):
+        self.length = [2, 3, 4]
+        self.x_shape = (3, 5)
+        self.dtype = "float32"
+
+    def compute(self):
+        assert len(self.length) == self.x_shape[0]
+        x = np.random.random(self.x_shape).astype(self.dtype)
+        out_lod = [self.length]
+
+        out = x[0, 0:self.length[0]]
+        for i in six.moves.xrange(1, x.shape[0]):
+            out = np.append(out, x[i, 0:self.length[i]], axis=0)
+
+        out_shape = (sum(self.length), )
+        if len(self.x_shape) == 2:
+            out_shape = out_shape + (1, )
+        else:
+            out_shape = out_shape + self.x_shape[2:]
+
+        self.inputs = {
+            'X': x,
+            'Length': np.array(self.length).astype('int64').reshape(-1, 1)
+        }
+        self.outputs = {'Out': (out.reshape(out_shape), out_lod)}
+
+    def setUp(self):
+        self.op_type = 'sequence_unpad'
+        self.init()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestSequenceUnpadOp2(TestSequenceUnpadOp):
+    def init(self):
+        self.length = [2, 3, 4]
+        self.x_shape = (3, 5, 4, 3)
+        self.dtype = "float32"
+
+
+class TestSequenceUnpadOp3(TestSequenceUnpadOp):
+    def init(self):
+        self.length = [5, 2, 3, 4]
+        self.x_shape = (4, 5, 3, 3, 6)
+        self.dtype = "float64"
+
+
+if __name__ == '__main__':
+    unittest.main()
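The compute() helper above mirrors what sequence_unpad is expected to do: keep only the first length[i] time steps of each padded row and concatenate them into one LoD tensor. A small self-contained NumPy illustration (the values are made up for this example):

import numpy as np

# Three sequences padded to a common length of 5.
x = np.arange(15, dtype="float32").reshape(3, 5)
length = np.array([2, 3, 4], dtype="int64")

# Trim each row to its real length and concatenate, as compute() does.
out = np.concatenate([x[i, :length[i]] for i in range(x.shape[0])])

print(out)        # [ 0.  1.  5.  6.  7. 10. 11. 12. 13.]
print(out.shape)  # (9,) == (sum(length),); the 2-D case is reshaped to (9, 1) in the test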
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 91db85b8ec6a32fee3b7aa8ab76429a4a197fcc3..2192139f8d5950286691a77333dd8ec35505b033 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1119,6 +1119,7 @@ to transpile() call.")
     def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
         # 2. add split_ids_op and send_op to send gradient to pservers
+        # there should only be one table_name
         all_ops = program.global_block().ops
         table_grad_name = grad_var_name(self.table_name)
@@ -1143,7 +1144,7 @@ to transpile() call.")
                 if self.sync_mode else []
             },
             attrs={
-                "sync_mode": self.sync_mode,
+                "sync_mode": not self.sync_mode,
                 "epmap": pserver_endpoints,
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
                 OP_ROLE_VAR_ATTR_NAME: [
@@ -1189,7 +1190,15 @@ to transpile() call.")
     def _create_table_optimize_block(self, pserver_index, pserver_program,
                                      pre_block_idx, grad_to_block_id):
         # STEP: create table optimize block
+        table_opt_block = pserver_program._create_block(pre_block_idx)
         # create table param and grad var in pserver program
+        # create table optimize block in pserver program
+        table_opt_op = [
+            op for op in self.optimize_ops
+            if 'Param' in op.input_names and op.input("Param")[0] ==
+            self.table_name
+        ][0]
+
         origin_param_var = self.origin_program.global_block().vars[
             self.table_name]
@@ -1205,19 +1214,16 @@ to transpile() call.")
             dtype=origin_param_var.dtype,
             type=core.VarDesc.VarType.SELECTED_ROWS,
             persistable=True)
+        # parameter must be selected rows
         param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
         grad_var = pserver_program.global_block()._clone_variable(
             self.origin_program.global_block().vars[grad_var_name(
                 self.table_name)])
-        # create table optimize block in pserver program
-        table_opt_op = [
-            op for op in self.optimize_ops
-            if 'Param' in op.input_names and op.input("Param")[0] ==
-            self.table_name
-        ][0]
-        table_opt_block = pserver_program._create_block(pre_block_idx)
+        lr_var = pserver_program.global_block()._clone_variable(
+            self.origin_program.global_block().vars[table_opt_op.input(
+                "LearningRate")[0]])
         if self.sync_mode:
             # create grad vars in pserver program
@@ -1249,8 +1255,6 @@ to transpile() call.")
             grad_var = pserver_program.global_block()._rename_var(
                 origin_grad_name, splited_grad_name)
-        lr_var = pserver_program.global_block().vars[table_opt_op.input(
-            "LearningRate")[0]]
         inputs = {
             "Param": [param_var],
             "Grad": [grad_var],