提交 ac0e0f51 编写于 作者: S sneaxiy

merge develop

test=develop
......@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
avg_loss = train_args[0]
......
......@@ -302,6 +302,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
......@@ -309,6 +310,8 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))
......
......@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......@@ -174,7 +174,7 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
target_link_libraries(executor garbage_collector)
target_link_libraries(executor garbage_collector while_op_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
......
......@@ -61,7 +61,8 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass)
cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
......
......@@ -14,6 +14,7 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
......@@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase {
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
size_t scope_idx);
OperatorBase *GetOp() { return op_.get(); }
std::string Name() const override;
const Scope *GetScope() const { return scope_; }
......
......@@ -12,6 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
......@@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
}
}
#endif
PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty");
}
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
......@@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
void EagerDeletionOpHandle::RunImpl() {
auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
Scope *exec_scope = nullptr;
std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (auto &name : var_names_) {
auto it = ref_cnts_->find(name);
// Var not found, not reference count has not decreased to 0
// Reference count has not decreased to 0
if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
continue;
}
if (!exec_scope) {
exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
}
// Var not found
auto *var = exec_scope->FindVar(name);
if (var == nullptr) {
continue;
......
......@@ -12,20 +12,173 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <functional>
#include <queue>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted.");
namespace paddle {
namespace framework {
namespace details {
// op -> variables which can be deleted after op runs
using OpToVarNameSetMap =
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>;
// Check whether the variable is LoDTensor based on static VarDesc info
static bool IsLoDTensor(VarDesc *var) {
return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
}
// Get memory size of LoDTensor
static int64_t GetMemorySize(
const std::unordered_map<std::string, std::vector<VarHandle *>> &vars,
const std::string &var_name) {
auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
PADDLE_ENFORCE_NOT_NULL(var_desc);
PADDLE_ENFORCE(IsLoDTensor(var_desc));
auto dims = var_desc->GetShape();
return SizeOfType(var_desc->GetDataType()) *
std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
std::multiplies<int64_t>());
}
// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g.
// SelectedRows, LoDTensorArray)
// Since partial GC is based on static analysis of memory size of each variable
// So we should skip SelectedRows and LoDTensorArray here
static void SplitIntoLoDTensorAndNonLoDTensorVars(
const OpToVarNameSetMap &m, const GraphVars &vars,
OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
lod_tensors->clear();
other_vars->clear();
for (auto &op_vars_pair : m) {
for (auto &var_name : op_vars_pair.second) {
auto *var_desc = TryGetLatestVarDesc(
vars[op_vars_pair.first->GetScopeIdx()].at(var_name));
if (IsLoDTensor(var_desc)) {
(*lod_tensors)[op_vars_pair.first].insert(var_name);
} else {
(*other_vars)[op_vars_pair.first].insert(var_name);
}
}
}
}
struct GCVarInfo {
GCVarInfo(const std::string &name, int64_t memory_size,
ComputationOpHandle *op, size_t scope_idx)
: name_(name),
memory_size_(memory_size),
op_(op),
scope_idx_(scope_idx) {}
std::string name_; // variable name
int64_t memory_size_; // memory size
ComputationOpHandle *op_; // op after which the variable could be deleted
size_t scope_idx_; // scope index where the variable locates
int64_t AbsMemorySize() const { return std::abs(memory_size_); }
};
// Delete delete_lod_tensor_only is not used currently
static OpToVarNameSetMap ShrinkGCVars(
const OpToVarNameSetMap &m, const GraphVars &vars,
const std::vector<platform::Place> &places, double fraction_of_memory_size,
bool delete_lod_tensor_only = false) {
// Do not perform gc when fraction_of_memory_size = 0
if (fraction_of_memory_size <= 0.0) return {};
/**
* Step 1: Split all variables into LoDTensor and Non-LoDTensor.
* We can only calculate memory size of LoDTensors
*/
OpToVarNameSetMap lod_tensors, other_vars;
SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
// Perform complete gc when fraction_of_memory_size >= 1
if (fraction_of_memory_size >= 1.0) {
return delete_lod_tensor_only ? lod_tensors : m;
}
/**
* Step 2: build GCVarInfos, and calculate total memory sizes of each device
*/
// place -> variable info (name, memory size, place, scope_idx)
std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
// place -> total memory sizes
std::map<platform::Place, int64_t> place_to_size;
for (auto &op_vars_pair : lod_tensors) {
auto *op = op_vars_pair.first;
auto &var_names = op_vars_pair.second;
auto scope_idx = op->GetScopeIdx();
auto &place = places[scope_idx];
for (auto &var_name : var_names) {
auto var_size = GetMemorySize(vars[scope_idx], var_name);
GCVarInfo var_info(var_name, var_size, op, scope_idx);
place_to_size[place] += var_info.AbsMemorySize();
place_to_vars[place].emplace_back(std::move(var_info));
}
}
/**
* Step 3: sort GCVarInfos, and only delete the largest variables.
*/
OpToVarNameSetMap partial_vars;
for (auto &place_to_var_pair : place_to_vars) {
auto &place = place_to_var_pair.first;
auto &gc_vars = place_to_var_pair.second;
std::sort(gc_vars.begin(), gc_vars.end(),
[](const GCVarInfo &var1, const GCVarInfo &var2) {
return var1.AbsMemorySize() > var2.AbsMemorySize();
});
int64_t accumulated_size = 0;
int64_t size_threshold =
static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
++i) {
partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
accumulated_size += gc_vars[i].AbsMemorySize();
}
}
/**
* Step 4: Combine other vars (SelectedRows, LoDTensorArray)
*/
if (!delete_lod_tensor_only) {
for (auto &op_vars_pair : other_vars) {
partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
op_vars_pair.second.end());
}
}
return partial_vars;
}
class EagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts =
......@@ -43,9 +196,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
// a reverse map of last_live_ops
// i.e., last op --> variable names which can be deleted.
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
op_vars_map;
OpToVarNameSetMap op_vars_map;
for (auto &var_ops_map : last_live_ops) {
for (auto &var_ops_pair : var_ops_map) {
const std::string &var_name = var_ops_pair.first;
......@@ -55,6 +206,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
}
}
op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
FLAGS_memory_fraction_of_eager_deletion);
for (auto &pair : op_vars_map) {
auto *op = pair.first;
auto &var_names = pair.second;
......@@ -85,8 +239,13 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
eager_deletion_op->AddOutput(dummy_leaf);
}
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
<< FLAGS_memory_fraction_of_eager_deletion;
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
return graph;
auto while_op_eager_deletion_pass =
ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
return while_op_eager_deletion_pass->Apply(std::move(graph));
}
} // namespace details
......@@ -99,3 +258,5 @@ REGISTER_PASS(eager_deletion_pass,
.RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
.RequirePassAttr(paddle::framework::details::kAllPlaces)
.RequirePassAttr(paddle::framework::details::kGarbageCollector);
USE_PASS(while_op_eager_deletion_pass);
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <deque>
#include <iterator>
#include <memory>
#include <stack>
#include <string>
#include <unordered_map>
......@@ -263,6 +264,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
// FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
// ProgramDescs.
// The operations related to BlockDesc or ProgramDesc should perform on Graph
// or Node directly!
PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
"op_desc is nullptr");
// some pre-requirments need to meet if the op want to inplaced.
......
......@@ -24,6 +24,7 @@
#include <sstream>
#include <string>
#include <type_traits>
#include <unordered_set>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
......@@ -191,6 +192,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// immediately to make the subblock variable reuse strategy take
// effect. Because it is a single op in graph. No need to
// update the ir nodes.
// FIXME(liuwei1031): Graph is not aware of the existence of
// BlockDescs and ProgramDescs.
// The operations related to BlockDesc or ProgramDesc should perform
// on Graph or Node directly!
sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block() != nullptr &&
sub_op_desc->Block()->HasVar(var->Name())) {
......
......@@ -12,9 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <queue>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
......@@ -189,15 +193,6 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
return shrink_func(computation_op);
}
static VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
VarDesc *var_desc = nullptr;
std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
var_desc = var_handle->Node()->Var();
return var_desc != nullptr;
});
return var_desc;
}
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
......
......@@ -13,9 +13,22 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/var_desc.h"
namespace paddle {
namespace framework {
namespace details {} // namespace details
namespace details {
VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
VarDesc *var_desc = nullptr;
std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
var_desc = var_handle->Node()->Var();
return var_desc != nullptr;
});
return var_desc;
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -16,6 +16,7 @@
#include <atomic>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
......@@ -25,6 +26,10 @@
namespace paddle {
namespace framework {
class VarDesc;
class VarHandle;
namespace details {
class ComputationOpHandle;
......@@ -43,9 +48,11 @@ const char kGarbageCollector[] = "garbage_collector";
const char kAllPlaces[] = "all_places";
using LastLiveOpsOfVars =
std::unordered_map<std::string, std::unordered_set<ComputationOpHandle*>>;
std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars);
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
namespace paddle {
namespace framework {
namespace details {
class WhileOpEagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// Find all while_op and while_grad_op
std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>,
std::vector<OperatorBase *>>>
target_ops;
for (auto *op : all_ops) {
auto compute_op = dynamic_cast<ComputationOpHandle *>(op);
if (compute_op == nullptr) continue;
if (compute_op->Name() == "while") {
target_ops[compute_op->GetScopeIdx()].first.emplace_back(
compute_op->GetOp());
} else if (compute_op->Name() == "while_grad") {
target_ops[compute_op->GetScopeIdx()].second.emplace_back(
compute_op->GetOp());
}
}
for (auto &ops_pair : target_ops) {
auto &while_ops = ops_pair.second.first;
auto &while_grad_ops = ops_pair.second.second;
operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
while_ops, while_grad_ops);
}
return graph;
}
};
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(while_op_eager_deletion_pass,
paddle::framework::details::WhileOpEagerDeletionPass);
......@@ -14,6 +14,10 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include <deque>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
......@@ -23,17 +27,18 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_NGRAPH
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
#endif
DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
namespace paddle {
namespace framework {
......@@ -75,11 +80,11 @@ static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
ExecutorPrepareContext::ExecutorPrepareContext(
const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& skip_ref_cnt_vars)
: prog_(prog), block_id_(block_id) {
if (GetEagerDeletionThreshold() >= 0) {
global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
skip_ref_cnt_vars);
const std::vector<std::string>& keep_vars, bool force_disable_gc)
: prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) {
if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) {
global_ref_cnts_ =
GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars);
}
}
......@@ -184,13 +189,12 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
bool create_local_scope, bool create_vars,
const std::vector<std::string>& skip_ref_cnt_vars,
bool force_disable_gc) {
platform::RecordBlock b(block_id);
if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
#ifdef PADDLE_WITH_NGRAPH
if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
#endif
auto ctx = Prepare(pdesc, block_id);
auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
}
......@@ -357,20 +361,27 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
const ProgramDesc& program, int block_id,
const std::vector<std::string>& skip_ref_cnt_vars) {
std::unique_ptr<ExecutorPrepareContext> ctx(
new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext(
program, block_id, skip_ref_cnt_vars, force_disable_gc));
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
#ifdef PADDLE_WITH_NGRAPH
if (FLAGS_use_ngraph) {
paddle::operators::NgraphEngine::FuseNgraphOps(
ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
}
#endif
return ctx;
}
std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids,
const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
const std::vector<std::vector<std::string>>& skip_ref_cnt_vars,
bool force_disable_gc) {
PADDLE_ENFORCE(
skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
"skip_ref_cnt_vars should be either empty or equals to block number %d",
......@@ -380,9 +391,11 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
for (auto& bid : block_ids) {
ExecutorPrepareContext* ctx;
if (skip_ref_cnt_vars.empty()) {
ctx = new ExecutorPrepareContext(program, bid);
ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(),
force_disable_gc);
} else {
ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx],
force_disable_gc);
}
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto& block = program.Block(bid);
......@@ -409,8 +422,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
int64_t max_memory_size = GetEagerDeletionThreshold();
std::unique_ptr<GarbageCollector> gc;
// skip while_op and while_grad_op temporarily
if (max_memory_size >= 0 && !keep_kids) {
// FIXME(zjl): recurrent_op is rather complex, we would
// disable gc forcely in recurrent_op
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
......@@ -428,6 +442,11 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
#ifdef PADDLE_WITH_CUDA
}
#endif
// If gc is enabled and block size > 1
if (gc && ctx->prog_.Size() > 1) {
operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_,
ctx->ops_);
}
}
for (auto& op : ctx->ops_) {
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/op_info.h"
......@@ -30,7 +32,8 @@ namespace framework {
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>());
std::vector<std::string>(),
bool force_disable_gc = false);
~ExecutorPrepareContext();
......@@ -38,6 +41,7 @@ struct ExecutorPrepareContext {
const framework::ProgramDesc& prog_;
size_t block_id_;
bool force_disable_gc_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
std::unordered_map<std::string, size_t> global_ref_cnts_;
......@@ -66,7 +70,10 @@ class Executor {
* Scope
*/
void Run(const ProgramDesc& prog, Scope* scope, int block_id,
bool create_local_scope = true, bool create_vars = true);
bool create_local_scope = true, bool create_vars = true,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>(),
bool force_disable_gc = false);
// This API is very slow.
void Run(const ProgramDesc& program, Scope* scope,
......@@ -79,12 +86,14 @@ class Executor {
static std::unique_ptr<ExecutorPrepareContext> Prepare(
const ProgramDesc& program, int block_id,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>());
std::vector<std::string>(),
bool force_disable_gc = false);
static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids,
const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
std::vector<std::vector<std::string>>());
std::vector<std::vector<std::string>>(),
bool force_disable_gc = false);
void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <unordered_set>
#include <unordered_map>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
......@@ -152,6 +152,39 @@ void Graph::ResolveHazard(
}
}
std::shared_ptr<Graph> Graph::Clone() {
auto cloned_graph = std::make_shared<Graph>(this->program_);
cloned_graph->ReleaseNodes();
cloned_graph->num_node_created_ = 0;
std::unordered_map<ir::Node *, ir::Node *> origin_to_cloned;
for (auto *n : this->node_set_) {
ir::Node *cloned_node = nullptr;
if (n->IsCtrlVar()) {
cloned_node = cloned_graph->CreateControlDepVar();
} else if (!n->var_desc_ && !n->op_desc_) { // empty node
cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType());
} else if (n->IsVar()) {
cloned_node = cloned_graph->CreateVarNode(n->Var());
} else if (n->IsOp()) {
cloned_node = cloned_graph->CreateOpNode(n->Op());
}
if (cloned_node) {
origin_to_cloned[n] = cloned_node;
} else {
PADDLE_THROW("The cloned node's type is not supported!");
}
}
for (auto *n : this->node_set_) {
for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) {
origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]);
}
for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) {
origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]);
}
}
return cloned_graph;
}
bool IsControlDepVar(const ir::Node &var) {
return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/node.h"
......@@ -199,7 +200,12 @@ class Graph {
// WARN: After a series of passes, the current graph can be quite
// different from OriginProgram. Caller shouldn't assume much from
// the returned OriginProgram.
const ProgramDesc &OriginProgram() const { return program_; }
const ProgramDesc &OriginProgram() const {
LOG(WARNING) << "WARN: After a series of passes, the current graph can be "
"quite different from OriginProgram. So, please avoid "
"using the `OriginProgram()` method!";
return program_;
}
// This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) {
......@@ -212,6 +218,10 @@ class Graph {
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
// Create a new and duplicated graph.
// WARN: The method only clones the graph structure, not its attributes.
std::shared_ptr<Graph> Clone();
private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
......
......@@ -186,14 +186,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(3) << place << " " << DebugStringEx(&scope);
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw;
throw std::move(exception);
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw;
throw std::move(exception);
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
......@@ -204,7 +204,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw;
throw std::move(exception);
} catch (...) {
std::rethrow_exception(std::current_exception());
}
......@@ -926,8 +926,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx = pool.Get(expected_kernel_key.place_);
}
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
this->InferShape(&infer_shape_ctx);
if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
this->InferShape(&infer_shape_ctx);
}
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter->second(
......
......@@ -62,6 +62,15 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
/// Variables with this suffix are the new Gradient.
constexpr char kNewGradSuffix[] = "@NEWGRAD@";
/// If an Op has this attribute, all its kernels should calculate output
/// variable's shape in the corresponding Compute() function. And
/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
/// function in its runtime for speedup.
/// TODO(luotao): Note that this temporal attribute would be deleted after all
/// ops contain it.
constexpr char kAllKernelsMustComputeRuntimeShape[] =
"@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
// define some kernel priority
/* Define multiple kernel type fallback order*/
extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
......
......@@ -181,13 +181,14 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return member_->local_scopes_;
}
ParallelExecutor::ParallelExecutor(
const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &bcast_vars,
const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
ir::Graph *graph)
ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const std::vector<std::string> &bcast_vars,
const std::string &loss_var_name,
Scope *scope,
const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;
......@@ -254,9 +255,23 @@ ParallelExecutor::ParallelExecutor(
PADDLE_THROW("Not compiled with CUDA");
#endif
}
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToDevices(bcast_vars);
// broadcast parameters from the 0th device to others:
auto need_broadcast = [&]() -> bool {
if (build_strategy.num_trainers_ > 1) {
// 1. num_tariners would be grater than 1 for nccl distributed training.
return true;
} else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
// 2. Only one trainer process, but ParallelExecutor hold multiple
// devices.
return true;
}
return false;
};
if (need_broadcast()) {
BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
}
// Startup Program has been run. All local scopes has correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
......@@ -338,7 +353,7 @@ ParallelExecutor::ParallelExecutor(
}
void ParallelExecutor::BCastParamsToDevices(
const std::unordered_set<std::string> &vars) const {
const std::vector<std::string> &vars, int trainer_id) const {
// the initializing bcast, all vars would be bcast from device(0).
for (auto &var : vars) {
framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
......@@ -362,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto place = member_->places_[i];
void *buffer;
if (i == 0) {
if (i == 0 && trainer_id == 0) {
buffer = const_cast<void *>(main_tensor.data<void>());
} else {
auto local_scope = member_->local_scopes_[i];
......
......@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
......@@ -45,7 +47,7 @@ class ParallelExecutor {
public:
explicit ParallelExecutor(const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &bcast_vars,
const std::vector<std::string> &bcast_vars,
const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy,
......@@ -70,7 +72,10 @@ class ParallelExecutor {
const std::string &fetched_var_name);
private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
// broadcast the parameters from the 0th device.
// trainer_id the trainer index in nccl distributed training.
void BCastParamsToDevices(const std::vector<std::string> &vars,
int trainer_id = 0) const;
bool EnableParallelGraphExecution(const ir::Graph &graph,
const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const;
......
......@@ -18,6 +18,7 @@
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
......@@ -137,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->CPU");
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CPU->GPU");
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:GPU->GPU");
if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
VLOG(3) << "Skip copy the same data from " << src_place << " to "
<< dst_place;
......@@ -157,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
......
......@@ -159,10 +159,9 @@ class Autograd {
for (auto it : candidate->pre_ops_) {
for (OpBase* pre_op : it.second) {
if (!pre_op) continue;
VLOG(5) << "op dep " << candidate->op_desc_->Type() << " trace id "
VLOG(5) << "op dep " << candidate->Type() << " trace id "
<< candidate->trace_id_ << " <---- " << it.first << " <---- "
<< pre_op->op_desc_->Type() << " trace id "
<< pre_op->trace_id_;
<< pre_op->Type() << " trace id " << pre_op->trace_id_;
if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op);
queue.push_back(pre_op);
......@@ -180,10 +179,12 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
PADDLE_ENFORCE(var_->IsInitialized(),
"Variable must be initialized when getting numpy tensor");
std::unique_ptr<VarBase> new_var(new VarBase());
// TODO(minqiyang): change this after move unique_name generator to CXX
const framework::LoDTensor& self_tensor = var_->Get<framework::LoDTensor>();
std::unique_ptr<VarBase> new_var(new VarBase(
"Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false));
framework::LoDTensor* tensor =
new_var->var_->GetMutable<framework::LoDTensor>();
tensor->Resize(var_->Get<framework::LoDTensor>().dims());
tensor->set_lod(var_->Get<framework::LoDTensor>().lod());
if (blocking) {
......@@ -199,52 +200,62 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
}
if (platform::is_gpu_place(dst_place)) {
VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu";
VLOG(3) << "copy tensor " << Name() << " from gpu";
}
return new_var;
}
framework::LoDTensor& VarBase::GradValue() {
VLOG(3) << "get var grad " << var_desc_->Name();
VLOG(3) << "get var grad " << Name();
PADDLE_ENFORCE_NOT_NULL(grads_,
"Could not get grad value from no grad variable");
return *(grads_->var_->GetMutable<framework::LoDTensor>());
}
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (grad_op_descs_.empty() && backward_id_ <= 0) {
VLOG(3) << "op with no grad: " << op_desc_->Type();
VLOG(3) << "op with no grad: " << Type();
return {};
}
VLOG(3) << "apply op grad: " << op_desc_->Type();
std::vector<framework::VariableValueMap> grad_outputs;
VLOG(3) << "apply op grad: " << Type();
std::vector<framework::VariableValueMap> tmp_grad_outputs;
if (backward_id_ > 0) {
VLOG(3) << "py_layer_grad";
grad_outputs.resize(1);
grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
tmp_grad_outputs.resize(1);
tmp_grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
PyLayer::ApplyGrad(
backward_id_,
grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
} else {
grad_outputs.resize(grad_op_descs_.size());
for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
const size_t grad_op_count = grad_op_descs_.size();
tmp_grad_outputs.resize(grad_op_count);
for (size_t k = 0; k < grad_op_count; ++k) {
framework::OpDesc* grad_op_desc = grad_op_descs_[k];
VLOG(3) << "op grad " << grad_op_desc->Type();
for (auto it : grad_output_vars_[k]) {
auto& outputs = grad_outputs[k][it.first];
auto& grad_output_variable_map = grad_output_vars_[k];
VLOG(3) << "apply grad op " << grad_op_desc->Type();
// Allocate tmp grad output variable
for (auto it : grad_output_variable_map) {
auto& outputs = tmp_grad_outputs[k][it.first];
outputs.reserve(it.second.size());
for (size_t i = 0; i < it.second.size(); ++i) {
// Allocate a new variable
Variable* tmp_var = new framework::Variable();
tmp_var->GetMutable<framework::LoDTensor>();
outputs.push_back(tmp_var);
outputs.emplace_back(tmp_var);
}
}
framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);
// Run grad op
framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]);
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc->InferVarType(block_);
// grad_op_desc->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc);
......@@ -260,9 +271,10 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
}
}
// Add tmp grad outputs to original grad vars
for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (auto it : grad_output_vars_[k]) {
auto& outputs = grad_outputs[k][it.first];
auto& outputs = tmp_grad_outputs[k][it.first];
auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
......@@ -316,19 +328,14 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
int PyLayer::NumFuncs() { return py_funcs_.size(); }
std::vector<VarBase*> PyLayer::Apply(int func_id,
const std::vector<VarBase*>& inputs) {
std::vector<Variable*> PyLayer::Apply(int func_id,
const std::vector<VarBase*>& inputs) {
std::vector<framework::Variable*> invars;
for (const VarBase* in : inputs) {
invars.push_back(in->var_);
}
PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
std::vector<Variable*> outvars = CallPythonFunc(py_funcs_[func_id], invars);
std::vector<VarBase*> ret;
for (Variable* v : outvars) {
ret.push_back(new VarBase(v, new VarBase(true)));
}
return ret;
return CallPythonFunc(py_funcs_[func_id], invars);
}
std::vector<Variable*> PyLayer::ApplyGrad(
......
......@@ -112,31 +112,53 @@ class OpBase;
*/
class VarBase {
public:
VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
explicit VarBase(bool stop_gradient)
: VarBase(new framework::Variable(),
stop_gradient ? nullptr : new VarBase(true), stop_gradient) {}
VarBase(framework::Variable* var, VarBase* grad)
: VarBase(var, grad, false) {}
// Internal interface, create VarBase from exist variable
VarBase(const std::string& name, framework::Variable* var, VarBase* grad,
bool stop_gradient)
: VarBase(name, var->Get<framework::LoDTensor>().type(),
var->Get<framework::LoDTensor>().dims(),
var->Get<framework::LoDTensor>().place(), var, grad,
stop_gradient, false) {}
// Python interface
VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
const std::vector<int64_t>& shape, const platform::Place& place,
bool stop_gradient, bool persistable)
: VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient,
persistable) {}
// Internal interface, create VarBase from with ddim
VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
const framework::DDim& shape, const platform::Place& place,
bool stop_gradient, bool persistable)
: VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient,
persistable) {}
private:
VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient)
: name_(),
var_desc_(nullptr),
VarBase(const std::string& name, framework::proto::VarType::Type dtype,
const framework::DDim& shape, const platform::Place& place,
framework::Variable* var, VarBase* grad, bool stop_gradient,
bool persistable)
: name_(name),
dtype_(dtype),
place_(place),
var_(var),
grads_(grad),
block_(nullptr),
persistable_(false),
stop_gradient_(stop_gradient),
persistable_(persistable),
pre_op_(nullptr),
pre_op_out_name_(),
pre_op_out_idx_(-1) {}
pre_op_out_idx_(-1) {
if (!var_) {
var_ = new framework::Variable();
auto tensor = var_->GetMutable<framework::LoDTensor>();
tensor->Resize(shape);
tensor->mutable_data(place_, dtype_);
}
}
public:
virtual ~VarBase() {
// TODO(minqiyang): remove var desc from block desc
if (var_) {
delete var_;
var_ = nullptr;
......@@ -151,14 +173,30 @@ class VarBase {
pre_op_out_idx_ = -1;
}
inline OpBase* PreOp() const { return pre_op_; }
inline int PreOpOutIdx() const { return pre_op_out_idx_; }
inline void SetName(const std::string& name) { name_ = name; }
inline std::string Name() const { return name_; }
inline std::vector<int64_t> Shape() const {
if (var_->IsInitialized()) {
return framework::vectorize(var_->Get<framework::LoDTensor>().dims());
} else {
return {};
}
}
inline framework::proto::VarType::Type DType() const { return dtype_; }
inline void SetStopGradient(bool stop_gradient) {
stop_gradient_ = stop_gradient;
}
inline bool IsStopGradient() const { return stop_gradient_; }
inline void SetPersistable(bool persistable) { persistable_ = persistable; }
inline bool IsPersistable() const { return persistable_; }
inline OpBase* PreOp() const { return pre_op_; }
inline int PreOpOutIdx() const { return pre_op_out_idx_; }
void RunBackward();
inline void ResetPreOp(OpBase* op) {
......@@ -180,7 +218,7 @@ class VarBase {
}
void ClearGradient() {
VLOG(1) << "clear gradient of " << var_desc_->Name();
VLOG(1) << "clear gradient of " << Name();
if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) {
auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
operators::math::set_constant(
......@@ -196,23 +234,20 @@ class VarBase {
const bool blocking) const;
inline std::string GradName() const {
PADDLE_ENFORCE(
var_desc_,
"Couldn't get gradient variable's name, please call backward() first");
return string::Sprintf("%s@IGrad", var_desc_->Name());
return string::Sprintf("%s@IGrad", Name());
}
std::string name_;
framework::VarDesc* var_desc_;
framework::proto::VarType::Type dtype_;
platform::Place place_;
framework::Variable* var_;
VarBase* grads_;
framework::BlockDesc* block_;
bool persistable_;
private:
bool stop_gradient_;
bool persistable_;
OpBase* pre_op_;
std::string pre_op_out_name_;
int pre_op_out_idx_;
......@@ -223,11 +258,11 @@ class VarBase {
*/
class PYBIND11_HIDDEN OpBase {
public:
OpBase()
: op_desc_(nullptr),
OpBase(const std::string& type)
: type_(type),
trace_id_(-1),
forward_id_(-1),
backward_id_(-1),
trace_id_(-1),
place_(platform::CPUPlace()),
backward_hooks_() {}
......@@ -249,13 +284,34 @@ class PYBIND11_HIDDEN OpBase {
std::map<std::string, std::vector<VarBase*>> ApplyGrad();
inline std::string Type() const { return type_; }
inline std::string GradOpType(size_t index) const {
PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]);
return grad_op_descs_[index]->Type();
}
void RegisterBackwardHooks(const py::object& callable);
void InvokeBackwardHooks();
// One of `op_desc_` or `forward_id_` is set, not both.
// For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
framework::OpDesc* op_desc_;
void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) {
if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
<< inp_name;
pre_ops_[inp_name].push_back(inp_var->PreOp());
pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx());
} else {
VLOG(3) << "no pre op in slot " << inp_name
<< " input var stop_gradient: " << inp_var->IsStopGradient();
pre_ops_[inp_name].push_back(nullptr);
// pre_ops_out_idx_[inp_name].push_back(-1);
}
}
std::string type_;
// One of `trace_id_` or `forward_id_` is set, not both.
// For pure python PyLayer, use `forward_id_`, otherwise, use trace_id_.
int trace_id_;
int forward_id_;
// When has backward, one of `grad_op_descs_` or `backward_id_` is set,
......@@ -263,7 +319,6 @@ class PYBIND11_HIDDEN OpBase {
// Note: each fwd op corresponds to a vector of bwd ops.
std::vector<framework::OpDesc*> grad_op_descs_;
int backward_id_;
int trace_id_;
platform::Place place_;
......@@ -277,8 +332,6 @@ class PYBIND11_HIDDEN OpBase {
// Outputs to a vector of bwd ops.
std::vector<framework::VariableValueMap> grad_output_vars_;
framework::BlockDesc* block_;
std::vector<py::object> backward_hooks_;
};
......@@ -303,8 +356,8 @@ class PyLayer {
static int NumFuncs();
static std::vector<VarBase*> Apply(int func_id,
const std::vector<VarBase*>& inputs);
static std::vector<framework::Variable*> Apply(
int func_id, const std::vector<VarBase*>& inputs);
static std::vector<framework::Variable*> ApplyGrad(
int func_id, const std::vector<framework::Variable*>& inputs);
......
......@@ -56,15 +56,19 @@ void CreateGradOp(const framework::OpDesc& op_desc,
}
}
void InitVar(framework::Variable* var, framework::Variable* grad_var,
platform::DeviceContext* dev_ctx) {
void InitGrad(VarBase* var, platform::DeviceContext* dev_ctx) {
PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base");
PADDLE_ENFORCE_NOT_NULL(dev_ctx,
"Could not get valid device from forward op");
auto& var_t = var->Get<framework::LoDTensor>();
grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
var_t.dims(), dev_ctx->GetPlace());
operators::math::set_constant(
*dev_ctx, grad_var->GetMutable<framework::LoDTensor>(), 0.0);
if (var->grads_ == nullptr) {
auto& var_t = var->var_->Get<framework::LoDTensor>();
var->grads_ = new VarBase(var->GradName(), framework::proto::VarType::FP32,
framework::vectorize(var_t.dims()),
dev_ctx->GetPlace(), true, false);
auto grad_t = var->grads_->var_->GetMutable<framework::LoDTensor>();
operators::math::set_constant(*dev_ctx, grad_t, 0.0);
}
}
platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
......@@ -85,6 +89,62 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
return result;
}
framework::VariableNameMap CreateInputVarNameMap(
const OpBase* op, const VarBasePtrMap& varbase_map) {
framework::VariableNameMap result;
auto& info_map = framework::OpInfoMap::Instance();
auto* op_info = info_map.GetNullable(op->Type());
if (op_info == nullptr || op_info->proto_ == nullptr) {
return result;
}
for (auto& in : op_info->Proto().inputs()) {
auto it = varbase_map.find(in.name());
if (it == varbase_map.end()) {
PADDLE_ENFORCE(in.dispensable());
result[in.name()] = {};
} else {
auto var_vector = it->second;
std::vector<std::string> args;
args.reserve(var_vector.size());
for (VarBase* var_base : var_vector) {
args.emplace_back(var_base->Name());
}
result[in.name()] = args;
}
}
return result;
}
framework::VariableNameMap CreateOutputVarNameMap(
const OpBase* op, const VarBasePtrMap& varbase_map) {
framework::VariableNameMap result;
auto& info_map = framework::OpInfoMap::Instance();
auto* op_info = info_map.GetNullable(op->Type());
if (op_info == nullptr || op_info->proto_ == nullptr) {
return result;
}
for (auto& out : op_info->Proto().outputs()) {
auto it = varbase_map.find(out.name());
if (it == varbase_map.end()) {
PADDLE_ENFORCE(out.dispensable());
result[out.name()] = {};
} else {
auto var_vector = it->second;
std::vector<std::string> args;
args.reserve(var_vector.size());
for (VarBase* var_base : var_vector) {
args.emplace_back(var_base->Name());
}
result[out.name()] = args;
}
}
return result;
}
Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
if (!FLAGS_tracer_profile_fname.empty()) {
std::call_once(gTracerProfileOnce, [] {
......@@ -101,7 +161,7 @@ Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs,
framework::BlockDesc* block,
framework::AttributeMap attrs_map,
const platform::Place expected_place,
const bool stop_gradient) {
#ifdef WITH_GPERFTOOLS
......@@ -110,40 +170,27 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
}
#endif
std::map<std::string, VarBase*> vars;
framework::OpDesc* op_desc = op->op_desc_;
VLOG(3) << "tracer tracing " << op_desc->Type() << " trace id "
<< op->trace_id_;
op_desc->InferShape(*block);
op_desc->InferVarType(block);
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(*op_desc);
framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map;
// Construct input_vars_map and output_vars_map
std::map<std::string, VarBase*> current_vars_map;
op->input_vars_ = inputs;
for (auto it : op->input_vars_) {
auto& invars = invars_map[it.first];
invars.reserve(it.second.size());
for (VarBase* inp : it.second) {
PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
op->op_desc_->Type(), inp->var_desc_->Name());
PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(),
inp->Name());
invars.emplace_back(inp->var_);
vars[inp->var_desc_->Name()] = inp;
if (inp->PreOp() && !inp->IsStopGradient()) {
op->pre_ops_[it.first].push_back(inp->PreOp());
op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx());
VLOG(3) << "add pre op " << inp->PreOp()->op_desc_->Type();
} else {
op->pre_ops_[it.first].push_back(nullptr);
op->TrackPreOp(inp, it.first);
if (!stop_gradient) {
current_vars_map[inp->Name()] = inp;
}
VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
<< inp->var_->IsInitialized() << " stop_gradient "
<< inp->IsStopGradient();
VLOG(3) << "input var name: " << inp->Name()
<< " inited: " << inp->var_->IsInitialized()
<< " stop_grad: " << inp->IsStopGradient();
}
}
......@@ -152,25 +199,38 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
auto& outvars = outvars_map[it.first];
const std::vector<VarBase*>& outputs = it.second;
outvars.reserve(outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) {
for (size_t i = 0U; i < outputs.size(); ++i) {
VarBase* out = outputs[i];
outvars.emplace_back(out->var_);
vars[out->var_desc_->Name()] = out;
framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
out->var_->GetMutable<framework::LoDTensor>();
} else {
LOG(ERROR) << "tracer doesn't support yet";
}
out->TrackPreOp(op, it.first, i, stop_gradient);
if (!stop_gradient) {
current_vars_map[out->Name()] = out;
}
VLOG(3) << "output vname " << out->var_desc_->Name() << " "
<< out->var_->IsInitialized();
VLOG(3) << "input var name: " << out->Name()
<< " inited: " << out->var_->IsInitialized()
<< " stop_grad: " << out->IsStopGradient();
}
}
VLOG(3) << "tracer running " << op_desc->Type();
// Check attrs and create op
framework::VariableNameMap invars_name_map =
CreateInputVarNameMap(op, inputs);
framework::VariableNameMap outvars_name_map =
CreateOutputVarNameMap(op, outputs);
auto& info = framework::OpInfoMap::Instance().Get(op->Type());
if (info.Checker() != nullptr) {
info.Checker()->Check(&attrs_map);
}
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(op->Type(), invars_name_map,
outvars_name_map, attrs_map);
// TODO(minqiyang): Support infer var type in imperative mode
// Run forward op
VLOG(3) << "tracer running " << op->Type();
framework::RuntimeContext ctx(invars_map, outvars_map);
// TODO(panyx0718): Cache p.
......@@ -186,36 +246,44 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx,
prepared_op.ctx, prepared_op.kernel_configs));
// construct backward op
std::set<std::string> vars_saved_for_backward;
if (!stop_gradient) {
VLOG(5) << "start construct backward op";
// construct grad op descs
std::unique_ptr<framework::OpDesc> fwd_op_desc(new framework::OpDesc(
op->Type(), invars_name_map, outvars_name_map, attrs_map));
std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
new std::unordered_map<std::string, std::string>());
CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());
// NOTE(minqiyang): We don't support control flow op in imperative now
// Add grad_block_ when we want to support it
CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get());
op->grad_input_vars_.resize(op->grad_op_descs_.size());
op->grad_output_vars_.resize(op->grad_op_descs_.size());
VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type();
for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
const size_t grad_op_count = op->grad_op_descs_.size();
op->grad_input_vars_.resize(grad_op_count);
op->grad_output_vars_.resize(grad_op_count);
for (size_t i = 0; i < grad_op_count; ++i) {
framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[i][it.first];
grad_in_vars.reserve(it.second.size());
for (const std::string& grad_invar : it.second) {
block->FindRecursiveOrCreateVar(grad_invar);
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
auto fwd_var_it = current_vars_map.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != current_vars_map.end());
// Forward inputs or outputs.
grad_in_vars.push_back(fwd_var_it->second->var_);
grad_in_vars.emplace_back(fwd_var_it->second->var_);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_,
prepared_op.GetDeviceContext());
}
VarBase* var = current_vars_map[var_it->second];
InitGrad(var, prepared_op.GetDeviceContext());
// Douts.
grad_in_vars.push_back(var->grads_->var_);
grad_in_vars.emplace_back(var->grads_->var_);
}
vars_saved_for_backward.insert(it.first);
......@@ -225,48 +293,48 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
for (auto it : grad_op_desc->Outputs()) {
auto& grad_out_vars = op->grad_output_vars_[i][it.first];
for (const std::string& grad_outvar : it.second) {
block->FindRecursiveOrCreateVar(grad_outvar);
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end(),
"Could not found the grad op output var, should this "
"operator %s's stop gradient be True",
op_desc->Type());
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_,
prepared_op.GetDeviceContext());
}
op->Type());
VarBase* var = current_vars_map[var_it->second];
InitGrad(var, prepared_op.GetDeviceContext());
grad_out_vars.push_back(var->grads_->var_);
}
}
}
}
op->block_ = block;
return vars_saved_for_backward;
}
std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
const std::vector<VarBase*>& inputs,
bool stop_gradient) {
VLOG(3) << "py_trace";
VLOG(3) << "py_trace " << op->Type();
op->input_vars_[PyLayer::kFwdInp] = inputs;
op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs);
std::vector<framework::Variable*> ret_vars =
PyLayer::Apply(op->forward_id_, inputs);
for (VarBase* inp : inputs) {
if (inp->PreOp() && !inp->IsStopGradient()) {
op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp());
op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx());
} else {
op->pre_ops_[PyLayer::kFwdInp].push_back(nullptr);
}
op->TrackPreOp(inp, PyLayer::kFwdInp);
}
auto& outputs = op->output_vars_[PyLayer::kFwdOut];
for (size_t i = 0; i < outputs.size(); ++i) {
VarBase* out = outputs[i];
std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
outputs.reserve(ret_vars.size());
for (size_t i = 0U; i != ret_vars.size(); ++i) {
framework::Variable* v = ret_vars[i];
VarBase* out = new VarBase(string::Sprintf("%s_out_%d", op->Type(), i), v,
nullptr, stop_gradient);
outputs.emplace_back(out);
out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
}
if (!stop_gradient) {
VLOG(5) << "start construct backward op";
op->grad_input_vars_.resize(1);
op->grad_output_vars_.resize(1);
auto& grad_input_vars =
......@@ -281,23 +349,16 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
grad_input_vars.push_back(out->var_);
}
// TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
platform::CPUPlace place;
for (VarBase* out : outputs) {
InitGrad(out, platform::DeviceContextPool::Instance().Get(place));
grad_input_vars.push_back(out->grads_->var_);
if (!grad_input_vars.back()->IsInitialized()) {
// TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
InitVar(out->var_, grad_input_vars.back(),
platform::DeviceContextPool::Instance().Get(place));
}
}
for (const VarBase* inp : inputs) {
for (VarBase* inp : inputs) {
InitGrad(inp, platform::DeviceContextPool::Instance().Get(place));
grad_output_vars.push_back(inp->grads_->var_);
if (!grad_output_vars.back()->IsInitialized()) {
// TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
InitVar(inp->var_, grad_output_vars.back(),
platform::DeviceContextPool::Instance().Get(place));
}
}
}
return outputs;
......
......@@ -17,6 +17,8 @@
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
......@@ -34,7 +36,8 @@ void CreateGradOp(const framework::OpDesc& op_desc,
framework::OpDesc** grad_op_desc,
std::unordered_map<std::string, std::string>* grad_to_var);
void InitVar(framework::Variable* var, framework::Variable* grad_var);
void InitVar(const VarBase* var, framework::Variable* grad_var,
platform::DeviceContext* dev_ctx);
platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
......@@ -46,7 +49,7 @@ class Tracer {
std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs,
framework::BlockDesc* block,
framework::AttributeMap attrs_map,
const platform::Place expected_place,
const bool stop_gradient = false);
......
......@@ -126,15 +126,20 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
}
template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
template void ZeroCopyTensor::copy_from_cpu<int32_t>(const int32_t *data);
template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
template void ZeroCopyTensor::copy_to_cpu<int32_t>(int32_t *data);
template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
int *size) const;
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
int *size) const;
template int32_t *ZeroCopyTensor::data<int32_t>(PaddlePlace *place,
int *size) const;
template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
template int32_t *ZeroCopyTensor::mutable_data<int32_t>(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const {
PADDLE_ENFORCE(!name_.empty(),
......
......@@ -139,9 +139,8 @@ static void TensorAssignData(PaddleTensor *tensor,
}
template <typename T>
static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
const std::vector<std::vector<T>> &data) {
int size{0};
static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
const std::vector<std::vector<T>> &data) {
auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
int c = 0;
for (const auto &f : data) {
......@@ -149,7 +148,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
ptr[c++] = v;
}
}
return size;
}
template <typename T>
static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
const PaddleBuf &data) {
auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
for (size_t i = 0; i < data.length() / sizeof(T); i++) {
ptr[i] = *(reinterpret_cast<T *>(data.data()) + i);
}
}
static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
......
......@@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
if (FLAGS_zero_copy) {
cfg->SwitchUseFeedFetchOps(false);
}
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......@@ -131,7 +134,7 @@ TEST(Analyzer_Pyramid_DNN, profile) {
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
size_t size = GetSize(outputs[0]);
PADDLE_ENFORCE_GT(size, 0);
......@@ -166,6 +169,19 @@ TEST(Analyzer_Pyramid_DNN, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
TEST(Analyzer_Pyramid_DNN, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back("cos_sim_2.tmp_0");
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
input_slots_all, outputs_name);
}
// Compare Deterministic result
TEST(Analyzer_Pyramid_DNN, compare_determine) {
AnalysisConfig cfg;
......
......@@ -207,6 +207,9 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
if (FLAGS_zero_copy) {
cfg->SwitchUseFeedFetchOps(false);
}
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......@@ -285,133 +288,17 @@ TEST(Analyzer_rnn1, multi_thread) {
input_slots_all, &outputs, 2 /* multi_thread */);
}
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
// on the complex RNN1 model.
TEST(Analyzer_rnn1, ZeroCopy) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
PaddlePlace place;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
config.SwitchUseFeedFetchOps(true);
auto native_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
config.SwitchUseFeedFetchOps(
true); // the analysis predictor needs feed/fetch.
auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
NEW_TENSOR(week);
NEW_TENSOR(minute);
NEW_TENSOR(hidden_init);
// Prepare data for AnalysisPredictor
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
data_tensor.get(), hidden_init_tensor.get(),
week_tensor.get(), minute_tensor.get(), &data,
FLAGS_batch_size);
// Prepare data for NativePredictor
std::vector<std::vector<PaddleTensor>> native_inputs;
SetInput(&native_inputs);
std::vector<PaddleTensor> native_outputs;
std::vector<PaddleTensor> analysis_outputs;
auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
// Run analysis predictor
int num_ops;
auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
ASSERT_EQ(num_ops,
13); // After graph optimization, only 13 operators exists.
Timer timer;
double total_time{0};
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
predictor->ZeroCopyRun();
total_time += timer.toc();
}
LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());
int output_size{0}; // this is the number of elements not memory size
auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
auto *native_data = static_cast<float *>(native_outputs.front().data.data());
for (int i = 0; i < output_size; i++) {
EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
}
}
TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
for (int tid = 1; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(predictors.front()->Clone());
}
double total_time_of_threads{0};
std::vector<std::thread> threads;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([&, tid] {
auto &predictor = predictors[tid];
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
NEW_TENSOR(week);
NEW_TENSOR(minute);
NEW_TENSOR(hidden_init);
// Prepare data for AnalysisPredictor
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
Timer timer;
double total_time{0};
for (int i = 0; i < FLAGS_repeat; i++) {
PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
cell_init_tensor.get(), data_tensor.get(),
hidden_init_tensor.get(), week_tensor.get(),
minute_tensor.get(), &data, FLAGS_batch_size);
timer.tic();
predictor->ZeroCopyRun();
total_time += timer.toc();
}
total_time_of_threads += total_time;
LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
});
}
for (auto &t : threads) {
t.join();
}
// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
TEST(Analyzer_rnn1, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);
LOG(INFO) << "average time: "
<< total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back("final_output.tmp_1");
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
input_slots_all, outputs_name);
}
} // namespace inference
......
......@@ -144,6 +144,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrDebug();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
if (FLAGS_zero_copy) {
cfg->SwitchUseFeedFetchOps(false);
}
if (use_mkldnn) {
cfg->EnableMKLDNN();
}
......@@ -184,10 +187,10 @@ TEST(Analyzer_seq_pool1, compare_determine) {
input_slots_all);
}
void analysis_fuse_statis(bool use_zerocopy) {
// Check the fuse status
TEST(Analyzer_seq_pool1, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
cfg.SwitchUseFeedFetchOps(!use_zerocopy);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
......@@ -203,137 +206,17 @@ void analysis_fuse_statis(bool use_zerocopy) {
EXPECT_EQ(num_ops, 171);
}
// Check the fuse status
TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); }
void PrepareZeroCopyInputs(
const std::unique_ptr<PaddlePredictor> &predictor,
std::vector<std::unique_ptr<ZeroCopyTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
// only feed one batch
const auto &one_batch = data.NextBatch();
inputs->clear();
for (size_t i = 0; i < one_batch.size(); ++i) {
auto &slot = one_batch[i];
auto tensor = predictor->GetInputTensor(slot.name + "_embed");
tensor->Reshape(slot.shape);
tensor->SetLoD({slot.lod});
ZeroCopyTensorAssignData<float>(tensor.get(), slot.data);
inputs->emplace_back(std::move(tensor));
}
}
// return the output values
std::vector<float> zerocopy_profile(int repeat_times) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
PrepareZeroCopyInputs(predictor, &inputs);
auto output_tensor = predictor->GetOutputTensor(out_var_name);
Timer timer;
LOG(INFO) << "Warm up run...";
timer.tic();
predictor->ZeroCopyRun();
PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1);
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
LOG(INFO) << "Run " << repeat_times << " times...";
timer.tic();
for (int i = 0; i < repeat_times; i++) {
predictor->ZeroCopyRun();
}
PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times,
1);
LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
PaddlePlace place;
int output_size{0};
auto *pdata = output_tensor->data<float>(&place, &output_size);
std::vector<float> res(output_size);
for (int i = 0; i < output_size; ++i) {
res[i] = pdata[i];
}
return res;
}
TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); }
TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
for (int tid = 1; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(predictors.front()->Clone());
}
double total_time_of_threads{0};
std::vector<std::thread> threads;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([&, tid] {
auto &predictor = predictors[tid];
std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
PrepareZeroCopyInputs(predictor, &inputs);
auto output_tensor = predictor->GetOutputTensor(out_var_name);
Timer timer;
double total_time{0};
LOG(INFO) << "Warm up run...";
timer.tic();
predictor->ZeroCopyRun();
PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1);
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
int repeat_times = FLAGS_repeat;
LOG(INFO) << "Run " << repeat_times << " times...";
timer.tic();
for (int i = 0; i < repeat_times; i++) {
predictor->ZeroCopyRun();
}
total_time += timer.toc();
total_time_of_threads += total_time;
LOG(INFO) << "thread time: " << total_time / repeat_times;
});
}
for (auto &t : threads) {
t.join();
}
LOG(INFO) << "average time: "
<< total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
}
TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); }
// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
TEST(Analyzer_seq_pool1, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);
TEST(Analyzer_seq_pool1, zerocopy_compare_native) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(true);
auto predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
std::vector<PaddleTensor> native_outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs));
EXPECT_EQ(native_outputs.size(), 1UL);
auto zerocopy_output = zerocopy_profile(1);
EXPECT_EQ(zerocopy_output.size() * sizeof(float),
native_outputs.front().data.length());
auto *native_data = static_cast<float *>(native_outputs.front().data.data());
for (size_t i = 0; i < zerocopy_output.size(); ++i) {
EXPECT_LT(
std::fabs((zerocopy_output[i] - native_data[i]) / zerocopy_output[i]),
1e-3);
}
std::vector<std::string> outputs_name;
outputs_name.emplace_back(out_var_name);
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
input_slots_all, outputs_name);
}
} // namespace analysis
......
......@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true,
DEFINE_bool(record_benchmark, false,
"Record benchmark after profiling the model");
DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
DECLARE_bool(profile);
DECLARE_int32(paddle_num_threads);
......@@ -67,6 +68,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
LOG(INFO) << analysis_config->ToNativeConfig();
}
// Compare result between two PaddleTensor
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) {
EXPECT_GT(outputs.size(), 0UL);
......@@ -108,6 +110,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
}
}
// Compare result between a PaddleTensor and a ZeroCopyTensor
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<ZeroCopyTensor> &ref_outputs) {
EXPECT_GT(outputs.size(), 0UL);
EXPECT_EQ(outputs.size(), ref_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &ref_out = ref_outputs[i];
size_t size = VecReduceToInt(out.shape);
EXPECT_GT(size, 0UL);
int ref_size = 0; // this is the number of elements not memory size
PaddlePlace place;
switch (out.dtype) {
case PaddleDType::INT64: {
int64_t *pdata = static_cast<int64_t *>(out.data.data());
int64_t *pdata_ref = ref_out.data<int64_t>(&place, &ref_size);
EXPECT_EQ(size, ref_size);
for (size_t j = 0; j < size; ++j) {
EXPECT_EQ(pdata_ref[j], pdata[j]);
}
break;
}
case PaddleDType::FLOAT32: {
float *pdata = static_cast<float *>(out.data.data());
float *pdata_ref = ref_out.data<float>(&place, &ref_size);
EXPECT_EQ(size, ref_size);
for (size_t j = 0; j < size; ++j) {
CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
}
break;
}
case PaddleDType::INT32: {
int32_t *pdata = static_cast<int32_t *>(out.data.data());
int32_t *pdata_ref = ref_out.data<int32_t>(&place, &ref_size);
EXPECT_EQ(size, ref_size);
for (size_t j = 0; j < size; ++j) {
EXPECT_EQ(pdata_ref[j], pdata[j]);
}
break;
}
}
}
}
std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const PaddlePredictor::Config *config, bool use_analysis = true) {
const auto *analysis_config =
......@@ -205,61 +251,106 @@ void GetInputPerBatch(const std::vector<std::vector<int64_t>> &in,
}
}
void TestOneThreadPrediction(
const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto predictor = CreateTestPredictor(config, use_analysis);
void ConvertPaddleTensorToZeroCopyTensor(
PaddlePredictor *predictor, const std::vector<PaddleTensor> &inputs) {
for (size_t i = 0; i < inputs.size(); i++) {
auto input = inputs[i];
auto tensor = predictor->GetInputTensor(input.name);
tensor->Reshape(input.shape);
tensor->SetLoD({input.lod});
if (input.dtype == PaddleDType::INT64) {
ZeroCopyTensorAssignData<int64_t>(tensor.get(), input.data);
} else if (input.dtype == PaddleDType::FLOAT32) {
ZeroCopyTensorAssignData<float>(tensor.get(), input.data);
} else if (input.dtype == PaddleDType::INT32) {
ZeroCopyTensorAssignData<int32_t>(tensor.get(), input.data);
} else {
LOG(ERROR) << "unsupported feed type " << input.dtype;
}
}
}
// warmup run
LOG(INFO) << "Warm up run...";
{
Timer warmup_timer;
warmup_timer.tic();
void PredictionWarmUp(PaddlePredictor *predictor,
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads,
int tid) {
int batch_size = FLAGS_batch_size;
LOG(INFO) << "Running thread " << tid << ", warm up run...";
if (FLAGS_zero_copy) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
}
Timer warmup_timer;
warmup_timer.tic();
if (!FLAGS_zero_copy) {
predictor->Run(inputs[0], outputs, batch_size);
PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
} else {
predictor->ZeroCopyRun();
}
PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
}
LOG(INFO) << "Run " << num_times << " times...";
{
Timer run_timer;
run_timer.tic();
void PredictionRun(PaddlePredictor *predictor,
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads,
int tid) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
Timer run_timer;
double elapsed_time = 0;
#ifdef WITH_GPERFTOOLS
ProfilerStart("paddle_inference.prof");
ProfilerStart("paddle_inference.prof");
#endif
for (int i = 0; i < num_times; i++) {
for (size_t j = 0; j < inputs.size(); j++) {
predictor->Run(inputs[j], outputs, batch_size);
if (!FLAGS_zero_copy) {
run_timer.tic();
for (size_t i = 0; i < inputs.size(); i++) {
for (int j = 0; j < num_times; j++) {
predictor->Run(inputs[i], outputs, batch_size);
}
}
elapsed_time = run_timer.toc();
} else {
for (size_t i = 0; i < inputs.size(); i++) {
ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
run_timer.tic();
for (int j = 0; j < num_times; j++) {
predictor->ZeroCopyRun();
}
elapsed_time += run_timer.toc();
}
}
#ifdef WITH_GPERFTOOLS
ProfilerStop();
ProfilerStop();
#endif
double latency = run_timer.toc() / (num_times > 1 ? num_times : 1);
PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
if (FLAGS_record_benchmark) {
Benchmark benchmark;
benchmark.SetName(FLAGS_model_name);
benchmark.SetBatchSize(batch_size);
benchmark.SetLatency(latency);
benchmark.PersistToFile("benchmark_record.txt");
}
PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times,
inputs.size());
if (FLAGS_record_benchmark) {
Benchmark benchmark;
benchmark.SetName(FLAGS_model_name);
benchmark.SetBatchSize(batch_size);
benchmark.SetLatency(elapsed_time / num_times);
benchmark.PersistToFile("benchmark_record.txt");
}
}
void TestOneThreadPrediction(
const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
auto predictor = CreateTestPredictor(config, use_analysis);
PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
PredictionRun(predictor.get(), inputs, outputs, 1, 0);
}
void TestMultiThreadPrediction(
const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = true) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreateTestPredictor(config, use_analysis));
......@@ -267,7 +358,6 @@ void TestMultiThreadPrediction(
predictors.emplace_back(predictors.front()->Clone());
}
size_t total_time{0};
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local inputs and outputs.
......@@ -280,34 +370,8 @@ void TestMultiThreadPrediction(
->SetMkldnnThreadID(static_cast<int>(tid) + 1);
}
#endif
// warmup run
LOG(INFO) << "Running thread " << tid << ", warm up run...";
{
Timer warmup_timer;
warmup_timer.tic();
predictor->Run(inputs[0], outputs, batch_size);
PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
}
LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
{
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
for (const auto &input : inputs) {
ASSERT_TRUE(predictor->Run(input, &outputs_tid));
}
}
auto time = timer.toc();
total_time += time;
PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
inputs.size());
}
PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid);
PredictionRun(predictor.get(), inputs, outputs, num_threads, tid);
});
}
for (int i = 0; i < num_threads; ++i) {
......@@ -367,6 +431,31 @@ void CompareNativeAndAnalysis(
CompareResult(analysis_outputs, native_outputs);
}
void CompareAnalysisAndZeroCopy(
PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs,
const std::vector<std::string> &outputs_name) {
int batch_size = FLAGS_batch_size;
// analysis
std::vector<PaddleTensor> analysis_outputs;
auto predictor = CreateTestPredictor(config, true);
predictor->Run(inputs[0], &analysis_outputs, batch_size);
// analysis + zero_copy
std::vector<ZeroCopyTensor> zerocopy_outputs;
reinterpret_cast<AnalysisConfig *>(config)->SwitchUseFeedFetchOps(false);
predictor = CreateTestPredictor(config, true);
ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]);
predictor->ZeroCopyRun();
for (size_t i = 0; i < outputs_name.size(); i++) {
ZeroCopyTensor zerocopy_output =
*predictor->GetOutputTensor(outputs_name[i]).get();
zerocopy_outputs.emplace_back(zerocopy_output);
LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output);
}
// compare
CompareResult(analysis_outputs, zerocopy_outputs);
}
template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
std::stringstream ss;
......
......@@ -30,19 +30,20 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
URL ${URL}/${FILENAME}
DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR}
INSTALL_COMMAND ""
)
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if (NOT EXISTS ${WORD2VEC_INSTALL_DIR})
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32)
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
......
add_subdirectory(detail)
add_subdirectory(allocation)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler)
cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memory
......
......@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU)
......
......@@ -12,8 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <memory>
#include <string>
#include <utility>
#include <vector>
......@@ -23,9 +22,11 @@
#endif
#include "glog/logging.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
......@@ -328,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
} // namespace legacy
namespace allocation {
LegacyMemMonitor GPUMemMonitor;
Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
return new Allocation(ptr, size, place_);
auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc;
}
void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place());
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation;
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace memory {
......@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
// NOTE(zcd): Do not use GpuMemcpySync as much as possible.
// because GpuMemcpySync issues the copying command to the default stream,
// which will make two commands from different streams cannot run concurrently.
// Reference:
// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
......@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
......@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
}
} else {
if (stream) {
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream);
} else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num);
}
......@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
}
......@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
}
......
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
#ifdef PADDLE_WITH_CUDA
......@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$
)DOC";
class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of acos operator");
AddOutput("Out", "Output of acos operator");
AddComment(R"DOC(
Arccosine Activation Operator.
$$out = \cos^{-1}(x)$$
)DOC");
}
};
class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of asin operator");
AddOutput("Out", "Output of asin operator");
AddComment(R"DOC(
Arcsine Activation Operator.
$$out = \sin^{-1}(x)$$
)DOC");
}
};
class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of atan operator");
AddOutput("Out", "Output of atan operator");
AddComment(R"DOC(
Arctanh Activation Operator.
$$out = \tanh^{-1}(x)$$
)DOC");
}
};
class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......@@ -543,7 +587,10 @@ namespace ops = paddle::operators;
__macro(SoftShrink, softshrink); \
__macro(Abs, abs); \
__macro(Cos, cos); \
__macro(Acos, acos); \
__macro(Sin, sin); \
__macro(Asin, asin); \
__macro(Atan, atan); \
__macro(Round, round); \
__macro(Log, log); \
__macro(Square, square); \
......
......@@ -39,9 +39,8 @@ namespace operators {
Please refer to the layer_helper.py and get the details.
*/
static std::unordered_set<std::string> InplaceOpSet = {
"sigmoid", "exp", "relu", "tanh", "sqrt", "ceil",
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
};
"sigmoid", "exp", "relu", "tanh", "sqrt", "ceil",
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"};
static bool IsInplace(const std::string& op) {
bool inplace = InplaceOpSet.count(op);
......@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct Acos {
HOSTDEVICE T operator()(const T& val) const { return acos(val); }
};
template <>
struct Acos<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(acos(static_cast<float>(val)));
}
};
// Acos(x) = acos(x)
template <typename T>
struct AcosFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Acos<T>());
}
};
// acos'(x) = -1/sqrt(1-x^2)
template <typename T>
struct AcosGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
-dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
}
};
template <typename T>
struct Asin {
HOSTDEVICE T operator()(const T& val) const { return asin(val); }
};
template <>
struct Asin<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(asin(static_cast<float>(val)));
}
};
// Asin(x) = asin(x)
template <typename T>
struct AsinFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Asin<T>());
}
};
// asin'(x) = 1/sqrt(1-x^2)
template <typename T>
struct AsinGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
}
};
template <typename T>
struct Atan {
HOSTDEVICE T operator()(const T& val) const { return atan(val); }
};
template <>
struct Atan<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(atan(static_cast<float>(val)));
}
};
// Atan(x) = atan(x)
template <typename T>
struct AtanFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Atan<T>());
}
};
// atan'(x) = 1 / (1 + x^2)
template <typename T>
struct AtanGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
}
};
// round(x) = [x]
template <typename T>
struct RoundFunctor : public BaseActivationFunctor<T> {
......@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(atan, AtanFunctor, AtanGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, FloorFunctor, ZeroGradFunctor); \
__macro(cos, CosFunctor, CosGradFunctor); \
__macro(acos, AcosFunctor, AcosGradFunctor); \
__macro(sin, SinFunctor, SinGradFunctor); \
__macro(asin, AsinFunctor, AsinGradFunctor); \
__macro(round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \
......
include(operators)
register_operators(DEPS naive_executor)
cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
......@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
......@@ -26,14 +27,6 @@ namespace operators {
using StepScopeVar = std::vector<framework::Scope *>;
using LoDTensor = framework::LoDTensor;
static constexpr char kStepBlock[] = "sub_block";
static constexpr char kCondition[] = "Condition";
static constexpr char kStepScopes[] = "StepScopes";
static constexpr char kX[] = "X";
static constexpr char kXGRAD[] = "X@GRAD";
static constexpr char kOutputs[] = "Out";
static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
namespace { // NOLINT
static std::string GetSkipEagerDeletionVarsDebugString(
const std::vector<std::string> &vars) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include <string>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace operators {
// OpVariant is a wrapper class of OpDesc and OperatorBase
// So that API would be the same.
class OpVariant {
struct InputsVisitor
: public boost::static_visitor<const framework::VariableNameMap *> {
template <typename OpType>
const framework::VariableNameMap *operator()(const OpType *op) const {
return &(op->Inputs());
}
};
struct OutputsVisitor
: public boost::static_visitor<const framework::VariableNameMap *> {
template <typename OpType>
const framework::VariableNameMap *operator()(const OpType *op) const {
return &(op->Outputs());
}
};
struct AttributeMapVisitor
: public boost::static_visitor<const framework::AttributeMap *> {
const framework::AttributeMap *operator()(
const framework::OpDesc *op) const {
return &(op->GetAttrMap());
}
const framework::AttributeMap *operator()(
const framework::OperatorBase *op) const {
return &(op->Attrs());
}
};
struct RawPointerVisitor : public boost::static_visitor<const void *> {
template <typename OpType>
const void *operator()(const OpType *op) const {
return op;
}
};
public:
OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT
OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT
const framework::VariableNameMap &Inputs() const {
return *boost::apply_visitor(InputsVisitor(), op_);
}
const framework::VariableNameMap &Outputs() const {
return *boost::apply_visitor(OutputsVisitor(), op_);
}
const framework::AttributeMap &Attrs() const {
return *boost::apply_visitor(AttributeMapVisitor(), op_);
}
template <typename AttrType>
const AttrType &Attr(const std::string &name) const {
auto &attrs = Attrs();
auto it = attrs.find(name);
PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
return boost::get<AttrType>(it->second);
}
bool operator==(const OpVariant &other) const {
return RawPointer() == other.RawPointer();
}
const void *RawPointer() const {
return boost::apply_visitor(RawPointerVisitor(), op_);
}
int which() const { return static_cast<int>(op_.which()); }
struct Hasher {
size_t operator()(const OpVariant &op) const {
return reinterpret_cast<size_t>(op.RawPointer());
}
};
private:
const boost::variant<const framework::OperatorBase *,
const framework::OpDesc *>
op_;
};
static std::string GetDebugString(const std::vector<std::string> &names) {
if (names.empty()) return "";
std::string ret = names[0];
for (size_t i = 1; i < names.size(); ++i) {
ret += (" " + names[i]);
}
return ret;
}
// Set skip variables of while_op and while_grad_op
// These variables should be skipped when eager deletion enables.
// It is because:
// 1. while_grad_op needs some variables defined in while_op.
// 2. while_grad_op needs variables from the previous time step.
static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
VLOG(2) << "Prepare to skip " << attr.size()
<< " var(s): " << GetDebugString(attr);
attrs[kSkipEagerDeletionVars] = std::move(attr);
}
// Check whether the forward while_op and while_grad_op match
// The program may have many while_ops.
static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op,
const OpVariant &grad_op) {
return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) &&
fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs);
}
// Test whether the variable is skippable in forward while_op
// The variable is skippable in while_op when the variable used in while_grad
// is not from grad_block.
static bool IsSkippableVar(const std::string &name,
framework::BlockDesc *grad_block) {
return name != framework::kEmptyVarName && !grad_block->HasVar(name);
}
static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op,
const OpVariant &bwd_op) {
auto *grad_block = bwd_op.Attr<framework::BlockDesc *>(kStepBlock);
// Find all skippable variables in forward while_op
std::unordered_set<std::string> forward_skip_vars;
for (auto *op_desc : grad_block->AllOps()) {
for (auto &in_arg_name : op_desc->InputArgumentNames()) {
if (IsSkippableVar(in_arg_name, grad_block)) {
forward_skip_vars.insert(in_arg_name);
}
}
for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
if (IsSkippableVar(out_arg_name, grad_block)) {
forward_skip_vars.insert(out_arg_name);
}
}
}
SetSkipVars(fwd_op, std::vector<std::string>(forward_skip_vars.begin(),
forward_skip_vars.end()));
// Find all skippable variables in while_grad_op
// The skipped variables are those which would be used across time steps.
auto &fwd_input = fwd_op.Inputs().at(kX);
auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX));
PADDLE_ENFORCE_EQ(
fwd_input.size(), in_grads.size(),
"Backward input gradient number does not match forward input number.");
std::unordered_set<std::string> backward_skip_vars;
for (size_t i = 0; i < in_grads.size(); ++i) {
if (in_grads[i] == framework::kEmptyVarName) {
continue;
}
backward_skip_vars.insert(in_grads[i]);
backward_skip_vars.insert(framework::GradVarName(fwd_input[i]));
}
SetSkipVars(bwd_op, std::vector<std::string>(backward_skip_vars.begin(),
backward_skip_vars.end()));
}
// Find all while_ops and while_grad_ops in the graph or program
// The while_grad_op and while_op may located in different blocks
// So we should traverse all blocks in the program and find them out.
static void FindAllWhileAndWhileGradOp(std::vector<OpVariant> *while_ops,
std::vector<OpVariant> *while_grad_ops) {
PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size());
if (while_ops->empty()) return;
const auto *program =
while_ops->front().Attr<framework::BlockDesc *>(kStepBlock)->Program();
for (size_t i = 1; i < program->Size(); ++i) {
auto &block = program->Block(i);
for (size_t j = 0; j < block.OpSize(); ++j) {
auto *op = block.Op(j);
if (op->Type() == "while") {
while_ops->emplace_back(op);
} else if (op->Type() == "while_grad") {
while_grad_ops->emplace_back(op);
}
}
}
PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(),
"There are extra while_grad ops in the graph or program");
}
static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(
std::vector<OpVariant> *while_ops, std::vector<OpVariant> *while_grad_ops) {
FindAllWhileAndWhileGradOp(while_ops, while_grad_ops);
VLOG(2) << "Found while op num: " << while_ops->size()
<< ", while grad op num: " << while_grad_ops->size();
if (while_grad_ops->empty()) {
return;
}
std::unordered_set<OpVariant, OpVariant::Hasher> while_op_set(
while_ops->begin(), while_ops->end());
for (auto &bwd_op : *while_grad_ops) {
const OpVariant *matched_fwd_op = nullptr;
for (auto &fwd_op : while_op_set) {
if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) {
PADDLE_ENFORCE(matched_fwd_op == nullptr,
"Found multiple matched while ops");
matched_fwd_op = &fwd_op;
}
}
PADDLE_ENFORCE_NOT_NULL(matched_fwd_op,
"Cannot find matched forward while op.");
ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op);
while_op_set.erase(*matched_fwd_op);
}
}
void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
int block_id,
const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) {
// If block_id is not 0, returns
// This is because all while_ops and while_grad_ops in the whole program
// would be processed when block_id is 0 (i.e. when Executor::Run() or
// ParallelExecutor constructs).
// What's more, all while_ops and while_grad_ops must be processed when
// block_id is zero. If not, while_op may run first and erase variables
// used in while_grad_op, and in this moment, while_grad_ops may be not
// constructed yet.
if (block_id != 0) return;
std::vector<OpVariant> fwd_ops, bwd_ops;
for (auto &op : all_ops) {
if (op->Type() == "while") {
fwd_ops.emplace_back(op.get());
} else if (op->Type() == "while_grad") {
bwd_ops.emplace_back(op.get());
}
}
PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
}
void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
const std::vector<framework::OperatorBase *> &while_ops,
const std::vector<framework::OperatorBase *> &while_grad_ops) {
std::vector<OpVariant> fwd_ops, bwd_ops;
fwd_ops.reserve(while_ops.size());
for (auto *op : while_ops) {
fwd_ops.emplace_back(op);
}
bwd_ops.reserve(while_grad_ops.size());
for (auto *op : while_grad_ops) {
bwd_ops.emplace_back(op);
}
PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
}
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,19 +14,30 @@
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace framework {
namespace details {
namespace operators {
class EagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
static constexpr char kStepBlock[] = "sub_block";
static constexpr char kCondition[] = "Condition";
static constexpr char kStepScopes[] = "StepScopes";
static constexpr char kX[] = "X";
static constexpr char kXGRAD[] = "X@GRAD";
static constexpr char kOutputs[] = "Out";
static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
} // namespace details
} // namespace framework
void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
int block_id,
const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops);
void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
const std::vector<framework::OperatorBase *> &while_ops,
const std::vector<framework::OperatorBase *> &while_grad_ops);
} // namespace operators
} // namespace paddle
......@@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
Tensor track;
int* track_value =
track.mutable_data<int>(emission_dims, platform::CPUPlace());
auto ker = jit::Get<jit::kCRFDecoding, jit::CRFDecodingTuples<T>,
platform::CPUPlace>(tag_num);
auto ker =
jit::KernelFuncs<jit::CRFDecodingTuple<T>, platform::CPUPlace>::Cache()
.At(tag_num);
ker(static_cast<int>(seq_len), x, w, alpha_value, track_value, tag_num);
T max_score = -std::numeric_limits<T>::max();
int max_i = 0;
......
......@@ -20,7 +20,7 @@ namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string& type) {
inline BoxCodeType GetBoxCodeType(const std::string &type) {
if (type == "encode_center_size") {
return BoxCodeType::kEncodeCenterSize;
} else if (type == "decode_center_size") {
......@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
template <typename DeviceContext, typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
public:
void EncodeCenterSize(const framework::Tensor* target_box,
const framework::Tensor* prior_box,
const framework::Tensor* prior_box_var,
void EncodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor *prior_box,
const framework::Tensor *prior_box_var,
const bool normalized,
const std::vector<float> variance, T* output) const {
const std::vector<float> variance, T *output) const {
int64_t row = target_box->dims()[0];
int64_t col = prior_box->dims()[0];
int64_t len = prior_box->dims()[1];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
size_t offset = i * col * len + j * len;
T prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false);
T prior_box_height = prior_box_data[j * len + 3] -
......@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
target_box_data[i * len + 1] +
(normalized == false);
size_t offset = i * col * len + j * len;
output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] =
......@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height));
if (prior_box_var) {
int prior_var_offset = j * len;
output[offset] /= prior_box_var_data[prior_var_offset];
output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
output[offset + 3] /= prior_box_var_data[prior_var_offset + 3];
} else if (!(variance.empty())) {
}
}
if (prior_box_var) {
const T *prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
int prior_var_offset = j * len;
output[offset + k] /= prior_box_var_data[prior_var_offset + k];
}
}
}
} else if (!(variance.empty())) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
output[offset + k] /= static_cast<T>(variance[k]);
}
}
}
}
}
template <int axis, int var_size>
void DecodeCenterSize(const framework::Tensor* target_box,
const framework::Tensor* prior_box,
const framework::Tensor* prior_box_var,
void DecodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor *prior_box,
const framework::Tensor *prior_box_var,
const bool normalized, std::vector<float> variance,
T* output) const {
T *output) const {
int64_t row = target_box->dims()[0];
int64_t col = target_box->dims()[1];
int64_t len = target_box->dims()[2];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
int prior_box_offset = 0;
T var_data[4] = {1., 1., 1., 1.};
T* var_ptr = var_data;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
T var_data[4] = {1., 1., 1., 1.};
T *var_ptr = var_data;
size_t offset = i * col * len + j * len;
prior_box_offset = axis == 0 ? j * len : i * len;
int prior_box_offset = axis == 0 ? j * len : i * len;
T prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] +
(normalized == false);
......@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T target_box_width = 0, target_box_height = 0;
int prior_var_offset = axis == 0 ? j * len : i * len;
if (var_size == 2) {
std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
std::memcpy(var_ptr, prior_box_var->data<T>() + prior_var_offset,
4 * sizeof(T));
} else if (var_size == 1) {
var_ptr = reinterpret_cast<T*>(variance.data());
var_ptr = reinterpret_cast<T *>(variance.data());
}
T box_var_x = *var_ptr;
T box_var_y = *(var_ptr + 1);
......@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
}
}
void Compute(const framework::ExecutionContext& context) const override {
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox");
void Compute(const framework::ExecutionContext &context) const override {
auto *prior_box = context.Input<framework::Tensor>("PriorBox");
auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
auto *output_box = context.Output<framework::Tensor>("OutputBox");
std::vector<float> variance = context.Attr<std::vector<float>>("variance");
const int axis = context.Attr<int>("axis");
if (target_box->lod().size()) {
......@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
T* output = output_box->data<T>();
T *output = output_box->data<T>();
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
variance, output);
......
......@@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
constexpr int simd_width = 16;
int C = c / simd_width;
auto multiply = jit::Get<jit::kNCHW16CMulNC, jit::NCHW16CMulNCTuples<T>,
platform::CPUPlace>(0);
auto multiply = jit::KernelFuncs<jit::NCHW16CMulNCTuple<T>,
platform::CPUPlace>::Cache()
.At(0);
#pragma omp parallel for collapse(2)
for (int ni = 0; ni < n; ni++) {
for (int ci = 0; ci < C; ci++) {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
......@@ -76,6 +77,63 @@ $$Out = \frac{scale*X}{ max_range }$$
}
};
class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("X"),
"Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
PADDLE_ENFORCE(ctx->HasInputs("Scales"),
"Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp "
"should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class FakeChannelWiseDequantizeMaxAbsOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input with float-32/64 type is the "
"low precision tensor.");
AddInput("Scales",
"(Tensors) The scales in quantization stage. "
"Now, `Scales` is a vector with at most two tensors. "
"If Scales has two elements, the second tensor should only have "
"one value.")
.AsDuplicable();
AddOutput("Out",
"(Tensor) The output is the dequantized high "
"precision tensor.");
AddAttr<std::vector<int>>(
"quant_bits",
"Quantization bit numbers in quantization stage. "
"The size of `quant_bits` should be equal to the size of `Scales`.")
.SetDefault({8});
AddComment(R"DOC(
FakeChannelWiseDequantizeMaxAbsOp operator.
This calculation is an opposite operation of FakeChannelWiseQuantizeMaxAbsOp:
$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$
In the above formula, the range value of $c$ can be represented as $0 \leq c \lt \ the\ channel\ number\ of\ X$.
Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, and it is called $n$ in the formula.
Notes: In general, the per-channel quantization is only applied to weights and the activations use per-layer quantization.
)DOC");
}
};
} // namespace operators
} // namespace paddle
......@@ -88,3 +146,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
ops::FakeDequantizeMaxAbsKernel<CPU, float>,
ops::FakeDequantizeMaxAbsKernel<CPU, double>);
REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsOp,
ops::FakeChannelWiseDequantizeMaxAbsOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, float>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
......@@ -55,3 +55,7 @@ using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, float>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, double>);
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
......@@ -45,5 +46,42 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<framework::Tensor>("X");
auto scales = ctx.MultiInput<framework::Tensor>("Scales");
auto* out = ctx.Output<framework::Tensor>("Out");
PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
"The number of first scale values must be the same with "
"first dimension value of Input(X).");
auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
int max_range = std::pow(2, quant_bits[0] - 1) - 1;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
auto dequant = DequantizeFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
dequant(dev_ctx, &one_channel_in, &one_channel_scale,
static_cast<T>(max_range), &one_channel_out);
}
if (scales.size() == 2) {
PADDLE_ENFORCE_EQ(
scales[1]->numel(), 1,
"The second scale tensor should only have one value at now.");
max_range = std::pow(2, quant_bits[1] - 1) - 1;
dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -134,6 +134,60 @@ $$Out = round(X/scale * range)$$
}
};
class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeChannelWiseQuantizeOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("OutScales"),
"Output(Scales) of FakeChannelWiseQuantizeOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
ctx.GetPlace());
}
};
class FakeChannelWiseQuantizeAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddOutput("Out",
"(Tensor) Output of quantized low level tensor, "
"but also saved as float data type.");
AddOutput("OutScales", "(Tensor) Current channel wise scale");
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16.");
});
AddComment(R"DOC(
The scale of FakeChannelWiseQuantize operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c})$$
In above three formulas, the range value of c is as follow:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantizeRangeAbsMaxOp(const std::string& type,
......@@ -218,3 +272,10 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CPU, float>);
......@@ -174,5 +174,7 @@ namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
......@@ -63,6 +63,39 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scales = context.Output<framework::Tensor>("OutScales");
T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
out->mutable_data<T>(context.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto find_abs_max = FindAbsMaxFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel = in->Slice(i, i + 1);
const T* one_channel_data = one_channel.data<T>();
find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
&out_scales_data[i]);
}
auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
&one_channel_out);
}
}
};
template <typename DeviceContext, typename T>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public:
......
......@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
if (ctx->IsRuntime()) {
return;
}
PADDLE_ENFORCE(ctx->HasInput("W"),
"Input W of FusedEmbeddingSeqPoolOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Ids"),
......@@ -91,6 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) "
"Sparse update.")
.SetDefault(false);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
AddComment(R"DOC(
FusedEmbeddingSeqPool Operator.
......
......@@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor {
out_width, jit::SeqPoolType::kSum);
for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
attr.index_height = ids_lod[i + 1] - ids_lod[i];
auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
platform::CPUPlace>(attr);
auto emb_seqpool =
jit::KernelFuncs<jit::EmbSeqPoolTuple<T>, platform::CPUPlace>::Cache()
.At(attr);
emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
&attr);
}
......@@ -120,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
auto *ids = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
// runtime shape
d_table->set_height(table_dim[0]);
auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel();
......@@ -135,8 +138,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
const T *d_output_data = d_output->data<T>();
auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
platform::CPUPlace>(out_width);
auto vbroadcast =
jit::KernelFuncs<jit::VBroadcastTuple<T>, platform::CPUPlace>::Cache()
.At(out_width);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
const T *src = d_output_data + i * out_width;
......
......@@ -182,29 +182,32 @@ class FusionGRUKernel : public framework::OpKernel<T> {
const int total_T = x_dims[0]; \
const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const jit::gru_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("activation"))); \
jit::gru_t one_step; \
auto ComputeH1 = \
jit::Get<jit::kGRUH1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
auto ComputeHtPart1 = \
jit::Get<jit::kGRUHtPart1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
auto ComputeHtPart2 = \
jit::Get<jit::kGRUHtPart2, jit::GRUTuples<T>, platform::CPUPlace>(attr); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
auto place = ctx.GetPlace(); \
#define INIT_OTHER_DEFINES \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D2 = D * 2; \
const jit::gru_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("activation"))); \
jit::gru_t one_step; \
auto ComputeH1 = \
jit::KernelFuncs<jit::GRUH1Tuple<T>, platform::CPUPlace>::Cache().At( \
attr); \
auto ComputeHtPart1 = \
jit::KernelFuncs<jit::GRUHtPart1Tuple<T>, platform::CPUPlace>::Cache() \
.At(attr); \
auto ComputeHtPart2 = \
jit::KernelFuncs<jit::GRUHtPart2Tuple<T>, platform::CPUPlace>::Cache() \
.At(attr); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
auto place = ctx.GetPlace(); \
T* xx_data = xx->mutable_data<T>(place)
void SeqCompute(const framework::ExecutionContext& ctx) const {
......
......@@ -235,32 +235,34 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
const int D = wh_dims[0]; \
const int D4 = wh_dims[1]
#define INIT_OTHER_DEFINES \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \
const T* wp_data = bias->data<T>() + D4; \
/* for peephole only*/ \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
} \
const jit::lstm_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")), \
use_peepholes); \
jit::lstm_t one_step; \
one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \
auto ComputeC1H1 = \
jit::Get<jit::kLSTMC1H1, jit::LSTMTuples<T>, platform::CPUPlace>(attr); \
auto ComputeCtHt = \
jit::Get<jit::kLSTMCtHt, jit::LSTMTuples<T>, platform::CPUPlace>(attr)
#define INIT_OTHER_DEFINES \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/ \
const T* wp_data = bias->data<T>() + D4; \
/* for peephole only*/ \
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
} \
const jit::lstm_attr_t attr( \
D, jit::to_kerneltype(ctx.Attr<std::string>("gate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("candidate_activation")), \
jit::to_kerneltype(ctx.Attr<std::string>("cell_activation")), \
use_peepholes); \
jit::lstm_t one_step; \
one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \
auto ComputeC1H1 = \
jit::KernelFuncs<jit::LSTMC1H1Tuple<T>, platform::CPUPlace>::Cache().At( \
attr); \
auto ComputeCtHt = \
jit::KernelFuncs<jit::LSTMCtHtTuple<T>, platform::CPUPlace>::Cache().At( \
attr)
// Wh GEMM
#define GEMM_WH_ADDON(bs, prev, out) \
......
......@@ -82,9 +82,11 @@ template <typename T>
static void fc_relu(const T* x, const T* w, const T* b, T* y,
const jit::matmul_attr_t& attr) {
auto matmul =
jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
attr);
auto addbias_relu =
jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n);
jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
attr.n);
matmul(x, w, y, &attr);
T* dst = y;
for (int i = 0; i < attr.m; ++i) {
......
......@@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
attr.type = jit::SeqPoolType::kSqrt;
}
auto seqpool =
jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(
attr);
size_t n = ins.size();
size_t dst_step_size = n * w;
......
......@@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
int o_numel = attr.m * attr.n;
auto vsquare_x =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m *
attr.k);
jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
attr.m * attr.k);
auto vsquare_y =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k *
attr.n);
jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
attr.k * attr.n);
auto vsquare_xy =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel);
jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
o_numel);
auto vsub =
jit::Get<jit::kVSub, jit::XYZNTuples<T>, platform::CPUPlace>(o_numel);
jit::KernelFuncs<jit::VSubTuple<T>, platform::CPUPlace>::Cache().At(
o_numel);
auto vscal =
jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel);
jit::KernelFuncs<jit::VScalTuple<T>, platform::CPUPlace>::Cache().At(
o_numel);
auto matmul =
jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr);
jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
attr);
const T* x_data = x->data<T>();
const T* y_data = y->data<T>();
......
......@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->IsRuntime()) {
return;
}
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of HashOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
......@@ -57,6 +54,8 @@ $$Out = scale * X$$
)DOC");
AddAttr<int>("num_hash", "").SetDefault(1);
AddAttr<int>("mod_by", "").SetDefault(100000);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
}
};
......
......@@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n")
file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place)
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
......
......@@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \
void BenchJITKernel_##name##_##dtype##_##place##_::Run()
#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU)
void RUN_ALL_BENCHMARK() {
for (auto p : g_all_benchmarks) {
if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) {
......@@ -90,11 +88,11 @@ std::vector<int> TestSizes() {
return s;
}
template <typename KernelTuples, typename... Args>
template <typename KernelTuple, typename... Args>
struct BenchFunc {
// return this function avg time
// TODO(TJ): clear cache every time
double operator()(const typename KernelTuples::func_type tgt, Args... args) {
double operator()(const typename KernelTuple::func_type tgt, Args... args) {
for (int i = 0; i < FLAGS_burning; ++i) {
tgt(args...);
}
......@@ -109,40 +107,17 @@ struct BenchFunc {
namespace jit = paddle::operators::jit;
template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
typename... Args>
void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
BenchFunc<KernelTuples, Args...> benchmark;
template <typename KernelTuple, typename PlaceType, typename... Args>
void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
BenchFunc<KernelTuple, Args...> benchmark;
std::vector<std::pair<std::string, double>> infos;
// test refer
auto refer = jit::GetRefer<KT, KernelTuples>();
if (!refer) {
LOG(FATAL) << "Refer can not be empty!";
auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
for (auto f : funcs) {
infos.push_back(std::make_pair(f.first, benchmark(f.second, args...)));
}
infos.push_back(std::make_pair("Refer", benchmark(refer, args...)));
// test jitcode
auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
if (jitcode) {
infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...)));
}
// test all impls in more
jit::KernelKey kkey(KT, PlaceType());
auto& pool = jit::KernelPool().Instance().AllKernels();
auto iter = pool.find(kkey);
if (iter != pool.end()) {
auto& impls = iter->second;
for (auto& impl : impls) {
auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
if (i && i->UseMe(attr)) {
auto more = i->GetFunc();
infos.push_back(
std::make_pair(i->ImplType(), benchmark(more, args...)));
}
}
}
// Test result from Get function
auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(attr);
if (!tgt) {
LOG(FATAL) << "Target can not be empty!";
}
......@@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
// print
std::ostringstream loginfos;
loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": ";
loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": "
<< attr << ": ";
for (auto pair : infos) {
loginfos << pair.first << " takes " << pair.second << " us; ";
}
......@@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
using Tensor = paddle::framework::Tensor;
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchXYZNKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelXYZN() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) {
Tensor x, y, z;
x.Resize({d});
......@@ -171,16 +148,16 @@ void BenchXYZNKernel() {
T* z_data = z.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
RandomVec<T>(d, y_data);
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(),
y.data<T>(), z_data, d);
BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y.data<T>(), z_data,
d);
// test inplace
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), z_data,
z_data, d);
BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), z_data, z_data, d);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchAXYNKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelAXYN() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) {
const T a = static_cast<T>(3);
Tensor x, y;
......@@ -189,26 +166,26 @@ void BenchAXYNKernel() {
T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
d);
BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), y_data, d);
// test inplace
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), x_data,
d);
BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), x_data, d);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchXRNKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelXRN() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) {
Tensor x;
RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
T res;
BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data<T>(), &res, d);
BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), &res, d);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchXYNKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelXYN() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) {
Tensor x, y;
x.Resize({d});
......@@ -216,12 +193,13 @@ void BenchXYNKernel() {
T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data<T>(), y_data, d);
BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y_data, d);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchLSTMKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelLSTM() {
using T = typename KernelTuple::data_type;
for (bool use_peephole : {true, false}) {
for (int d : TestSizes()) {
const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
......@@ -252,13 +230,14 @@ void BenchLSTMKernel() {
step.wp = wp_data;
step.checked = checked_data;
}
BenchAllImpls<KT, jit::LSTMTuples<T>, PlaceType>(attr, &step, &attr);
BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchGRUKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelGRU() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) {
const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
auto place = PlaceType();
......@@ -275,12 +254,13 @@ void BenchGRUKernel() {
step.gates = x_data;
step.ht_1 = ht_1_data;
step.ht = ht_data;
BenchAllImpls<KT, jit::GRUTuples<T>, PlaceType>(attr, &step, &attr);
BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchSeqPoolKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelSeqPool() {
using T = typename KernelTuple::data_type;
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
for (auto type : pool_types) {
......@@ -294,15 +274,15 @@ void BenchSeqPoolKernel() {
RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
y_data, &attr);
BenchAllImpls<KernelTuple, PlaceType>(attr, x_data, y_data, &attr);
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchEmbSeqPoolKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelEmbSeqPool() {
using T = typename KernelTuple::data_type;
std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
int64_t tbl_h = 1e4;
for (int tbl_w : {10, 16, 256}) {
......@@ -324,16 +304,17 @@ void BenchEmbSeqPoolKernel() {
tbl_h - 1);
const int64_t* idx_data = idx.data<int64_t>();
T* o_data = out.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType>(
attr, table_data, idx_data, o_data, &attr);
BenchAllImpls<KernelTuple, PlaceType>(attr, table_data, idx_data,
o_data, &attr);
}
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchSgdKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelSgd() {
using T = typename KernelTuple::data_type;
const T lr = 0.1;
auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
const int64_t upper) -> std::vector<int64_t> {
......@@ -364,15 +345,16 @@ void BenchSgdKernel() {
const T* grad_data = grad.data<T>();
const int64_t* rows_data = rows.data();
jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
BenchAllImpls<KT, jit::SgdTuples<T>, PlaceType>(
attr, &lr, param_data, grad_data, rows_data, param_data, &attr);
BenchAllImpls<KernelTuple, PlaceType>(attr, &lr, param_data, grad_data,
rows_data, param_data, &attr);
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchMatMulKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelMatMul() {
using T = typename KernelTuple::data_type;
for (int m : {1, 2, 3, 4}) {
for (int n : TestSizes()) {
for (int k : TestSizes()) {
......@@ -386,15 +368,16 @@ void BenchMatMulKernel() {
const T* b_data = b.data<T>();
T* c_data = c.mutable_data<T>(PlaceType());
const jit::matmul_attr_t attr{m, n, k};
BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data,
c_data, &attr);
BenchAllImpls<KernelTuple, PlaceType>(attr, a_data, b_data, c_data,
&attr);
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchSoftmaxKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelSoftmax() {
using T = typename KernelTuple::data_type;
for (int bs : {1, 2, 10}) {
for (int n : TestSizes()) {
Tensor x, y;
......@@ -403,14 +386,14 @@ void BenchSoftmaxKernel() {
RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n,
bs);
BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs);
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchLayerNormKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelLayerNorm() {
using T = typename KernelTuple::data_type;
const T epsilon = 9.99999975e-06;
for (int n : {1, 2, 10}) {
for (int x_dim_0 : {1, 9, 17, 50}) {
......@@ -439,16 +422,17 @@ void BenchLayerNormKernel() {
T* var_data = var.data<T>();
T* out_data = out.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>(
right, x_data, out_data, mean_data, var_data, scale_data, bias_data,
left, epsilon, right);
BenchAllImpls<KernelTuple, PlaceType>(right, x_data, out_data,
mean_data, var_data, scale_data,
bias_data, left, epsilon, right);
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchCRFDecodingKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelCRFDecoding() {
using T = typename KernelTuple::data_type;
constexpr int state_trans_base_idx = 2;
for (int seq_len : {1, 11, 17, 50}) {
for (int tag_num : TestSizes()) {
......@@ -468,14 +452,15 @@ void BenchCRFDecodingKernel() {
T* alpha_data = alpha.mutable_data<T>(PlaceType());
int* track_data = track.mutable_data<int>(PlaceType());
BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>(
tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
BenchAllImpls<KernelTuple, PlaceType>(tag_num, seq_len, x_data, w_data,
alpha_data, track_data, tag_num);
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchVBroadcastKernel() {
template <typename KernelTuple, typename PlaceType>
void BenchKernelVBroadcast() {
using T = typename KernelTuple::data_type;
for (int64_t w : {1, 16, 64, 100, 256}) {
Tensor x;
x.Resize({w});
......@@ -485,78 +470,86 @@ void BenchVBroadcastKernel() {
Tensor y;
y.Resize({h * w});
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
w, x_data, y_data, static_cast<int64_t>(h), w);
BenchAllImpls<KernelTuple, PlaceType>(w, x_data, y_data,
static_cast<int64_t>(h), w);
}
}
}
using T = float;
using CPUPlace = paddle::platform::CPUPlace;
#define BenchKernelVMul BenchKernelXYZN
#define BenchKernelVAdd BenchKernelXYZN
#define BenchKernelVAddRelu BenchKernelXYZN
#define BenchKernelVSub BenchKernelXYZN
// xyzn
BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }
#define BenchKernelVScal BenchKernelAXYN
#define BenchKernelVAddBias BenchKernelAXYN
// axyn
BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); }
#define BenchKernelVRelu BenchKernelXYN
#define BenchKernelVIdentity BenchKernelXYN
#define BenchKernelVSquare BenchKernelXYN
#define BenchKernelVExp BenchKernelXYN
#define BenchKernelVSigmoid BenchKernelXYN
#define BenchKernelVTanh BenchKernelXYN
#define BenchKernelVCopy BenchKernelXYN
// xrn
BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); }
BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }
#define BenchKernelHMax BenchKernelXRN
#define BenchKernelHSum BenchKernelXRN
// xyn
BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); }
BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }
// lstm and peephole
BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }
// gru functions
BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
// seq pool function
BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
// embedding seq pool function
BENCH_FP32_CPU(kEmbSeqPool) {
BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
}
#define BenchKernelLSTMCtHt BenchKernelLSTM
#define BenchKernelLSTMC1H1 BenchKernelLSTM
// sgd function
BENCH_FP32_CPU(kSgd) { BenchSgdKernel<jit::kSgd, T, CPUPlace>(); }
#define BenchKernelGRUH1 BenchKernelGRU
#define BenchKernelGRUHtPart1 BenchKernelGRU
#define BenchKernelGRUHtPart2 BenchKernelGRU
// matmul
BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
using CPUPlace = paddle::platform::CPUPlace;
// softmax
BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
#define BENCH_FP32_CPU(name) \
BENCH_JITKERNEL(name, FP32, CPU) { \
BenchKernel##name<jit::name##Tuple<float>, CPUPlace>(); \
}
// layernorm
BENCH_FP32_CPU(kLayerNorm) {
BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>();
}
// xyzn
BENCH_FP32_CPU(VMul);
BENCH_FP32_CPU(VAdd);
BENCH_FP32_CPU(VAddRelu);
BENCH_FP32_CPU(VSub);
// crfdecoding
BENCH_FP32_CPU(kCRFDecoding) {
BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
}
// axyn
BENCH_FP32_CPU(VScal);
BENCH_FP32_CPU(VAddBias);
// vbroadcast function
BENCH_FP32_CPU(kVBroadcast) {
BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
}
// xyn
BENCH_FP32_CPU(VRelu);
BENCH_FP32_CPU(VIdentity);
BENCH_FP32_CPU(VSquare);
BENCH_FP32_CPU(VExp);
BENCH_FP32_CPU(VSigmoid);
BENCH_FP32_CPU(VTanh);
BENCH_FP32_CPU(VCopy);
// xrn
BENCH_FP32_CPU(HMax);
BENCH_FP32_CPU(HSum);
// LSTM
BENCH_FP32_CPU(LSTMCtHt);
BENCH_FP32_CPU(LSTMC1H1);
// GRU
BENCH_FP32_CPU(GRUH1);
BENCH_FP32_CPU(GRUHtPart1);
BENCH_FP32_CPU(GRUHtPart2);
BENCH_FP32_CPU(LayerNorm);
BENCH_FP32_CPU(CRFDecoding);
BENCH_FP32_CPU(SeqPool);
BENCH_FP32_CPU(EmbSeqPool);
BENCH_FP32_CPU(MatMul);
BENCH_FP32_CPU(Softmax);
BENCH_FP32_CPU(Sgd);
BENCH_FP32_CPU(VBroadcast);
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
......
......@@ -13,6 +13,7 @@
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/act.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -81,7 +82,7 @@ void VActJitCode::genCode() {
#define DECLARE_ACT_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
public: \
bool UseMe(const int& attr) const override; \
bool CanBeUsed(const int& attr) const override; \
size_t CodeSize(const int& d) const override; \
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
......@@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid);
DECLARE_ACT_CREATOR(VTanh);
// TODO(TJ): tuning use me
bool VReluCreator::UseMe(const int& d) const {
bool VReluCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
}
bool VSquareCreator::UseMe(const int& d) const {
bool VSquareCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
}
bool VIdentityCreator::UseMe(const int& d) const {
bool VIdentityCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
}
bool VExpCreator::UseMe(const int& d) const {
bool VExpCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d < 32;
}
bool VSigmoidCreator::UseMe(const int& d) const {
bool VSigmoidCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
}
bool VTanhCreator::UseMe(const int& d) const {
bool VTanhCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
}
......
......@@ -13,6 +13,7 @@
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/blas.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() {
class NCHW16CMulNCCreator : public JitCodeCreator<int> {
public:
bool UseMe(const int& attr) const override {
bool CanBeUsed(const int& attr) const override {
return platform::MayIUse(platform::avx512f);
}
size_t CodeSize(const int& d) const override { return 256 * 1024; }
......@@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
#define DECLARE_BLAS_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
public: \
bool UseMe(const int& attr) const override { \
bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx) && attr <= 1024; \
} \
size_t CodeSize(const int& d) const override { \
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/embseqpool.h"
#include <stddef.h> // offsetof
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
......@@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() {
class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
public:
bool UseMe(const emb_seq_pool_attr_t& attr) const override {
bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx) &&
attr.table_width % YMM_FLOAT_BLOCK == 0;
}
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/gru.h"
#include <stddef.h> // offsetof
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -86,7 +87,7 @@ void GRUJitCode::genCode() {
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool UseMe(const gru_attr_t& attr) const override { \
bool CanBeUsed(const gru_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
......
......@@ -13,6 +13,7 @@
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/hopv.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -76,7 +77,7 @@ void HOPVJitCode::genCode() {
#define DECLARE_HOP_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
public: \
bool UseMe(const int& attr) const override { \
bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx); \
} \
size_t CodeSize(const int& d) const override { \
......
......@@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
virtual void genCode() = 0;
size_t getSize() const override { return CodeGenerator::getSize(); }
const unsigned char* getCodeInternal() override {
const unsigned char* getCodeInternal() const override {
const Xbyak::uint8* code = CodeGenerator::getCode();
return code;
}
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/lstm.h"
#include <stddef.h> // offsetof
#include <memory>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -114,7 +115,7 @@ void LSTMJitCode::genCode() {
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool UseMe(const lstm_attr_t& attr) const override { \
bool CanBeUsed(const lstm_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
......
......@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/jit/gen/matmul.h"
#include <stddef.h> // offsetof
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -98,7 +98,7 @@ void MatMulJitCode::genCode() {
class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
public:
bool UseMe(const matmul_attr_t& attr) const override {
bool CanBeUsed(const matmul_attr_t& attr) const override {
return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
}
......
......@@ -13,6 +13,7 @@
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/seqpool.h"
#include <memory>
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() {
class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
public:
bool UseMe(const seq_pool_attr_t& attr) const override {
bool CanBeUsed(const seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx);
}
size_t CodeSize(const seq_pool_attr_t& attr) const override {
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/sgd.h"
#include <stddef.h> // offsetof
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -104,7 +105,7 @@ void SgdJitCode::genCode() {
class SgdCreator : public JitCodeCreator<sgd_attr_t> {
public:
bool UseMe(const sgd_attr_t& attr) const override {
bool CanBeUsed(const sgd_attr_t& attr) const override {
return platform::MayIUse(platform::avx) &&
attr.grad_width % YMM_FLOAT_BLOCK == 0;
}
......
......@@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() {
class VBroadcastCreator : public JitCodeCreator<int64_t> {
public:
bool UseMe(const int64_t& w) const override {
bool CanBeUsed(const int64_t& w) const override {
return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const int64_t& w) const override {
......
......@@ -31,7 +31,7 @@ namespace paddle {
namespace operators {
namespace jit {
// refer do not need useme, it would be the last one.
// refer do not need CanBeUsed, it would be the last one.
void GenBase::dumpCode(const unsigned char* code) const {
if (code) {
static int counter = 0;
......
......@@ -31,9 +31,10 @@ class GenBase : public Kernel {
virtual ~GenBase() = default;
virtual std::string name() const = 0;
virtual size_t getSize() const = 0;
virtual const unsigned char* getCodeInternal() = 0;
virtual const unsigned char* getCodeInternal() const = 0;
const char* ImplType() const override { return "JitCode"; }
template <typename Func>
Func getCode() {
Func getCode() const {
const unsigned char* code = this->getCodeInternal();
if (FLAGS_dump_jitcode) {
this->dumpCode(code);
......@@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator {
virtual ~JitCodeCreator() = default;
// condition when this jit code can be used.
virtual bool UseMe(const Attr& attr) const = 0;
virtual bool CanBeUsed(const Attr& attr) const = 0;
// estimate this code size
virtual size_t CodeSize(const Attr& attr) const = 0;
......
......@@ -16,6 +16,8 @@
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility> // for std::move
#include <vector>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
......@@ -27,35 +29,34 @@ namespace paddle {
namespace operators {
namespace jit {
template <KernelType KT, typename KernelTuples, typename PlaceType>
template <typename KernelTuple, typename PlaceType>
inline typename std::enable_if<
std::is_same<typename KernelTuples::data_type, float>::value &&
std::is_same<typename KernelTuple::data_type, float>::value &&
std::is_same<PlaceType, platform::CPUPlace>::value,
typename KernelTuples::func_type>::type
GetJitCode(const typename KernelTuples::attr_type& attr) {
using Func = typename KernelTuples::func_type;
using Attr = typename KernelTuples::attr_type;
size_t key = JitCodeKey<Attr>(attr);
auto& codes = JitCodePool<KT>().Instance();
const Kernel*>::type
GetJitCode(const typename KernelTuple::attr_type& attr) {
using Attr = typename KernelTuple::attr_type;
int64_t key = JitCodeKey<Attr>(attr);
auto& codes = JitCodePool<KernelTuple::kernel_type>::Instance();
if (codes.Has(key)) {
return codes.AllKernels().at(key)->template getCode<Func>();
return codes.AllKernels().at(key).get();
}
// creator is not related with attr, so can use KernelKey as key
KernelKey kkey(KT, PlaceType());
KernelKey kkey(KernelTuple::kernel_type, PlaceType());
// pool: (KernelKey(type, place), vector<GenCreatorPtr>)
auto& creator_map = JitCodeCreatorPool().Instance().AllCreators();
auto& creator_map = JitCodeCreatorPool::Instance().AllCreators();
auto iter = creator_map.find(kkey);
if (iter != creator_map.end()) {
auto& creators = iter->second;
for (auto& cur : creators) {
auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
if (i && i->UseMe(attr)) {
if (i && i->CanBeUsed(attr)) {
auto p = i->CreateJitCode(attr);
if (p) {
auto f = p->template getCode<Func>();
auto res = p.get();
codes.Insert(key, std::move(p));
return f;
return res;
}
}
}
......@@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) {
return nullptr;
}
template <KernelType KT, typename KernelTuples, typename PlaceType>
template <typename KernelTuple, typename PlaceType>
inline typename std::enable_if<
!std::is_same<typename KernelTuples::data_type, float>::value ||
!std::is_same<typename KernelTuple::data_type, float>::value ||
!std::is_same<PlaceType, platform::CPUPlace>::value,
typename KernelTuples::func_type>::type
GetJitCode(const typename KernelTuples::attr_type& attr) {
const Kernel*>::type
GetJitCode(const typename KernelTuple::attr_type& attr) {
return nullptr;
}
// Refer code do not related with attr, which is just for cast
// Refer is always on CPUPlace
template <KernelType KT, typename KernelTuples>
inline typename KernelTuples::func_type GetRefer() {
auto& ref_pool = ReferKernelPool().Instance().AllKernels();
KernelKey kkey(KT, platform::CPUPlace());
template <typename KernelTuple>
inline const Kernel* GetReferKernel() {
auto& ref_pool = ReferKernelPool::Instance().AllKernels();
KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
auto ref_iter = ref_pool.find(kkey);
PADDLE_ENFORCE(ref_iter != ref_pool.end(),
"Every Kernel should have reference function.");
auto& ref_impls = ref_iter->second;
for (auto& impl : ref_impls) {
auto i = dynamic_cast<const ReferKernel<KernelTuples>*>(impl.get());
auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
if (i) {
return i->GetFunc();
return i;
}
}
return nullptr;
}
template <KernelType KT, typename KernelTuples,
typename PlaceType = platform::CPUPlace>
typename KernelTuples::func_type Get(
const typename KernelTuples::attr_type& attr) {
auto jitfunc = GetJitCode<KT, KernelTuples, PlaceType>(attr);
if (jitfunc) {
return jitfunc;
template <typename KernelTuple>
inline typename KernelTuple::func_type GetReferFunc() {
auto ker = GetReferKernel<KernelTuple>();
auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
PADDLE_ENFORCE(p, "The Refer kernel should exsit");
return p->GetFunc();
}
// Return all Kernels that can be used
template <typename KernelTuple, typename PlaceType>
std::vector<const Kernel*> GetAllCandidateKernels(
const typename KernelTuple::attr_type& attr) {
// the search order shoudl be jitcode > more > refer
std::vector<const Kernel*> res;
auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
if (jitker) {
res.emplace_back(jitker);
}
// pool: (KernelKey(type, place), vector<KernelPtr>)
KernelKey kkey(KT, PlaceType());
auto& pool = KernelPool().Instance().AllKernels();
// more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
KernelKey kkey(KernelTuple::kernel_type, PlaceType());
auto& pool = KernelPool::Instance().AllKernels();
auto iter = pool.find(kkey);
if (iter != pool.end()) {
auto& impls = iter->second;
for (auto& impl : impls) {
auto i = dynamic_cast<const KernelMore<KernelTuples>*>(impl.get());
if (i && i->UseMe(attr)) {
return i->GetFunc();
auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
if (i && i->CanBeUsed(attr)) {
res.emplace_back(i);
}
}
}
// The last implementation should be reference function on CPUPlace.
return GetRefer<KT, KernelTuples>();
auto ref = GetReferKernel<KernelTuple>();
PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
res.emplace_back(ref);
return res;
}
template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
std::vector<std::pair<std::string, typename KernelTuple::func_type>>
GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
using Func = typename KernelTuple::func_type;
auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
std::vector<std::pair<std::string, Func>> res;
for (auto k : kers) {
std::string name = k->ImplType();
if (name == "JitCode") {
auto i = dynamic_cast<const GenBase*>(k);
PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
} else {
auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
PADDLE_ENFORCE(i, "kernel cast can not fail.");
res.emplace_back(std::make_pair(name, i->GetFunc()));
}
}
return res;
}
template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
const typename KernelTuple::attr_type& attr) {
auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
std::vector<typename KernelTuple::func_type> res;
for (auto& i : funcs) {
res.emplace_back(i.second);
}
return res;
}
template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
typename KernelTuple::func_type GetDefaultBestFunc(
const typename KernelTuple::attr_type& attr) {
auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
PADDLE_ENFORCE_GE(funcs.size(), 1UL);
// Here could do some runtime benchmark of this attr and return the best one.
// But yet just get the first one as the default best one,
// which is searched in order and tuned by offline.
return funcs[0];
}
template <KernelType KT, typename KernelTuples, typename PlaceType>
template <typename KernelTuple, typename PlaceType>
class KernelFuncs {
public:
KernelFuncs() = default;
static KernelFuncs& Cache() {
static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache;
static thread_local KernelFuncs<KernelTuple, PlaceType> g_func_cache;
return g_func_cache;
}
bool Has(int key) const { return funcs_.find(key) != funcs_.end(); }
void Insert(int key, typename KernelTuples::func_type func) {
funcs_.emplace(key, func);
}
typename KernelTuples::func_type At(int key) {
// the exposed interface to use
typename KernelTuple::func_type At(
const typename KernelTuple::attr_type& attr) {
// Maybe here is not good enough, not all kernels should have jitcode
int64_t key = JitCodeKey<typename KernelTuple::attr_type>(attr);
if (Has(key)) {
return funcs_.at(key);
}
auto func = Get<KT, KernelTuples, PlaceType>(key);
// If do not have this attr in cache then get the default best
auto func = GetDefaultBestFunc<KernelTuple, PlaceType>(attr);
Insert(key, func);
return func;
}
typename KernelTuple::func_type operator[](
const typename KernelTuple::attr_type& attr) {
return At(attr);
}
protected:
bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); }
void Insert(int64_t key, typename KernelTuple::func_type func) {
funcs_.emplace(key, func);
}
private:
std::unordered_map<int, typename KernelTuples::func_type> funcs_;
std::unordered_map<int64_t, typename KernelTuple::func_type> funcs_;
DISABLE_COPY_AND_ASSIGN(KernelFuncs);
};
......
......@@ -62,26 +62,55 @@ typedef enum {
kSqrt,
} SeqPoolType;
// x, y, z, n
template <typename T>
struct XYZNTuples {
struct XYZNTuple {
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, const T*, T*, int);
};
// a, x, y, n
template <typename T>
struct AXYNTuples : public XYZNTuples<T> {};
struct AXYNTuple : public XYZNTuple<T> {};
// x, y, n
template <typename T>
struct XYNTuples {
struct XYNTuple {
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, T*, int);
};
// x, return and int
// x, returned value, n
template <typename T>
struct XRNTuples : public XYNTuples<T> {};
struct XRNTuple : public XYNTuple<T> {};
#define DECLARE_KERNELTUPLE(kernel_tuple, type) \
template <typename T> \
struct type##Tuple : public kernel_tuple<T> { \
static constexpr KernelType kernel_type = k##type; \
}
// Tuple should be corresponding to the KernelType
DECLARE_KERNELTUPLE(XYZNTuple, VMul);
DECLARE_KERNELTUPLE(XYZNTuple, VAdd);
DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu);
DECLARE_KERNELTUPLE(XYZNTuple, VSub);
DECLARE_KERNELTUPLE(AXYNTuple, VScal);
DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
DECLARE_KERNELTUPLE(XYNTuple, VRelu);
DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
DECLARE_KERNELTUPLE(XYNTuple, VSquare);
DECLARE_KERNELTUPLE(XYNTuple, VExp);
DECLARE_KERNELTUPLE(XYNTuple, VSigmoid);
DECLARE_KERNELTUPLE(XYNTuple, VTanh);
DECLARE_KERNELTUPLE(XYNTuple, VCopy);
DECLARE_KERNELTUPLE(XRNTuple, HMax);
DECLARE_KERNELTUPLE(XRNTuple, HSum);
typedef struct {
void* gates; // gates: x_ch, x_ih, x_fh, x_oh
......@@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t;
typedef struct lstm_attr_s lstm_attr_t;
template <typename T>
struct LSTMTuples {
struct LSTMTuple {
typedef T data_type;
typedef lstm_attr_t attr_type;
typedef void (*func_type)(lstm_t*, const lstm_attr_t*);
};
template <typename T>
struct GRUTuples {
struct GRUTuple {
typedef T data_type;
typedef gru_attr_t attr_type;
typedef void (*func_type)(gru_t*, const gru_attr_t*);
};
DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt);
DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1);
DECLARE_KERNELTUPLE(GRUTuple, GRUH1);
DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1);
DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2);
#undef DECLARE_KERNELTUPLE
template <typename T>
struct VBroadcastTuples {
struct VBroadcastTuple {
static constexpr KernelType kernel_type = kVBroadcast;
typedef T data_type;
typedef int64_t attr_type;
typedef void (*func_type)(const T*, T*, int64_t, int64_t);
......@@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s {
} seq_pool_attr_t;
template <typename T>
struct SeqPoolTuples {
struct SeqPoolTuple {
static constexpr KernelType kernel_type = kSeqPool;
typedef T data_type;
typedef seq_pool_attr_t attr_type;
typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
......@@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s {
} emb_seq_pool_attr_t;
template <typename T>
struct EmbSeqPoolTuples {
struct EmbSeqPoolTuple {
static constexpr KernelType kernel_type = kEmbSeqPool;
typedef T data_type;
typedef emb_seq_pool_attr_t attr_type;
typedef void (*func_type)(const T*, const int64_t*, T*,
......@@ -198,7 +239,8 @@ typedef struct sgd_attr_s {
} sgd_attr_t;
template <typename T>
struct SgdTuples {
struct SgdTuple {
static constexpr KernelType kernel_type = kSgd;
typedef T data_type;
typedef sgd_attr_t attr_type;
typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*,
......@@ -214,21 +256,24 @@ typedef struct matmul_attr_s {
} matmul_attr_t;
template <typename T>
struct MatMulTuples {
struct MatMulTuple {
static constexpr KernelType kernel_type = kMatMul;
typedef T data_type;
typedef matmul_attr_t attr_type;
typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
};
template <typename T>
struct CRFDecodingTuples {
struct CRFDecodingTuple {
static constexpr KernelType kernel_type = kCRFDecoding;
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const int, const T*, const T*, T*, int*, int);
};
template <typename T>
struct LayerNormTuples {
struct LayerNormTuple {
static constexpr KernelType kernel_type = kLayerNorm;
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int,
......@@ -236,7 +281,8 @@ struct LayerNormTuples {
};
template <typename T>
struct SoftmaxTuples {
struct SoftmaxTuple {
static constexpr KernelType kernel_type = kSoftmax;
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, T*, int, int);
......@@ -244,7 +290,8 @@ struct SoftmaxTuples {
// nChw16c = nChw16c .* NC
template <typename T>
struct NCHW16CMulNCTuples {
struct NCHW16CMulNCTuple {
static constexpr KernelType kernel_type = kNCHW16CMulNC;
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, const T*, T*, int, int);
......@@ -255,28 +302,29 @@ class Kernel {
public:
Kernel() = default;
virtual ~Kernel() = default;
virtual const char* ImplType() const = 0;
DISABLE_COPY_AND_ASSIGN(Kernel);
};
template <typename KernelTuples>
template <typename KernelTuple>
class KernelMore : public Kernel {
public:
using T = typename KernelTuples::data_type;
using Func = typename KernelTuples::func_type;
using Attr = typename KernelTuples::attr_type;
using T = typename KernelTuple::data_type;
using Func = typename KernelTuple::func_type;
using Attr = typename KernelTuple::attr_type;
virtual Func GetFunc() const { return func; }
virtual bool UseMe(const Attr& attr) const = 0;
virtual const char* ImplType() const = 0;
// specify this kernel can be used, means it should not fail if use it.
virtual bool CanBeUsed(const Attr& attr) const = 0;
protected:
Func func{nullptr};
};
template <typename KernelTuples>
class ReferKernel : public KernelMore<KernelTuples> {
template <typename KernelTuple>
class ReferKernel : public KernelMore<KernelTuple> {
public:
// Refer code can always be used
bool UseMe(const typename KernelTuples::attr_type& attr) const override {
bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override {
return true;
}
const char* ImplType() const override { return "Refer"; }
......
......@@ -13,6 +13,7 @@
* limitations under the License. */
#include "paddle/fluid/operators/jit/kernel_key.h"
#include <xxhash.h> // XXH64: 13.8 GB/s
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -20,71 +21,46 @@ namespace operators {
namespace jit {
template <>
size_t JitCodeKey<int>(const int& d) {
int64_t JitCodeKey<int>(const int& d) {
return d;
}
template <>
size_t JitCodeKey<int64_t>(const int64_t& d) {
int64_t JitCodeKey<int64_t>(const int64_t& d) {
return d;
}
// TODO(TJ): refine and benchmark JitCodeKey generatation
constexpr int act_type_shift = 3; // suppot 2^3 act types
static inline int act_type_convert(KernelType type) {
if (type == kVIdentity) {
return 0;
} else if (type == kVExp) {
return 1;
} else if (type == kVRelu) {
return 2;
} else if (type == kVSigmoid) {
return 3;
} else if (type == kVTanh) {
return 4;
}
PADDLE_THROW("Unsupported act type %d", type);
return 0;
}
template <>
size_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
size_t key = attr.d;
int gate_key = act_type_convert(attr.act_gate) << 1;
int cand_key = act_type_convert(attr.act_cand) << (1 + act_type_shift);
int cell_key = act_type_convert(attr.act_cell) << (1 + act_type_shift * 2);
return (key << (1 + act_type_shift * 3)) + gate_key + cand_key + cell_key +
attr.use_peephole;
int64_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
return XXH64(&attr, sizeof(gru_attr_t), 0);
}
template <>
size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
size_t key = attr.d;
return (key << (act_type_shift * 2)) + act_type_convert(attr.act_gate) +
(act_type_convert(attr.act_cand) << act_type_shift);
int64_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
int keys[5] = {
attr.d, static_cast<int>(attr.act_gate), static_cast<int>(attr.act_cand),
static_cast<int>(attr.act_cell), static_cast<int>(attr.use_peephole)};
return XXH64(keys, sizeof(int) * 5, 0);
}
template <>
size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
size_t key = attr.w;
constexpr int pool_type_shift = 3;
return (key << pool_type_shift) + static_cast<int>(attr.type);
int64_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
int keys[2] = {attr.w, static_cast<int>(attr.type)};
return XXH64(keys, sizeof(int) * 2, 0);
}
template <>
size_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
size_t key = attr.m;
constexpr int shift = 21;
return (key << shift * 2) + ((static_cast<size_t>(attr.n)) << shift) + attr.k;
int64_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
return XXH64(&attr, sizeof(int) * 3, 0); // m, n, k
}
template <>
size_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
int64_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
return attr.table_width;
}
template <>
size_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
int64_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
return attr.grad_width;
}
......
......@@ -46,7 +46,7 @@ struct KernelKey {
// Every JitCode should have a method to get the key from attribution
template <typename Attr>
size_t JitCodeKey(const Attr& attr);
int64_t JitCodeKey(const Attr& attr);
} // namespace jit
} // namespace operators
......
......@@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w,
}
}
bool CRFDecodingKernel::UseMe(const int& d) const {
bool CRFDecodingKernel::CanBeUsed(const int& d) const {
#ifdef __AVX512F__
constexpr int block = ZMM_FLOAT_BLOCK;
#else
......
......@@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
}
}
bool LayerNormKernel::UseMe(const int& d) const {
bool LayerNormKernel::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
}
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册