Commit ac0e0f51, authored by: S sneaxiy

merge develop

test=develop
@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     else:
         build_strategy.reduce_strategy = fluid.BuildStrategy(
         ).ReduceStrategy.AllReduce
-        build_strategy.fuse_broadcast_op = args.fuse_broadcast_op

     avg_loss = train_args[0]
...
@@ -302,6 +302,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=
 paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
 paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
 paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
+paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
 paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
 paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
 paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
@@ -309,6 +310,8 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
 paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
 paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
+paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
+paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
 paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
 paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
 paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))
...
@@ -38,10 +38,10 @@ if(WITH_GPU)
     nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
     add_dependencies(tensor tensor_util)
   else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
   endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
 endif()

 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -174,7 +174,7 @@ else()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()

-target_link_libraries(executor garbage_collector)
+target_link_libraries(executor garbage_collector while_op_helper)

 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
...
@@ -61,7 +61,8 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
-cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
+cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass)
 cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
...
@@ -14,6 +14,7 @@
 #pragma once

+#include <memory>
 #include <string>
 #include <vector>
@@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase {
   ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
                       size_t scope_idx);

+  OperatorBase *GetOp() { return op_.get(); }
+
   std::string Name() const override;

   const Scope *GetScope() const { return scope_; }
...
@@ -12,6 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <memory>
+#include <unordered_set>
+#include <utility>
+
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
@@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     }
   }
 #endif
+  PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty");
 }

 EagerDeletionOpHandle::~EagerDeletionOpHandle() {
@@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
 std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }

 void EagerDeletionOpHandle::RunImpl() {
-  auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  Scope *exec_scope = nullptr;
   std::deque<std::shared_ptr<memory::Allocation>> garbages;
   for (auto &name : var_names_) {
     auto it = ref_cnts_->find(name);
-    // Var not found, or reference count has not decreased to 0
+    // Reference count has not decreased to 0
     if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
       continue;
     }

+    if (!exec_scope) {
+      exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    }
+
+    // Var not found
     auto *var = exec_scope->FindVar(name);
     if (var == nullptr) {
       continue;
...
@@ -12,20 +12,173 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <algorithm>
+#include <functional>
 #include <queue>
 #include <string>
+#include <tuple>
 #include <vector>

 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
-#include "paddle/fluid/framework/details/eager_deletion_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"

+DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+              "Fraction of eager deletion. If less than 1.0, all variables in "
+              "the program would be sorted according to their memory sizes, "
+              "and only the largest variables that account for "
+              "FLAGS_memory_fraction_of_eager_deletion of the total memory "
+              "would be deleted.");
+
 namespace paddle {
 namespace framework {
 namespace details {
+// op -> variables which can be deleted after op runs
+using OpToVarNameSetMap =
+    std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>;
+
+// Check whether the variable is a LoDTensor based on static VarDesc info
+static bool IsLoDTensor(VarDesc *var) {
+  return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
+}
+
+// Get the memory size of a LoDTensor
+static int64_t GetMemorySize(
+    const std::unordered_map<std::string, std::vector<VarHandle *>> &vars,
+    const std::string &var_name) {
+  auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
+  PADDLE_ENFORCE_NOT_NULL(var_desc);
+  PADDLE_ENFORCE(IsLoDTensor(var_desc));
+  auto dims = var_desc->GetShape();
+  return SizeOfType(var_desc->GetDataType()) *
+         std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
+                         std::multiplies<int64_t>());
+}
+
+// Split all variables in the graph into LoDTensor and non-LoDTensor (e.g.
+// SelectedRows, LoDTensorArray) variables. Since partial GC is based on a
+// static analysis of the memory size of each variable, SelectedRows and
+// LoDTensorArray are skipped here.
+static void SplitIntoLoDTensorAndNonLoDTensorVars(
+    const OpToVarNameSetMap &m, const GraphVars &vars,
+    OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
+  lod_tensors->clear();
+  other_vars->clear();
+
+  for (auto &op_vars_pair : m) {
+    for (auto &var_name : op_vars_pair.second) {
+      auto *var_desc = TryGetLatestVarDesc(
+          vars[op_vars_pair.first->GetScopeIdx()].at(var_name));
+      if (IsLoDTensor(var_desc)) {
+        (*lod_tensors)[op_vars_pair.first].insert(var_name);
+      } else {
+        (*other_vars)[op_vars_pair.first].insert(var_name);
+      }
+    }
+  }
+}
+struct GCVarInfo {
+  GCVarInfo(const std::string &name, int64_t memory_size,
+            ComputationOpHandle *op, size_t scope_idx)
+      : name_(name),
+        memory_size_(memory_size),
+        op_(op),
+        scope_idx_(scope_idx) {}
+
+  std::string name_;         // variable name
+  int64_t memory_size_;      // memory size
+  ComputationOpHandle *op_;  // op after which the variable could be deleted
+  size_t scope_idx_;         // scope index where the variable locates
+
+  int64_t AbsMemorySize() const { return std::abs(memory_size_); }
+};
+
+// NOTE: delete_lod_tensor_only is not used currently
+static OpToVarNameSetMap ShrinkGCVars(
+    const OpToVarNameSetMap &m, const GraphVars &vars,
+    const std::vector<platform::Place> &places, double fraction_of_memory_size,
+    bool delete_lod_tensor_only = false) {
+  // Do not perform gc when fraction_of_memory_size <= 0
+  if (fraction_of_memory_size <= 0.0) return {};
+
+  /**
+   * Step 1: Split all variables into LoDTensor and non-LoDTensor.
+   * We can only calculate the memory size of LoDTensors.
+   */
+  OpToVarNameSetMap lod_tensors, other_vars;
+  SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);

+  // Perform complete gc when fraction_of_memory_size >= 1
+  if (fraction_of_memory_size >= 1.0) {
+    return delete_lod_tensor_only ? lod_tensors : m;
+  }
+
+  /**
+   * Step 2: build GCVarInfos, and calculate the total memory size of each
+   * device.
+   */
+
+  // place -> variable info (name, memory size, place, scope_idx)
+  std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
+
+  // place -> total memory size
+  std::map<platform::Place, int64_t> place_to_size;
+  for (auto &op_vars_pair : lod_tensors) {
+    auto *op = op_vars_pair.first;
+    auto &var_names = op_vars_pair.second;
+    auto scope_idx = op->GetScopeIdx();
+    auto &place = places[scope_idx];
+
+    for (auto &var_name : var_names) {
+      auto var_size = GetMemorySize(vars[scope_idx], var_name);
+      GCVarInfo var_info(var_name, var_size, op, scope_idx);
+      place_to_size[place] += var_info.AbsMemorySize();
+      place_to_vars[place].emplace_back(std::move(var_info));
+    }
+  }
+
+  /**
+   * Step 3: sort GCVarInfos, and only delete the largest variables.
+   */
+  OpToVarNameSetMap partial_vars;
+  for (auto &place_to_var_pair : place_to_vars) {
+    auto &place = place_to_var_pair.first;
+    auto &gc_vars = place_to_var_pair.second;
+    std::sort(gc_vars.begin(), gc_vars.end(),
+              [](const GCVarInfo &var1, const GCVarInfo &var2) {
+                return var1.AbsMemorySize() > var2.AbsMemorySize();
+              });
+
+    int64_t accumulated_size = 0;
+    int64_t size_threshold =
+        static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
+    for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
+         ++i) {
+      partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
+      accumulated_size += gc_vars[i].AbsMemorySize();
+    }
+  }
+
+  /**
+   * Step 4: Combine other vars (SelectedRows, LoDTensorArray)
+   */
+  if (!delete_lod_tensor_only) {
+    for (auto &op_vars_pair : other_vars) {
+      partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
+                                              op_vars_pair.second.end());
+    }
+  }
+
+  return partial_vars;
+}
+
+class EagerDeletionPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
 std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto &ref_cnts =
@@ -43,9 +196,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
   // a reverse map of last_live_ops
   // i.e., last op --> variable names which can be deleted.
-  std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
-      op_vars_map;
+  OpToVarNameSetMap op_vars_map;
   for (auto &var_ops_map : last_live_ops) {
     for (auto &var_ops_pair : var_ops_map) {
       const std::string &var_name = var_ops_pair.first;
@@ -55,6 +206,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     }
   }

+  op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
+                             FLAGS_memory_fraction_of_eager_deletion);
+
   for (auto &pair : op_vars_map) {
     auto *op = pair.first;
     auto &var_names = pair.second;
@@ -85,8 +239,13 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     eager_deletion_op->AddOutput(dummy_leaf);
   }

+  VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
+           << FLAGS_memory_fraction_of_eager_deletion;
   VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
-  return graph;
+
+  auto while_op_eager_deletion_pass =
+      ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
+  return while_op_eager_deletion_pass->Apply(std::move(graph));
 }

 }  // namespace details
@@ -99,3 +258,5 @@ REGISTER_PASS(eager_deletion_pass,
     .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
     .RequirePassAttr(paddle::framework::details::kAllPlaces)
     .RequirePassAttr(paddle::framework::details::kGarbageCollector);
+
+USE_PASS(while_op_eager_deletion_pass);
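The selection rule implemented by ShrinkGCVars above is: for each place, sort the LoDTensor candidates by statically estimated size in descending order, then keep marking variables for eager deletion until the accumulated size reaches fraction_of_memory_size times the place's total. Below is a self-contained sketch of just that selection step; the Var struct and the sample sizes are illustrative, not Paddle APIs:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Mirrors Step 3 of ShrinkGCVars: keep deleting the largest variables
    // until their accumulated size reaches fraction * total size of the place.
    int main() {
      struct Var { std::string name; int64_t size; };
      std::vector<Var> vars{{"a", 100}, {"b", 400}, {"c", 250}, {"d", 250}};
      const double fraction = 0.5;

      int64_t total = 0;
      for (const auto &v : vars) total += v.size;  // 1000

      std::sort(vars.begin(), vars.end(),
                [](const Var &x, const Var &y) { return x.size > y.size; });

      const int64_t threshold = static_cast<int64_t>(fraction * total);  // 500
      int64_t accumulated = 0;
      for (size_t i = 0; i < vars.size() && accumulated < threshold; ++i) {
        std::cout << "eagerly delete " << vars[i].name << "\n";  // b, then c
        accumulated += vars[i].size;
      }
    }

With fraction = 0.5 and sizes {100, 400, 250, 250}, the sketch deletes b (400) and then c (250): the loop stops once the accumulated 650 has reached the 500 threshold, so a and d fall back to ordinary (non-eager) deletion.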
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <deque>
 #include <iterator>
+#include <memory>
 #include <stack>
 #include <string>
 #include <unordered_map>
@@ -263,6 +264,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
 void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
                                           ir::Graph* graph) const {
   VLOG(4) << "Try to inplace op " << op->Name();
+  // FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
+  // ProgramDescs. Operations related to BlockDesc or ProgramDesc should be
+  // performed on Graph or Node directly!
   PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
                  "op_desc is nullptr");
   // Some prerequisites must be met if the op is to be inplaced.
...
@@ -24,6 +24,7 @@
 #include <sstream>
 #include <string>
 #include <type_traits>
+#include <unordered_set>
 #include <vector>
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
@@ -191,6 +192,10 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
       // immediately to make the subblock variable reuse strategy take
       // effect. Because it is a single op in graph. No need to
       // update the ir nodes.
+      // FIXME(liuwei1031): Graph is not aware of the existence of
+      // BlockDescs and ProgramDescs. Operations related to BlockDesc or
+      // ProgramDesc should be performed on Graph or Node directly!
       sub_op_desc->Rename(var->Name(), cache->Name());
       if (sub_op_desc->Block() != nullptr &&
           sub_op_desc->Block()->HasVar(var->Name())) {
...
@@ -12,9 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <memory>
 #include <queue>
 #include <string>
 #include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 #include <vector>

 #include "paddle/fluid/framework/details/computation_op_handle.h"
@@ -189,15 +193,6 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
   return shrink_func(computation_op);
 }

-static VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
-  VarDesc *var_desc = nullptr;
-  std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
-    var_desc = var_handle->Node()->Var();
-    return var_desc != nullptr;
-  });
-  return var_desc;
-}
-
 std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
...
@@ -13,9 +13,22 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/var_desc.h"

 namespace paddle {
 namespace framework {
-namespace details {}  // namespace details
+namespace details {
+
+VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
+  VarDesc *var_desc = nullptr;
+  std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
+    var_desc = var_handle->Node()->Var();
+    return var_desc != nullptr;
+  });
+  return var_desc;
+}
+
+}  // namespace details
 }  // namespace framework
 }  // namespace paddle
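Note that the moved TryGetLatestVarDesc uses std::find_if purely for its side effect on var_desc and discards the iterator it returns. An equivalent and arguably more direct formulation would be the following sketch (same contract, not the committed code):

    // Reverse scan: return the VarDesc of the newest VarHandle whose
    // ir::Node still carries one, or nullptr if none does.
    VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
      for (auto it = vars.rbegin(); it != vars.rend(); ++it) {
        if (VarDesc *var_desc = (*it)->Node()->Var()) {
          return var_desc;
        }
      }
      return nullptr;
    }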
@@ -16,6 +16,7 @@
 #include <atomic>
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -25,6 +26,10 @@
 namespace paddle {
 namespace framework {

+class VarDesc;
+class VarHandle;
+
 namespace details {

 class ComputationOpHandle;
@@ -43,9 +48,11 @@ const char kGarbageCollector[] = "garbage_collector";
 const char kAllPlaces[] = "all_places";

 using LastLiveOpsOfVars =
-    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle*>>;
+    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
 const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";

+VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars);
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
namespace paddle {
namespace framework {
namespace details {
class WhileOpEagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// Find all while_op and while_grad_op
std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>,
std::vector<OperatorBase *>>>
target_ops;
for (auto *op : all_ops) {
auto compute_op = dynamic_cast<ComputationOpHandle *>(op);
if (compute_op == nullptr) continue;
if (compute_op->Name() == "while") {
target_ops[compute_op->GetScopeIdx()].first.emplace_back(
compute_op->GetOp());
} else if (compute_op->Name() == "while_grad") {
target_ops[compute_op->GetScopeIdx()].second.emplace_back(
compute_op->GetOp());
}
}
for (auto &ops_pair : target_ops) {
auto &while_ops = ops_pair.second.first;
auto &while_grad_ops = ops_pair.second.second;
operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
while_ops, while_grad_ops);
}
return graph;
}
};
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(while_op_eager_deletion_pass,
paddle::framework::details::WhileOpEagerDeletionPass);
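REGISTER_PASS is what makes the chaining shown earlier work: once registered, the pass can be fetched by name from the registry and applied. A minimal fragment (both calls appear verbatim in eager_deletion_pass.cc above; graph is assumed to be a std::unique_ptr<ir::Graph>):

    // Retrieve the registered pass by name and run it on a graph, exactly as
    // eager_deletion_pass.cc does at the end of its ApplyImpl().
    auto while_op_eager_deletion_pass =
        ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
    graph = while_op_eager_deletion_pass->Apply(std::move(graph));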
@@ -14,6 +14,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"

 #include <deque>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>

 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
@@ -23,17 +27,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"

 #ifdef PADDLE_WITH_NGRAPH
 #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
+DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
 #endif

 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
-DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");

 namespace paddle {
 namespace framework {
@@ -75,11 +80,11 @@ static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(

 ExecutorPrepareContext::ExecutorPrepareContext(
     const framework::ProgramDesc& prog, size_t block_id,
-    const std::vector<std::string>& skip_ref_cnt_vars)
-    : prog_(prog), block_id_(block_id) {
-  if (GetEagerDeletionThreshold() >= 0) {
-    global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
-                                                        skip_ref_cnt_vars);
+    const std::vector<std::string>& keep_vars, bool force_disable_gc)
+    : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) {
+  if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) {
+    global_ref_cnts_ =
+        GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars);
   }
 }

@@ -184,13 +189,12 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
 }

 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
-                   bool create_local_scope, bool create_vars) {
+                   bool create_local_scope, bool create_vars,
+                   const std::vector<std::string>& skip_ref_cnt_vars,
+                   bool force_disable_gc) {
   platform::RecordBlock b(block_id);
   if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
-#ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
-#endif
-  auto ctx = Prepare(pdesc, block_id);
+  auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
   RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
@@ -357,20 +361,27 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,

 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
     const ProgramDesc& program, int block_id,
-    const std::vector<std::string>& skip_ref_cnt_vars) {
-  std::unique_ptr<ExecutorPrepareContext> ctx(
-      new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
+    const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
+  std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext(
+      program, block_id, skip_ref_cnt_vars, force_disable_gc));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
+#ifdef PADDLE_WITH_NGRAPH
+  if (FLAGS_use_ngraph) {
+    paddle::operators::NgraphEngine::FuseNgraphOps(
+        ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
+  }
+#endif
   return ctx;
 }

 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
     const ProgramDesc& program, const std::vector<int>& block_ids,
-    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
+    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars,
+    bool force_disable_gc) {
   PADDLE_ENFORCE(
       skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
       "skip_ref_cnt_vars should be either empty or equals to block number %d",
@@ -380,9 +391,11 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
   for (auto& bid : block_ids) {
     ExecutorPrepareContext* ctx;
     if (skip_ref_cnt_vars.empty()) {
-      ctx = new ExecutorPrepareContext(program, bid);
+      ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(),
+                                       force_disable_gc);
     } else {
-      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
+      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx],
+                                       force_disable_gc);
     }
     PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
     auto& block = program.Block(bid);
@@ -409,8 +422,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
-  // skip while_op and while_grad_op temporarily
-  if (max_memory_size >= 0 && !keep_kids) {
+  // FIXME(zjl): recurrent_op is rather complex, so we
+  // forcibly disable gc in recurrent_op
+  if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
@@ -428,6 +442,11 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
+    // If gc is enabled and the program has more than one block
+    if (gc && ctx->prog_.Size() > 1) {
+      operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_,
+                                                                 ctx->ops_);
+    }
   }

   for (auto& op : ctx->ops_) {
...
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once

 #include <map>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/op_info.h"
@@ -30,7 +32,8 @@ namespace framework {
 struct ExecutorPrepareContext {
   ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
                          const std::vector<std::string>& skip_ref_cnt_vars =
-                             std::vector<std::string>());
+                             std::vector<std::string>(),
+                         bool force_disable_gc = false);

   ~ExecutorPrepareContext();
@@ -38,6 +41,7 @@ struct ExecutorPrepareContext {
   const framework::ProgramDesc& prog_;
   size_t block_id_;
+  bool force_disable_gc_;
   std::vector<std::unique_ptr<OperatorBase>> ops_;

   std::unordered_map<std::string, size_t> global_ref_cnts_;
@@ -66,7 +70,10 @@ class Executor {
    * Scope
    */
   void Run(const ProgramDesc& prog, Scope* scope, int block_id,
-           bool create_local_scope = true, bool create_vars = true);
+           bool create_local_scope = true, bool create_vars = true,
+           const std::vector<std::string>& skip_ref_cnt_vars =
+               std::vector<std::string>(),
+           bool force_disable_gc = false);

   // This API is very slow.
   void Run(const ProgramDesc& program, Scope* scope,
@@ -79,12 +86,14 @@ class Executor {
   static std::unique_ptr<ExecutorPrepareContext> Prepare(
       const ProgramDesc& program, int block_id,
       const std::vector<std::string>& skip_ref_cnt_vars =
-          std::vector<std::string>());
+          std::vector<std::string>(),
+      bool force_disable_gc = false);

   static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
       const ProgramDesc& program, const std::vector<int>& block_ids,
       const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
-          std::vector<std::vector<std::string>>());
+          std::vector<std::vector<std::string>>(),
+      bool force_disable_gc = false);

   void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
...
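The new force_disable_gc flag threads through both Prepare overloads and the context constructor, letting a caller (per the FIXME in executor.cc, recurrent_op) opt out of eager deletion even when GetEagerDeletionThreshold() >= 0 enables it globally. A hedged usage sketch against the signatures above; program, scope, and place are assumed to exist, and the remaining RunPreparedContext parameters are left at their defaults:

    // Prepare block 0 with GC forcibly disabled, then run the prepared context.
    framework::Executor executor(place);
    auto ctx = framework::Executor::Prepare(program, /*block_id=*/0,
                                            /*skip_ref_cnt_vars=*/{},
                                            /*force_disable_gc=*/true);
    executor.RunPreparedContext(ctx.get(), &scope,
                                /*create_local_scope=*/true,
                                /*create_vars=*/true);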
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <algorithm>
-#include <unordered_set>
+#include <unordered_map>

 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
@@ -152,6 +152,39 @@ void Graph::ResolveHazard(
   }
 }

+std::shared_ptr<Graph> Graph::Clone() {
+  auto cloned_graph = std::make_shared<Graph>(this->program_);
+  cloned_graph->ReleaseNodes();
+  cloned_graph->num_node_created_ = 0;
+  std::unordered_map<ir::Node *, ir::Node *> origin_to_cloned;
+  for (auto *n : this->node_set_) {
+    ir::Node *cloned_node = nullptr;
+    if (n->IsCtrlVar()) {
+      cloned_node = cloned_graph->CreateControlDepVar();
+    } else if (!n->var_desc_ && !n->op_desc_) {  // empty node
+      cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType());
+    } else if (n->IsVar()) {
+      cloned_node = cloned_graph->CreateVarNode(n->Var());
+    } else if (n->IsOp()) {
+      cloned_node = cloned_graph->CreateOpNode(n->Op());
+    }
+    if (cloned_node) {
+      origin_to_cloned[n] = cloned_node;
+    } else {
+      PADDLE_THROW("The cloned node's type is not supported!");
+    }
+  }
+  for (auto *n : this->node_set_) {
+    for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) {
+      origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]);
+    }
+    for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) {
+      origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]);
+    }
+  }
+  return cloned_graph;
+}
+
 bool IsControlDepVar(const ir::Node &var) {
   return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
 }
...
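A short usage sketch for the new Graph::Clone(); this is hedged, with program assumed to be an existing ProgramDesc, and, per the header comment in the next hunk, only the node structure is cloned, not graph attributes:

    // Clone duplicates every node and rebuilds the input/output edges through
    // the origin_to_cloned map, so edits to the copy leave the original intact.
    ir::Graph graph(program);
    std::shared_ptr<ir::Graph> cloned = graph.Clone();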
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>

 #include "paddle/fluid/framework/ir/node.h"
@@ -199,7 +200,12 @@ class Graph {
   // WARN: After a series of passes, the current graph can be quite
   // different from OriginProgram. Caller shouldn't assume much from
   // the returned OriginProgram.
-  const ProgramDesc &OriginProgram() const { return program_; }
+  const ProgramDesc &OriginProgram() const {
+    LOG(WARNING) << "WARN: After a series of passes, the current graph can be "
+                    "quite different from OriginProgram. So, please avoid "
+                    "using the `OriginProgram()` method!";
+    return program_;
+  }

   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
@@ -212,6 +218,10 @@ class Graph {
   void ResolveHazard(
       const std::map<std::string, std::vector<ir::Node *>> &var_nodes);

+  // Create a new and duplicated graph.
+  // WARN: The method only clones the graph structure, not its attributes.
+  std::shared_ptr<Graph> Clone();
+
  private:
   std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
       const ProgramDesc &program);
...
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once

+#include <memory>
 #include <string>
 #include <typeindex>
 #include <typeinfo>
...
@@ -186,14 +186,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     VLOG(3) << place << " " << DebugStringEx(&scope);
   } catch (platform::EnforceNotMet exception) {
     if (Attrs().count("sub_block") != 0) {
-      throw;
+      throw std::move(exception);
     }

     auto& callstack = Attr<std::vector<std::string>>(
         OpProtoAndCheckerMaker::OpCreationCallstackAttrName());

     if (callstack.empty()) {
-      throw;
+      throw std::move(exception);
     }
     std::ostringstream sout;
     sout << "Invoke operator " << Type() << " error.\n";
@@ -204,7 +204,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     sout << "C++ Callstacks: \n";
     sout << exception.err_str_;
     exception.err_str_ = sout.str();
-    throw;
+    throw std::move(exception);
   } catch (...) {
     std::rethrow_exception(std::current_exception());
   }
@@ -926,8 +926,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     dev_ctx = pool.Get(expected_kernel_key.place_);
   }

+  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
     RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
     this->InferShape(&infer_shape_ctx);
+  }
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
   kernel_iter->second(
...
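The switch from a bare `throw;` to `throw std::move(exception)` matters because the handler catches platform::EnforceNotMet by value: `throw;` rethrows the original exception object, which would discard the err_str_ amendments made to the local copy. A self-contained sketch of the same pitfall using a standard exception (the names here are illustrative, not Paddle's):

    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <utility>

    int main() {
      try {
        try {
          throw std::runtime_error("original");
        } catch (std::runtime_error e) {  // caught by value: `e` is a copy
          std::runtime_error amended(std::string(e.what()) + " + callstack");
          // A bare `throw;` here would rethrow "original" and lose the edit.
          throw std::move(amended);
        }
      } catch (const std::runtime_error& e) {
        std::cout << e.what() << std::endl;  // prints "original + callstack"
      }
    }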
@@ -62,6 +62,15 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";

+/// If an Op has this attribute, all its kernels should calculate the output
+/// variable's shape in the corresponding Compute() function, and
+/// OperatorWithKernel::RunImpl() will skip calling this Op's InferShape()
+/// at runtime, for speedup.
+/// TODO(luotao): Note that this temporary attribute will be deleted once all
+/// ops contain it.
+constexpr char kAllKernelsMustComputeRuntimeShape[] =
+    "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
+
 // define some kernel priority
 /* Define multiple kernel type fallback order*/
 extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
...
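For an op to opt in, its maker has to declare the attribute so that the HasAttr() check in RunImpl() (previous hunk) finds it. A hedged sketch, with every name except the constant itself hypothetical:

    // Hypothetical op maker opting out of runtime InferShape(): its kernels
    // are then responsible for resizing the outputs inside Compute().
    class MySequenceOpMaker : public framework::OpProtoAndCheckerMaker {
     public:
      void Make() override {
        AddInput("X", "(LoDTensor) input sequence");
        AddOutput("Out", "(LoDTensor) pooled output");
        AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
                      "Kernels compute output shapes; skip runtime InferShape().")
            .SetDefault(true);
      }
    };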
@@ -181,12 +181,13 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }

-ParallelExecutor::ParallelExecutor(
-    const std::vector<platform::Place> &places,
-    const std::unordered_set<std::string> &bcast_vars,
-    const std::string &loss_var_name, Scope *scope,
-    const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
+ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
+                                   const std::vector<std::string> &bcast_vars,
+                                   const std::string &loss_var_name,
+                                   Scope *scope,
+                                   const std::vector<Scope *> &local_scopes,
+                                   const ExecutionStrategy &exec_strategy,
+                                   const BuildStrategy &build_strategy,
                                    ir::Graph *graph)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
@@ -254,9 +255,23 @@ ParallelExecutor::ParallelExecutor(
     PADDLE_THROW("Not compiled with CUDA");
 #endif
   }
-  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToDevices(bcast_vars);
+  // broadcast parameters from the 0th device to others:
+  auto need_broadcast = [&]() -> bool {
+    if (build_strategy.num_trainers_ > 1) {
+      // 1. num_trainers would be greater than 1 for nccl distributed training.
+      return true;
+    } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
+      // 2. Only one trainer process, but ParallelExecutor holds multiple
+      // devices.
+      return true;
+    }
+    return false;
+  };
+
+  if (need_broadcast()) {
+    BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
   }

   // Startup Program has been run. All local scopes have correct parameters.
   // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
@@ -338,7 +353,7 @@ ParallelExecutor::ParallelExecutor(
 }

 void ParallelExecutor::BCastParamsToDevices(
-    const std::unordered_set<std::string> &vars) const {
+    const std::vector<std::string> &vars, int trainer_id) const {
   // the initializing bcast, all vars would be bcast from device(0).
   for (auto &var : vars) {
     framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
@@ -362,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices(
       auto place = member_->places_[i];
       void *buffer;

-      if (i == 0) {
+      if (i == 0 && trainer_id == 0) {
         buffer = const_cast<void *>(main_tensor.data<void>());
       } else {
         auto local_scope = member_->local_scopes_[i];
...
@@ -14,9 +14,11 @@ limitations under the License. */
 #pragma once

+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>

 #include "paddle/fluid/framework/details/build_strategy.h"
@@ -45,7 +47,7 @@ class ParallelExecutor {
  public:
   explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::unordered_set<std::string> &bcast_vars,
+                            const std::vector<std::string> &bcast_vars,
                             const std::string &loss_var_name, Scope *scope,
                             const std::vector<Scope *> &local_scopes,
                             const ExecutionStrategy &exec_strategy,
@@ -70,7 +72,10 @@ class ParallelExecutor {
                          const std::string &fetched_var_name);

  private:
-  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
+  // broadcast the parameters from the 0th device.
+  // trainer_id is the trainer index in nccl distributed training.
+  void BCastParamsToDevices(const std::vector<std::string> &vars,
+                            int trainer_id = 0) const;

   bool EnableParallelGraphExecution(const ir::Graph &graph,
                                     const ExecutionStrategy &exec_strategy,
                                     const BuildStrategy &build_strategy) const;
...
@@ -18,6 +18,7 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
@@ -137,16 +138,19 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
 #ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:GPU->CPU");
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:CPU->GPU");
     auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:GPU->GPU");
     if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
       VLOG(3) << "Skip copy the same data from " << src_place << " to "
               << dst_place;
@@ -157,6 +161,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   } else if (platform::is_cuda_pinned_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
+    platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU");
     auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
...
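platform::RecordEvent is an RAII guard: the profiler event opens at construction and closes when the local object leaves scope, which is why each branch above declares it as its first statement. A generic sketch of the same pattern (plain C++, not the Paddle profiler):

    #include <chrono>
    #include <iostream>
    #include <string>

    // RAII scope timer mirroring how RecordEvent brackets each copy branch.
    class ScopedEvent {
     public:
      explicit ScopedEvent(std::string name)
          : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
      ~ScopedEvent() {
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                      std::chrono::steady_clock::now() - start_).count();
        std::cout << name_ << " took " << us << " us\n";
      }
     private:
      std::string name_;
      std::chrono::steady_clock::time_point start_;
    };

    int main() {
      ScopedEvent event("TensorCopy:CPU->CPU");  // event spans the copy below
      // ... perform the copy being timed ...
    }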
@@ -159,10 +159,9 @@ class Autograd {
     for (auto it : candidate->pre_ops_) {
       for (OpBase* pre_op : it.second) {
         if (!pre_op) continue;
-        VLOG(5) << "op dep " << candidate->op_desc_->Type() << " trace id "
-                << candidate->trace_id_ << " <---- " << it.first << " <---- "
-                << pre_op->op_desc_->Type() << " trace id "
-                << pre_op->trace_id_;
+        VLOG(5) << "op dep " << candidate->Type() << " trace id "
+                << candidate->trace_id_ << " <---- " << it.first << " <---- "
+                << pre_op->Type() << " trace id " << pre_op->trace_id_;
         if (visited.find(pre_op) == visited.end()) {
           visited.insert(pre_op);
           queue.push_back(pre_op);
@@ -180,10 +179,12 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
   PADDLE_ENFORCE(var_->IsInitialized(),
                  "Variable must be initialized when getting numpy tensor");

-  std::unique_ptr<VarBase> new_var(new VarBase());
+  // TODO(minqiyang): change this after move unique_name generator to CXX
+  const framework::LoDTensor& self_tensor = var_->Get<framework::LoDTensor>();
+  std::unique_ptr<VarBase> new_var(new VarBase(
+      "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false));
   framework::LoDTensor* tensor =
       new_var->var_->GetMutable<framework::LoDTensor>();
-  tensor->Resize(var_->Get<framework::LoDTensor>().dims());
   tensor->set_lod(var_->Get<framework::LoDTensor>().lod());

   if (blocking) {
...@@ -199,52 +200,62 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place, ...@@ -199,52 +200,62 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
} }
if (platform::is_gpu_place(dst_place)) { if (platform::is_gpu_place(dst_place)) {
VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; VLOG(3) << "copy tensor " << Name() << " from gpu";
} }
return new_var; return new_var;
} }
framework::LoDTensor& VarBase::GradValue() { framework::LoDTensor& VarBase::GradValue() {
VLOG(3) << "get var grad " << var_desc_->Name(); VLOG(3) << "get var grad " << Name();
PADDLE_ENFORCE_NOT_NULL(grads_,
"Could not get grad value from no grad variable");
return *(grads_->var_->GetMutable<framework::LoDTensor>()); return *(grads_->var_->GetMutable<framework::LoDTensor>());
} }
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (grad_op_descs_.empty() && backward_id_ <= 0) { if (grad_op_descs_.empty() && backward_id_ <= 0) {
VLOG(3) << "op with no grad: " << op_desc_->Type(); VLOG(3) << "op with no grad: " << Type();
return {}; return {};
} }
VLOG(3) << "apply op grad: " << op_desc_->Type(); VLOG(3) << "apply op grad: " << Type();
std::vector<framework::VariableValueMap> grad_outputs; std::vector<framework::VariableValueMap> tmp_grad_outputs;
if (backward_id_ > 0) { if (backward_id_ > 0) {
VLOG(3) << "py_layer_grad"; VLOG(3) << "py_layer_grad";
grad_outputs.resize(1); tmp_grad_outputs.resize(1);
grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = tmp_grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
PyLayer::ApplyGrad( PyLayer::ApplyGrad(
backward_id_, backward_id_,
grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
} else { } else {
grad_outputs.resize(grad_op_descs_.size()); const size_t grad_op_count = grad_op_descs_.size();
for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
tmp_grad_outputs.resize(grad_op_count);
for (size_t k = 0; k < grad_op_count; ++k) {
framework::OpDesc* grad_op_desc = grad_op_descs_[k]; framework::OpDesc* grad_op_desc = grad_op_descs_[k];
VLOG(3) << "op grad " << grad_op_desc->Type(); auto& grad_output_variable_map = grad_output_vars_[k];
for (auto it : grad_output_vars_[k]) {
auto& outputs = grad_outputs[k][it.first]; VLOG(3) << "apply grad op " << grad_op_desc->Type();
// Allocate tmp grad output variable
for (auto it : grad_output_variable_map) {
auto& outputs = tmp_grad_outputs[k][it.first];
outputs.reserve(it.second.size());
for (size_t i = 0; i < it.second.size(); ++i) { for (size_t i = 0; i < it.second.size(); ++i) {
// Allocate a new variable // Allocate a new variable
Variable* tmp_var = new framework::Variable(); Variable* tmp_var = new framework::Variable();
tmp_var->GetMutable<framework::LoDTensor>(); tmp_var->GetMutable<framework::LoDTensor>();
outputs.push_back(tmp_var); outputs.emplace_back(tmp_var);
} }
} }
framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); // Run grad op
framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]);
// No need to do compile time infer shape here. // No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_); // grad_op_desc_->InferShape(*block_);
grad_op_desc->InferVarType(block_); // grad_op_desc->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase = std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc); framework::OpRegistry::CreateOp(*grad_op_desc);
...@@ -260,9 +271,10 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -260,9 +271,10 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
} }
} }
// Add tmp grad outputs to original grad vars
for (size_t k = 0; k < grad_output_vars_.size(); ++k) { for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (auto it : grad_output_vars_[k]) { for (auto it : grad_output_vars_[k]) {
auto& outputs = grad_outputs[k][it.first]; auto& outputs = tmp_grad_outputs[k][it.first];
auto& origin_outputs = it.second; auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
...@@ -316,19 +328,14 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { ...@@ -316,19 +328,14 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
int PyLayer::NumFuncs() { return py_funcs_.size(); } int PyLayer::NumFuncs() { return py_funcs_.size(); }
std::vector<VarBase*> PyLayer::Apply(int func_id, std::vector<Variable*> PyLayer::Apply(int func_id,
const std::vector<VarBase*>& inputs) { const std::vector<VarBase*>& inputs) {
std::vector<framework::Variable*> invars; std::vector<framework::Variable*> invars;
for (const VarBase* in : inputs) { for (const VarBase* in : inputs) {
invars.push_back(in->var_); invars.push_back(in->var_);
} }
PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
std::vector<Variable*> outvars = CallPythonFunc(py_funcs_[func_id], invars); return CallPythonFunc(py_funcs_[func_id], invars);
std::vector<VarBase*> ret;
for (Variable* v : outvars) {
ret.push_back(new VarBase(v, new VarBase(true)));
}
return ret;
} }
std::vector<Variable*> PyLayer::ApplyGrad( std::vector<Variable*> PyLayer::ApplyGrad(
......
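Reviewer note: NewVarBase now allocates its destination through the new named VarBase constructor (declared in layer.h below), which materializes the tensor with the source's dtype and dims, so the old explicit Resize() disappears. A condensed sketch of the new flow (local names are illustrative):

// Sketch: build a copy target whose tensor is already allocated with the
// source's dtype/dims on dst_place; "Itmp" is the temporary name the code
// above uses until the unique-name generator moves to C++.
const framework::LoDTensor& src = var_->Get<framework::LoDTensor>();
std::unique_ptr<VarBase> dst(new VarBase(
    "Itmp", src.type(), src.dims(), dst_place,
    true /*stop_gradient*/, false /*persistable*/));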
@@ -112,31 +112,53 @@ class OpBase;
  */
 class VarBase {
  public:
-  VarBase() : VarBase(new framework::Variable(), new VarBase(true)) {}
-
-  explicit VarBase(bool stop_gradient)
-      : VarBase(new framework::Variable(),
-                stop_gradient ? nullptr : new VarBase(true), stop_gradient) {}
-
-  VarBase(framework::Variable* var, VarBase* grad)
-      : VarBase(var, grad, false) {}
+  // Internal interface, create VarBase from exist variable
+  VarBase(const std::string& name, framework::Variable* var, VarBase* grad,
+          bool stop_gradient)
+      : VarBase(name, var->Get<framework::LoDTensor>().type(),
+                var->Get<framework::LoDTensor>().dims(),
+                var->Get<framework::LoDTensor>().place(), var, grad,
+                stop_gradient, false) {}
+
+  // Python interface
+  VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
+          const std::vector<int64_t>& shape, const platform::Place& place,
+          bool stop_gradient, bool persistable)
+      : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient,
+                persistable) {}
+
+  // Internal interface, create VarBase from with ddim
+  VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
+          const framework::DDim& shape, const platform::Place& place,
+          bool stop_gradient, bool persistable)
+      : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient,
+                persistable) {}

  private:
-  VarBase(framework::Variable* var, VarBase* grad, bool stop_gradient)
-      : name_(),
-        var_desc_(nullptr),
+  VarBase(const std::string& name, framework::proto::VarType::Type dtype,
+          const framework::DDim& shape, const platform::Place& place,
+          framework::Variable* var, VarBase* grad, bool stop_gradient,
+          bool persistable)
+      : name_(name),
+        dtype_(dtype),
+        place_(place),
         var_(var),
         grads_(grad),
-        block_(nullptr),
-        persistable_(false),
         stop_gradient_(stop_gradient),
+        persistable_(persistable),
         pre_op_(nullptr),
         pre_op_out_name_(),
-        pre_op_out_idx_(-1) {}
+        pre_op_out_idx_(-1) {
+    if (!var_) {
+      var_ = new framework::Variable();
+      auto tensor = var_->GetMutable<framework::LoDTensor>();
+      tensor->Resize(shape);
+      tensor->mutable_data(place_, dtype_);
+    }
+  }

  public:
   virtual ~VarBase() {
-    // TODO(minqiyang): remove var desc from block desc
     if (var_) {
       delete var_;
       var_ = nullptr;
@@ -151,14 +173,30 @@ class VarBase {
     pre_op_out_idx_ = -1;
   }

-  inline OpBase* PreOp() const { return pre_op_; }
-  inline int PreOpOutIdx() const { return pre_op_out_idx_; }
+  inline void SetName(const std::string& name) { name_ = name; }
+  inline std::string Name() const { return name_; }
+
+  inline std::vector<int64_t> Shape() const {
+    if (var_->IsInitialized()) {
+      return framework::vectorize(var_->Get<framework::LoDTensor>().dims());
+    } else {
+      return {};
+    }
+  }
+
+  inline framework::proto::VarType::Type DType() const { return dtype_; }

   inline void SetStopGradient(bool stop_gradient) {
     stop_gradient_ = stop_gradient;
   }
   inline bool IsStopGradient() const { return stop_gradient_; }

+  inline void SetPersistable(bool persistable) { persistable_ = persistable; }
+  inline bool IsPersistable() const { return persistable_; }
+
+  inline OpBase* PreOp() const { return pre_op_; }
+  inline int PreOpOutIdx() const { return pre_op_out_idx_; }
+
   void RunBackward();

   inline void ResetPreOp(OpBase* op) {
@@ -180,7 +218,7 @@ class VarBase {
   }

   void ClearGradient() {
-    VLOG(1) << "clear gradient of " << var_desc_->Name();
+    VLOG(1) << "clear gradient of " << Name();
     if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) {
       auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
       operators::math::set_constant(
@@ -196,23 +234,20 @@ class VarBase {
                                        const bool blocking) const;

   inline std::string GradName() const {
-    PADDLE_ENFORCE(
-        var_desc_,
-        "Couldn't get gradient variable's name, please call backward() first");
-    return string::Sprintf("%s@IGrad", var_desc_->Name());
+    return string::Sprintf("%s@IGrad", Name());
   }

   std::string name_;
-  framework::VarDesc* var_desc_;
+  framework::proto::VarType::Type dtype_;
+  platform::Place place_;

   framework::Variable* var_;
   VarBase* grads_;
-  framework::BlockDesc* block_;
-  bool persistable_;

  private:
   bool stop_gradient_;
+  bool persistable_;
+
   OpBase* pre_op_;
   std::string pre_op_out_name_;
   int pre_op_out_idx_;
@@ -223,11 +258,11 @@ class VarBase {
  */
 class PYBIND11_HIDDEN OpBase {
  public:
-  OpBase()
-      : op_desc_(nullptr),
+  OpBase(const std::string& type)
+      : type_(type),
+        trace_id_(-1),
         forward_id_(-1),
         backward_id_(-1),
-        trace_id_(-1),
         place_(platform::CPUPlace()),
         backward_hooks_() {}

@@ -249,13 +284,34 @@ class PYBIND11_HIDDEN OpBase {
   std::map<std::string, std::vector<VarBase*>> ApplyGrad();

+  inline std::string Type() const { return type_; }
+  inline std::string GradOpType(size_t index) const {
+    PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]);
+    return grad_op_descs_[index]->Type();
+  }
+
   void RegisterBackwardHooks(const py::object& callable);

   void InvokeBackwardHooks();

-  // One of `op_desc_` or `forward_id_` is set, not both.
-  // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
-  framework::OpDesc* op_desc_;
+  void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) {
+    if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
+      VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
+              << inp_name;
+      pre_ops_[inp_name].push_back(inp_var->PreOp());
+      pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx());
+    } else {
+      VLOG(3) << "no pre op in slot " << inp_name
+              << " input var stop_gradient: " << inp_var->IsStopGradient();
+      pre_ops_[inp_name].push_back(nullptr);
+      // pre_ops_out_idx_[inp_name].push_back(-1);
+    }
+  }
+
+  std::string type_;
+  // One of `trace_id_` or `forward_id_` is set, not both.
+  // For pure python PyLayer, use `forward_id_`, otherwise, use trace_id_.
+  int trace_id_;
   int forward_id_;

   // When has backward, one of `grad_op_descs_` or `backward_id_` is set,
@@ -263,7 +319,6 @@ class PYBIND11_HIDDEN OpBase {
   // Note: each fwd op corresponds to a vector of bwd ops.
   std::vector<framework::OpDesc*> grad_op_descs_;
   int backward_id_;
-  int trace_id_;

   platform::Place place_;

@@ -277,8 +332,6 @@ class PYBIND11_HIDDEN OpBase {
   // Outputs to a vector of bwd ops.
   std::vector<framework::VariableValueMap> grad_output_vars_;

-  framework::BlockDesc* block_;
-
   std::vector<py::object> backward_hooks_;
 };

@@ -303,8 +356,8 @@ class PyLayer {

   static int NumFuncs();

-  static std::vector<VarBase*> Apply(int func_id,
-                                     const std::vector<VarBase*>& inputs);
+  static std::vector<framework::Variable*> Apply(
+      int func_id, const std::vector<VarBase*>& inputs);

   static std::vector<framework::Variable*> ApplyGrad(
       int func_id, const std::vector<framework::Variable*>& inputs);
...
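Reviewer note: the new OpBase::TrackPreOp centralizes the producer bookkeeping that Trace and PyTrace previously inlined; Autograd later walks pre_ops_ backwards from the loss. A minimal usage sketch (the loop shape mirrors tracer.cc below):

// Sketch: while tracing an op, register each input's producing op
// (or nullptr for leaf / stop-gradient inputs) under the input slot name.
for (auto it : op->input_vars_) {   // slot name -> list of VarBase*
  for (VarBase* inp : it.second) {
    op->TrackPreOp(inp, it.first);
  }
}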
@@ -56,15 +56,19 @@ void CreateGradOp(const framework::OpDesc& op_desc,
   }
 }

-void InitVar(framework::Variable* var, framework::Variable* grad_var,
-             platform::DeviceContext* dev_ctx) {
+void InitGrad(VarBase* var, platform::DeviceContext* dev_ctx) {
+  PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base");
   PADDLE_ENFORCE_NOT_NULL(dev_ctx,
                           "Could not get valid device from forward op");
-  auto& var_t = var->Get<framework::LoDTensor>();
-  grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
-      var_t.dims(), dev_ctx->GetPlace());
-  operators::math::set_constant(
-      *dev_ctx, grad_var->GetMutable<framework::LoDTensor>(), 0.0);
+
+  if (var->grads_ == nullptr) {
+    auto& var_t = var->var_->Get<framework::LoDTensor>();
+    var->grads_ = new VarBase(var->GradName(), framework::proto::VarType::FP32,
+                              framework::vectorize(var_t.dims()),
+                              dev_ctx->GetPlace(), true, false);
+    auto grad_t = var->grads_->var_->GetMutable<framework::LoDTensor>();
+    operators::math::set_constant(*dev_ctx, grad_t, 0.0);
+  }
 }
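Reviewer note: InitVar filled a caller-supplied grad Variable; InitGrad instead creates the grad VarBase lazily and only once, guarded by grads_ == nullptr, and hardcodes FP32 as the gradient dtype. Call-site sketch (assuming a traced VarBase* var and the prepared op's device context):

// Sketch: ensure `var` has a zero-filled gradient before a grad op reads
// it; repeated calls are no-ops thanks to the nullptr guard above.
// Note the FP32 assumption inherited from the code above.
InitGrad(var, prepared_op.GetDeviceContext());
auto* grad_t = var->grads_->var_->GetMutable<framework::LoDTensor>();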
 platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
@@ -85,6 +89,62 @@ platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
   return result;
 }

+framework::VariableNameMap CreateInputVarNameMap(
+    const OpBase* op, const VarBasePtrMap& varbase_map) {
+  framework::VariableNameMap result;
+
+  auto& info_map = framework::OpInfoMap::Instance();
+  auto* op_info = info_map.GetNullable(op->Type());
+  if (op_info == nullptr || op_info->proto_ == nullptr) {
+    return result;
+  }
+
+  for (auto& in : op_info->Proto().inputs()) {
+    auto it = varbase_map.find(in.name());
+    if (it == varbase_map.end()) {
+      PADDLE_ENFORCE(in.dispensable());
+      result[in.name()] = {};
+    } else {
+      auto var_vector = it->second;
+      std::vector<std::string> args;
+      args.reserve(var_vector.size());
+      for (VarBase* var_base : var_vector) {
+        args.emplace_back(var_base->Name());
+      }
+      result[in.name()] = args;
+    }
+  }
+  return result;
+}
+
+framework::VariableNameMap CreateOutputVarNameMap(
+    const OpBase* op, const VarBasePtrMap& varbase_map) {
+  framework::VariableNameMap result;
+
+  auto& info_map = framework::OpInfoMap::Instance();
+  auto* op_info = info_map.GetNullable(op->Type());
+  if (op_info == nullptr || op_info->proto_ == nullptr) {
+    return result;
+  }
+
+  for (auto& out : op_info->Proto().outputs()) {
+    auto it = varbase_map.find(out.name());
+    if (it == varbase_map.end()) {
+      PADDLE_ENFORCE(out.dispensable());
+      result[out.name()] = {};
+    } else {
+      auto var_vector = it->second;
+      std::vector<std::string> args;
+      args.reserve(var_vector.size());
+      for (VarBase* var_base : var_vector) {
+        args.emplace_back(var_base->Name());
+      }
+      result[out.name()] = args;
+    }
+  }
+  return result;
+}
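Reviewer note: these two helpers rebuild the VariableNameMaps that previously came from an OpDesc registered in a block; a slot missing from the traced map must be declared dispensable in the op proto and becomes an empty list. An illustrative sketch (slot and variable names hypothetical):

// Sketch: for an op whose proto declares input slots "X" and "Y",
// tracing {"X": {x}, "Y": {y}} yields
//   in_names["X"] == {x->Name()}, in_names["Y"] == {y->Name()};
// an omitted dispensable slot would map to an empty vector.
framework::VariableNameMap in_names = CreateInputVarNameMap(op, inputs);
framework::VariableNameMap out_names = CreateOutputVarNameMap(op, outputs);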
 Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
   if (!FLAGS_tracer_profile_fname.empty()) {
     std::call_once(gTracerProfileOnce, [] {
@@ -101,7 +161,7 @@ Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {

 std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                                     const VarBasePtrMap& outputs,
-                                    framework::BlockDesc* block,
+                                    framework::AttributeMap attrs_map,
                                     const platform::Place expected_place,
                                     const bool stop_gradient) {
 #ifdef WITH_GPERFTOOLS
@@ -110,40 +170,27 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   }
 #endif

-  std::map<std::string, VarBase*> vars;
-
-  framework::OpDesc* op_desc = op->op_desc_;
-  VLOG(3) << "tracer tracing " << op_desc->Type() << " trace id "
-          << op->trace_id_;
-  op_desc->InferShape(*block);
-  op_desc->InferVarType(block);
-
-  std::unique_ptr<framework::OperatorBase> op_base =
-      framework::OpRegistry::CreateOp(*op_desc);
-
   framework::VariableValueMap invars_map;
   framework::VariableValueMap outvars_map;

+  // Construct input_vars_map and output_vars_map
+  std::map<std::string, VarBase*> current_vars_map;
   op->input_vars_ = inputs;
   for (auto it : op->input_vars_) {
     auto& invars = invars_map[it.first];
     invars.reserve(it.second.size());
     for (VarBase* inp : it.second) {
-      PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr",
-                              op->op_desc_->Type(), inp->var_desc_->Name());
+      PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(),
+                              inp->Name());

       invars.emplace_back(inp->var_);
-      vars[inp->var_desc_->Name()] = inp;
-      if (inp->PreOp() && !inp->IsStopGradient()) {
-        op->pre_ops_[it.first].push_back(inp->PreOp());
-        op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx());
-        VLOG(3) << "add pre op " << inp->PreOp()->op_desc_->Type();
-      } else {
-        op->pre_ops_[it.first].push_back(nullptr);
+      op->TrackPreOp(inp, it.first);
+      if (!stop_gradient) {
+        current_vars_map[inp->Name()] = inp;
       }
-      VLOG(3) << "input vname " << inp->var_desc_->Name() << " "
-              << inp->var_->IsInitialized() << " stop_gradient "
-              << inp->IsStopGradient();
+      VLOG(3) << "input var name: " << inp->Name()
+              << " inited: " << inp->var_->IsInitialized()
+              << " stop_grad: " << inp->IsStopGradient();
     }
   }

@@ -152,25 +199,38 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
     auto& outvars = outvars_map[it.first];
     const std::vector<VarBase*>& outputs = it.second;
     outvars.reserve(outputs.size());
-    for (size_t i = 0; i < outputs.size(); ++i) {
+    for (size_t i = 0U; i < outputs.size(); ++i) {
       VarBase* out = outputs[i];
       outvars.emplace_back(out->var_);
-      vars[out->var_desc_->Name()] = out;
-
-      framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name());
-      if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
-        out->var_->GetMutable<framework::LoDTensor>();
-      } else {
-        LOG(ERROR) << "tracer doesn't support yet";
-      }
       out->TrackPreOp(op, it.first, i, stop_gradient);
+      if (!stop_gradient) {
+        current_vars_map[out->Name()] = out;
+      }

-      VLOG(3) << "output vname " << out->var_desc_->Name() << " "
-              << out->var_->IsInitialized();
+      VLOG(3) << "input var name: " << out->Name()
+              << " inited: " << out->var_->IsInitialized()
+              << " stop_grad: " << out->IsStopGradient();
     }
   }

+  // Check attrs and create op
+  framework::VariableNameMap invars_name_map =
+      CreateInputVarNameMap(op, inputs);
+  framework::VariableNameMap outvars_name_map =
+      CreateOutputVarNameMap(op, outputs);
+
+  auto& info = framework::OpInfoMap::Instance().Get(op->Type());
+  if (info.Checker() != nullptr) {
+    info.Checker()->Check(&attrs_map);
+  }
+
+  std::unique_ptr<framework::OperatorBase> op_base =
+      framework::OpRegistry::CreateOp(op->Type(), invars_name_map,
+                                      outvars_name_map, attrs_map);
+
+  // TODO(minqiyang): Support infer var type in imperative mode
-  VLOG(3) << "tracer running " << op_desc->Type();
+  // Run forward op
+  VLOG(3) << "tracer running " << op->Type();
   framework::RuntimeContext ctx(invars_map, outvars_map);

   // TODO(panyx0718): Cache p.
@@ -186,36 +246,44 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx,
                                   prepared_op.ctx, prepared_op.kernel_configs));

+  // construct backward op
   std::set<std::string> vars_saved_for_backward;
   if (!stop_gradient) {
+    VLOG(5) << "start construct backward op";
+
+    // construct grad op descs
+    std::unique_ptr<framework::OpDesc> fwd_op_desc(new framework::OpDesc(
+        op->Type(), invars_name_map, outvars_name_map, attrs_map));
     std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
         new std::unordered_map<std::string, std::string>());
-    CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());
+    // NOTE(minqiyang): We don't support control flow op in imperative now
+    // Add grad_block_ when we want to support it
+    CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get());

-    op->grad_input_vars_.resize(op->grad_op_descs_.size());
-    op->grad_output_vars_.resize(op->grad_op_descs_.size());
+    VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type();

-    for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
+    const size_t grad_op_count = op->grad_op_descs_.size();
+
+    op->grad_input_vars_.resize(grad_op_count);
+    op->grad_output_vars_.resize(grad_op_count);
+
+    for (size_t i = 0; i < grad_op_count; ++i) {
       framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
       for (auto it : grad_op_desc->Inputs()) {
         auto& grad_in_vars = op->grad_input_vars_[i][it.first];
+        grad_in_vars.reserve(it.second.size());
         for (const std::string& grad_invar : it.second) {
-          block->FindRecursiveOrCreateVar(grad_invar);
           auto var_it = grad_to_var->find(grad_invar);
           if (var_it == grad_to_var->end()) {
-            auto fwd_var_it = vars.find(grad_invar);
-            PADDLE_ENFORCE(fwd_var_it != vars.end());
+            auto fwd_var_it = current_vars_map.find(grad_invar);
+            PADDLE_ENFORCE(fwd_var_it != current_vars_map.end());
             // Forward inputs or outputs.
-            grad_in_vars.push_back(fwd_var_it->second->var_);
+            grad_in_vars.emplace_back(fwd_var_it->second->var_);
           } else {
-            VarBase* var = vars[var_it->second];
-            if (!var->grads_->var_->IsInitialized()) {
-              InitVar(var->var_, var->grads_->var_,
-                      prepared_op.GetDeviceContext());
-            }
+            VarBase* var = current_vars_map[var_it->second];
+            InitGrad(var, prepared_op.GetDeviceContext());
             // Douts.
-            grad_in_vars.push_back(var->grads_->var_);
+            grad_in_vars.emplace_back(var->grads_->var_);
           }

           vars_saved_for_backward.insert(it.first);
@@ -225,48 +293,48 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       for (auto it : grad_op_desc->Outputs()) {
         auto& grad_out_vars = op->grad_output_vars_[i][it.first];
         for (const std::string& grad_outvar : it.second) {
-          block->FindRecursiveOrCreateVar(grad_outvar);
           auto var_it = grad_to_var->find(grad_outvar);
           PADDLE_ENFORCE(var_it != grad_to_var->end(),
                          "Could not found the grad op output var, should this "
                          "operator %s's stop gradient be True",
-                         op_desc->Type());
-          VarBase* var = vars[var_it->second];
-          if (!var->grads_->var_->IsInitialized()) {
-            InitVar(var->var_, var->grads_->var_,
-                    prepared_op.GetDeviceContext());
-          }
+                         op->Type());
+          VarBase* var = current_vars_map[var_it->second];
+          InitGrad(var, prepared_op.GetDeviceContext());
           grad_out_vars.push_back(var->grads_->var_);
         }
       }
     }
   }

-  op->block_ = block;
   return vars_saved_for_backward;
 }
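Reviewer note: Trace no longer consults a BlockDesc at all; attributes arrive as a plain AttributeMap, defaults are filled by the op's attribute checker, and both the forward operator and the temporary OpDesc used for grad-op creation are built from type + name maps. Sketch of the new caller shape (the attribute name is hypothetical):

// Sketch: the pybind caller now passes attributes directly instead of
// registering the op in a block first.
framework::AttributeMap attrs;
attrs["use_mkldnn"] = false;  // hypothetical attribute
tracer.Trace(op, inputs, outputs, attrs, expected_place, stop_gradient);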
 std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
                                       const std::vector<VarBase*>& inputs,
                                       bool stop_gradient) {
-  VLOG(3) << "py_trace";
+  VLOG(3) << "py_trace " << op->Type();
+
   op->input_vars_[PyLayer::kFwdInp] = inputs;
-  op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs);
+
+  std::vector<framework::Variable*> ret_vars =
+      PyLayer::Apply(op->forward_id_, inputs);
+
   for (VarBase* inp : inputs) {
-    if (inp->PreOp() && !inp->IsStopGradient()) {
-      op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp());
-      op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx());
-    } else {
-      op->pre_ops_[PyLayer::kFwdInp].push_back(nullptr);
-    }
+    op->TrackPreOp(inp, PyLayer::kFwdInp);
   }

-  auto& outputs = op->output_vars_[PyLayer::kFwdOut];
-  for (size_t i = 0; i < outputs.size(); ++i) {
-    VarBase* out = outputs[i];
+  std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
+  outputs.reserve(ret_vars.size());
+  for (size_t i = 0U; i != ret_vars.size(); ++i) {
+    framework::Variable* v = ret_vars[i];
+    VarBase* out = new VarBase(string::Sprintf("%s_out_%d", op->Type(), i), v,
+                               nullptr, stop_gradient);
+    outputs.emplace_back(out);
     out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
   }

   if (!stop_gradient) {
+    VLOG(5) << "start construct backward op";
+
     op->grad_input_vars_.resize(1);
     op->grad_output_vars_.resize(1);
     auto& grad_input_vars =
@@ -281,23 +349,16 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
       grad_input_vars.push_back(out->var_);
     }

+    // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
     platform::CPUPlace place;
     for (VarBase* out : outputs) {
+      InitGrad(out, platform::DeviceContextPool::Instance().Get(place));
       grad_input_vars.push_back(out->grads_->var_);
-      if (!grad_input_vars.back()->IsInitialized()) {
-        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
-        InitVar(out->var_, grad_input_vars.back(),
-                platform::DeviceContextPool::Instance().Get(place));
-      }
     }

-    for (const VarBase* inp : inputs) {
+    for (VarBase* inp : inputs) {
+      InitGrad(inp, platform::DeviceContextPool::Instance().Get(place));
       grad_output_vars.push_back(inp->grads_->var_);
-      if (!grad_output_vars.back()->IsInitialized()) {
-        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
-        InitVar(inp->var_, grad_output_vars.back(),
-                platform::DeviceContextPool::Instance().Get(place));
-      }
     }
   }
   return outputs;
...
@@ -17,6 +17,8 @@
 #include <map>
 #include <set>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>

 #include "paddle/fluid/framework/op_desc.h"
@@ -34,7 +36,8 @@ void CreateGradOp(const framework::OpDesc& op_desc,
                   framework::OpDesc** grad_op_desc,
                   std::unordered_map<std::string, std::string>* grad_to_var);

-void InitVar(framework::Variable* var, framework::Variable* grad_var);
+void InitVar(const VarBase* var, framework::Variable* grad_var,
+             platform::DeviceContext* dev_ctx);

 platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);

@@ -46,7 +49,7 @@ class Tracer {
   std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
                               const VarBasePtrMap& outputs,
-                              framework::BlockDesc* block,
+                              framework::AttributeMap attrs_map,
                               const platform::Place expected_place,
                               const bool stop_gradient = false);
...
@@ -126,15 +126,20 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
 }

 template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
 template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
+template void ZeroCopyTensor::copy_from_cpu<int32_t>(const int32_t *data);
 template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
 template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
+template void ZeroCopyTensor::copy_to_cpu<int32_t>(int32_t *data);

 template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
                                             int *size) const;
 template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
                                                 int *size) const;
+template int32_t *ZeroCopyTensor::data<int32_t>(PaddlePlace *place,
+                                                int *size) const;
 template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
 template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
+template int32_t *ZeroCopyTensor::mutable_data<int32_t>(PaddlePlace place);

 void *ZeroCopyTensor::FindTensor() const {
   PADDLE_ENFORCE(!name_.empty(),
...
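Reviewer note: the added instantiations extend ZeroCopyTensor feeds and fetches to int32, matching the INT32 branches added to the test helpers below. A hedged usage sketch (tensor name and shape are illustrative):

// Sketch: feed int32 ids through a zero-copy input tensor.
// `predictor` is assumed to come from a config with SwitchUseFeedFetchOps(false).
auto input = predictor->GetInputTensor("input_ids");  // hypothetical name
std::vector<int32_t> ids = {1, 2, 3, 4};
input->Reshape({1, 4});
input->copy_from_cpu(ids.data());  // now instantiated for int32_t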
@@ -139,9 +139,8 @@ static void TensorAssignData(PaddleTensor *tensor,
 }

 template <typename T>
-static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
-                                    const std::vector<std::vector<T>> &data) {
-  int size{0};
+static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                     const std::vector<std::vector<T>> &data) {
   auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
   int c = 0;
   for (const auto &f : data) {
@@ -149,7 +148,15 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
       ptr[c++] = v;
     }
   }
-  return size;
+}
+
+template <typename T>
+static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                     const PaddleBuf &data) {
+  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
+  for (size_t i = 0; i < data.length() / sizeof(T); i++) {
+    ptr[i] = *(reinterpret_cast<T *>(data.data()) + i);
+  }
 }

 static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
...
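Reviewer note: the new PaddleBuf overload copies data.length() / sizeof(T) elements straight from a PaddleTensor's flat buffer, so the zero-copy path can reuse inputs already prepared for the regular Run() path; the caller is expected to Reshape the tensor first so element counts line up. Usage sketch (this is exactly what ConvertPaddleTensorToZeroCopyTensor, added later in this commit, does per input):

// Sketch: rebind one prepared PaddleTensor as a zero-copy input.
auto tensor = predictor->GetInputTensor(input.name);
tensor->Reshape(input.shape);
tensor->SetLoD({input.lod});
ZeroCopyTensorAssignData<float>(tensor.get(), input.data);  // PaddleBuf overload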
@@ -107,6 +107,9 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrOptim();
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -131,7 +134,7 @@ TEST(Analyzer_Pyramid_DNN, profile) {
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                  input_slots_all, &outputs, FLAGS_num_threads);

-  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
     PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
     size_t size = GetSize(outputs[0]);
     PADDLE_ENFORCE_GT(size, 0);
@@ -166,6 +169,19 @@ TEST(Analyzer_Pyramid_DNN, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }

+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_Pyramid_DNN, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back("cos_sim_2.tmp_0");
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
+}
+
 // Compare Deterministic result
 TEST(Analyzer_Pyramid_DNN, compare_determine) {
   AnalysisConfig cfg;
...
@@ -207,6 +207,9 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->DisableGpu();
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrOptim();
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
 }

 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
@@ -285,133 +288,17 @@ TEST(Analyzer_rnn1, multi_thread) {
                  input_slots_all, &outputs, 2 /* multi_thread */);
 }

-// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
-// on the complex RNN1 model.
-TEST(Analyzer_rnn1, ZeroCopy) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-  PaddlePlace place;
-
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-
-  config.SwitchUseFeedFetchOps(true);
-  auto native_predictor =
-      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-
-  config.SwitchUseFeedFetchOps(
-      true);  // the analysis predictor needs feed/fetch.
-  auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-
-#define NEW_TENSOR(name__) \
-  auto name__##_tensor = predictor->GetInputTensor(#name__);
-  NEW_TENSOR(data_lod_attention);
-  NEW_TENSOR(cell_init);
-  NEW_TENSOR(data);
-  NEW_TENSOR(week);
-  NEW_TENSOR(minute);
-  NEW_TENSOR(hidden_init);
-
-  // Prepare data for AnalysisPredictor
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(),
-                        data_tensor.get(), hidden_init_tensor.get(),
-                        week_tensor.get(), minute_tensor.get(), &data,
-                        FLAGS_batch_size);
-
-  // Prepare data for NativePredictor
-  std::vector<std::vector<PaddleTensor>> native_inputs;
-  SetInput(&native_inputs);
-  std::vector<PaddleTensor> native_outputs;
-  std::vector<PaddleTensor> analysis_outputs;
-
-  auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1");
-  // Run analysis predictor
-
-  int num_ops;
-  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_EQ(fuse_statis.at("fc_fuse"), 1);
-  ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-  ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-  ASSERT_EQ(num_ops,
-            13);  // After graph optimization, only 13 operators exists.
-
-  Timer timer;
-  double total_time{0};
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    timer.tic();
-    predictor->ZeroCopyRun();
-    total_time += timer.toc();
-  }
-  LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
-
-  ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
-  LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());
-
-  int output_size{0};  // this is the number of elements not memory size
-  auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
-  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
-  for (int i = 0; i < output_size; i++) {
-    EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
-  }
-}
-
-TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-#define NEW_TENSOR(name__) \
-  auto name__##_tensor = predictor->GetInputTensor(#name__);
-
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
-  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
-  double total_time_of_threads{0};
-  std::vector<std::thread> threads;
-
-  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
-    threads.emplace_back([&, tid] {
-      auto &predictor = predictors[tid];
-      NEW_TENSOR(data_lod_attention);
-      NEW_TENSOR(cell_init);
-      NEW_TENSOR(data);
-      NEW_TENSOR(week);
-      NEW_TENSOR(minute);
-      NEW_TENSOR(hidden_init);
-
-      // Prepare data for AnalysisPredictor
-      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-      Timer timer;
-      double total_time{0};
-
-      for (int i = 0; i < FLAGS_repeat; i++) {
-        PrepareZeroCopyInputs(data_lod_attention_tensor.get(),
-                              cell_init_tensor.get(), data_tensor.get(),
-                              hidden_init_tensor.get(), week_tensor.get(),
-                              minute_tensor.get(), &data, FLAGS_batch_size);
-        timer.tic();
-        predictor->ZeroCopyRun();
-        total_time += timer.toc();
-      }
-
-      total_time_of_threads += total_time;
-
-      LOG(INFO) << "thread time: " << total_time / FLAGS_repeat;
-    });
-  }
-
-  for (auto &t : threads) {
-    t.join();
-  }
-
-  LOG(INFO) << "average time: "
-            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_rnn1, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back("final_output.tmp_1");
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
 }

 }  // namespace inference
...
@@ -144,6 +144,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
   cfg->SwitchSpecifyInputNames();
   cfg->SwitchIrDebug();
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
+  if (FLAGS_zero_copy) {
+    cfg->SwitchUseFeedFetchOps(false);
+  }
   if (use_mkldnn) {
     cfg->EnableMKLDNN();
   }
@@ -184,10 +187,10 @@ TEST(Analyzer_seq_pool1, compare_determine) {
                        input_slots_all);
 }

-void analysis_fuse_statis(bool use_zerocopy) {
+// Check the fuse status
+TEST(Analyzer_seq_pool1, fuse_statis) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-  cfg.SwitchUseFeedFetchOps(!use_zerocopy);
   int num_ops;
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
   auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
@@ -203,137 +206,17 @@ void analysis_fuse_statis(bool use_zerocopy) {
   EXPECT_EQ(num_ops, 171);
 }

-// Check the fuse status
-TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); }
-
-void PrepareZeroCopyInputs(
-    const std::unique_ptr<PaddlePredictor> &predictor,
-    std::vector<std::unique_ptr<ZeroCopyTensor>> *inputs) {
-  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  // only feed one batch
-  const auto &one_batch = data.NextBatch();
-  inputs->clear();
-  for (size_t i = 0; i < one_batch.size(); ++i) {
-    auto &slot = one_batch[i];
-    auto tensor = predictor->GetInputTensor(slot.name + "_embed");
-    tensor->Reshape(slot.shape);
-    tensor->SetLoD({slot.lod});
-    ZeroCopyTensorAssignData<float>(tensor.get(), slot.data);
-    inputs->emplace_back(std::move(tensor));
-  }
-}
-
-// return the output values
-std::vector<float> zerocopy_profile(int repeat_times) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
-  std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
-  PrepareZeroCopyInputs(predictor, &inputs);
-  auto output_tensor = predictor->GetOutputTensor(out_var_name);
-  Timer timer;
-  LOG(INFO) << "Warm up run...";
-  timer.tic();
-  predictor->ZeroCopyRun();
-  PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1);
-  if (FLAGS_profile) {
-    paddle::platform::ResetProfiler();
-  }
-  LOG(INFO) << "Run " << repeat_times << " times...";
-  timer.tic();
-  for (int i = 0; i < repeat_times; i++) {
-    predictor->ZeroCopyRun();
-  }
-  PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times,
-            1);
-
-  LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
-  PaddlePlace place;
-  int output_size{0};
-  auto *pdata = output_tensor->data<float>(&place, &output_size);
-  std::vector<float> res(output_size);
-  for (int i = 0; i < output_size; ++i) {
-    res[i] = pdata[i];
-  }
-  return res;
-}
-
-TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); }
-
-TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(false);
-
-  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
-  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
-    predictors.emplace_back(predictors.front()->Clone());
-  }
-  double total_time_of_threads{0};
-  std::vector<std::thread> threads;
-
-  for (int tid = 0; tid < FLAGS_num_threads; tid++) {
-    threads.emplace_back([&, tid] {
-      auto &predictor = predictors[tid];
-      std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
-      PrepareZeroCopyInputs(predictor, &inputs);
-      auto output_tensor = predictor->GetOutputTensor(out_var_name);
-      Timer timer;
-      double total_time{0};
-
-      LOG(INFO) << "Warm up run...";
-      timer.tic();
-      predictor->ZeroCopyRun();
-      PrintTime(FLAGS_batch_size, 1, FLAGS_num_threads, tid, timer.toc(), 1);
-      if (FLAGS_profile) {
-        paddle::platform::ResetProfiler();
-      }
-      int repeat_times = FLAGS_repeat;
-      LOG(INFO) << "Run " << repeat_times << " times...";
-      timer.tic();
-      for (int i = 0; i < repeat_times; i++) {
-        predictor->ZeroCopyRun();
-      }
-      total_time += timer.toc();
-      total_time_of_threads += total_time;
-      LOG(INFO) << "thread time: " << total_time / repeat_times;
-    });
-  }
-
-  for (auto &t : threads) {
-    t.join();
-  }
-
-  LOG(INFO) << "average time: "
-            << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat;
-}
-
-TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); }
-
-TEST(Analyzer_seq_pool1, zerocopy_compare_native) {
-  AnalysisConfig config;
-  SetConfig(&config);
-  config.SwitchUseFeedFetchOps(true);
-  auto predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
-  std::vector<PaddleTensor> native_outputs;
+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_seq_pool1, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
-  ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs));
-  EXPECT_EQ(native_outputs.size(), 1UL);
-
-  auto zerocopy_output = zerocopy_profile(1);
-  EXPECT_EQ(zerocopy_output.size() * sizeof(float),
-            native_outputs.front().data.length());
-  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
-  for (size_t i = 0; i < zerocopy_output.size(); ++i) {
-    EXPECT_LT(
-        std::fabs((zerocopy_output[i] - native_data[i]) / zerocopy_output[i]),
-        1e-3);
-  }
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back(out_var_name);
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             input_slots_all, outputs_name);
 }

 }  // namespace analysis
...
@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true,
 DEFINE_bool(record_benchmark, false,
             "Record benchmark after profiling the model");
 DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
+DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");

 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
@@ -67,6 +68,7 @@ void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
   LOG(INFO) << analysis_config->ToNativeConfig();
 }

+// Compare result between two PaddleTensor
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
   EXPECT_GT(outputs.size(), 0UL);
@@ -108,6 +110,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
   }
 }

+// Compare result between a PaddleTensor and a ZeroCopyTensor
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<ZeroCopyTensor> &ref_outputs) {
+  EXPECT_GT(outputs.size(), 0UL);
+  EXPECT_EQ(outputs.size(), ref_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &ref_out = ref_outputs[i];
+    size_t size = VecReduceToInt(out.shape);
+    EXPECT_GT(size, 0UL);
+    int ref_size = 0;  // this is the number of elements not memory size
+    PaddlePlace place;
+    switch (out.dtype) {
+      case PaddleDType::INT64: {
+        int64_t *pdata = static_cast<int64_t *>(out.data.data());
+        int64_t *pdata_ref = ref_out.data<int64_t>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+      case PaddleDType::FLOAT32: {
+        float *pdata = static_cast<float *>(out.data.data());
+        float *pdata_ref = ref_out.data<float>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
+        }
+        break;
+      }
+      case PaddleDType::INT32: {
+        int32_t *pdata = static_cast<int32_t *>(out.data.data());
+        int32_t *pdata_ref = ref_out.data<int32_t>(&place, &ref_size);
+        EXPECT_EQ(size, ref_size);
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+    }
+  }
+}
 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
     const PaddlePredictor::Config *config, bool use_analysis = true) {
   const auto *analysis_config =
@@ -205,52 +251,99 @@ void GetInputPerBatch(const std::vector<std::vector<int64_t>> &in,
   }
 }

-void TestOneThreadPrediction(
-    const PaddlePredictor::Config *config,
+void ConvertPaddleTensorToZeroCopyTensor(
+    PaddlePredictor *predictor, const std::vector<PaddleTensor> &inputs) {
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto input = inputs[i];
+    auto tensor = predictor->GetInputTensor(input.name);
+    tensor->Reshape(input.shape);
+    tensor->SetLoD({input.lod});
+    if (input.dtype == PaddleDType::INT64) {
+      ZeroCopyTensorAssignData<int64_t>(tensor.get(), input.data);
+    } else if (input.dtype == PaddleDType::FLOAT32) {
+      ZeroCopyTensorAssignData<float>(tensor.get(), input.data);
+    } else if (input.dtype == PaddleDType::INT32) {
+      ZeroCopyTensorAssignData<int32_t>(tensor.get(), input.data);
+    } else {
+      LOG(ERROR) << "unsupported feed type " << input.dtype;
+    }
+  }
+}
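Reviewer note: together with the new --zero_copy flag, this converter is what the warm-up and timing loops below call before each ZeroCopyRun(). One zero-copy iteration, sketched:

// Sketch: bind prepared PaddleTensor inputs, then run without feed/fetch
// ops (predictor built from a config with SwitchUseFeedFetchOps(false)).
ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]);
predictor->ZeroCopyRun();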
+void PredictionWarmUp(PaddlePredictor *predictor,
                       const std::vector<std::vector<PaddleTensor>> &inputs,
-                      std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
+                      std::vector<PaddleTensor> *outputs, int num_threads,
+                      int tid) {
   int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
-  auto predictor = CreateTestPredictor(config, use_analysis);
+  LOG(INFO) << "Running thread " << tid << ", warm up run...";
+  if (FLAGS_zero_copy) {
+    ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]);
+  }
-  // warmup run
-  LOG(INFO) << "Warm up run...";
-  {
   Timer warmup_timer;
   warmup_timer.tic();
+  if (!FLAGS_zero_copy) {
     predictor->Run(inputs[0], outputs, batch_size);
-    PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
+  } else {
+    predictor->ZeroCopyRun();
+  }
+  PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
   if (FLAGS_profile) {
     paddle::platform::ResetProfiler();
   }
 }

-  LOG(INFO) << "Run " << num_times << " times...";
-  {
+void PredictionRun(PaddlePredictor *predictor,
+                   const std::vector<std::vector<PaddleTensor>> &inputs,
+                   std::vector<PaddleTensor> *outputs, int num_threads,
+                   int tid) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
   Timer run_timer;
-  run_timer.tic();
+  double elapsed_time = 0;
 #ifdef WITH_GPERFTOOLS
   ProfilerStart("paddle_inference.prof");
 #endif
-  for (int i = 0; i < num_times; i++) {
-    for (size_t j = 0; j < inputs.size(); j++) {
-      predictor->Run(inputs[j], outputs, batch_size);
+  if (!FLAGS_zero_copy) {
+    run_timer.tic();
+    for (size_t i = 0; i < inputs.size(); i++) {
+      for (int j = 0; j < num_times; j++) {
+        predictor->Run(inputs[i], outputs, batch_size);
+      }
+    }
+    elapsed_time = run_timer.toc();
+  } else {
+    for (size_t i = 0; i < inputs.size(); i++) {
+      ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
+      run_timer.tic();
+      for (int j = 0; j < num_times; j++) {
+        predictor->ZeroCopyRun();
+      }
+      elapsed_time += run_timer.toc();
     }
   }
 #ifdef WITH_GPERFTOOLS
   ProfilerStop();
 #endif
-  double latency = run_timer.toc() / (num_times > 1 ? num_times : 1);
-  PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
+  PrintTime(batch_size, num_times, num_threads, tid, elapsed_time / num_times,
+            inputs.size());
   if (FLAGS_record_benchmark) {
     Benchmark benchmark;
     benchmark.SetName(FLAGS_model_name);
     benchmark.SetBatchSize(batch_size);
-    benchmark.SetLatency(latency);
+    benchmark.SetLatency(elapsed_time / num_times);
     benchmark.PersistToFile("benchmark_record.txt");
   }
 }

+void TestOneThreadPrediction(
+    const PaddlePredictor::Config *config,
+    const std::vector<std::vector<PaddleTensor>> &inputs,
+    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
+  auto predictor = CreateTestPredictor(config, use_analysis);
+  PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
+  PredictionRun(predictor.get(), inputs, outputs, 1, 0);
 }
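Reviewer note on timing semantics: in the zero-copy branch of PredictionRun, ConvertPaddleTensorToZeroCopyTensor runs before run_timer.tic(), so the reported latency excludes feed conversion, while the regular branch times only the Run() calls. In both branches the metric handed to PrintTime is, in effect:

// Average wall time of one repeat, covering all input batches
// (num_times == FLAGS_repeat in the code above):
double latency = elapsed_time / FLAGS_repeat;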
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
...@@ -258,8 +351,6 @@ void TestMultiThreadPrediction( ...@@ -258,8 +351,6 @@ void TestMultiThreadPrediction(
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = true) { bool use_analysis = true) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
std::vector<std::thread> threads; std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors; std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreateTestPredictor(config, use_analysis)); predictors.emplace_back(CreateTestPredictor(config, use_analysis));
...@@ -267,7 +358,6 @@ void TestMultiThreadPrediction( ...@@ -267,7 +358,6 @@ void TestMultiThreadPrediction(
predictors.emplace_back(predictors.front()->Clone()); predictors.emplace_back(predictors.front()->Clone());
} }
size_t total_time{0};
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local inputs and outputs.
...@@ -280,34 +370,8 @@ void TestMultiThreadPrediction(
->SetMkldnnThreadID(static_cast<int>(tid) + 1);
}
#endif
PredictionWarmUp(predictor.get(), inputs, outputs, num_threads, tid);
PredictionRun(predictor.get(), inputs, outputs, num_threads, tid);
});
}
for (int i = 0; i < num_threads; ++i) {
...@@ -367,6 +431,31 @@ void CompareNativeAndAnalysis(
CompareResult(analysis_outputs, native_outputs);
}
void CompareAnalysisAndZeroCopy(
PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs,
const std::vector<std::string> &outputs_name) {
int batch_size = FLAGS_batch_size;
// analysis
std::vector<PaddleTensor> analysis_outputs;
auto predictor = CreateTestPredictor(config, true);
predictor->Run(inputs[0], &analysis_outputs, batch_size);
// analysis + zero_copy
std::vector<ZeroCopyTensor> zerocopy_outputs;
reinterpret_cast<AnalysisConfig *>(config)->SwitchUseFeedFetchOps(false);
predictor = CreateTestPredictor(config, true);
ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]);
predictor->ZeroCopyRun();
for (size_t i = 0; i < outputs_name.size(); i++) {
ZeroCopyTensor zerocopy_output =
*predictor->GetOutputTensor(outputs_name[i]).get();
zerocopy_outputs.emplace_back(zerocopy_output);
LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output);
}
// compare
CompareResult(analysis_outputs, zerocopy_outputs);
}
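For readers following the test flow: CompareResult above is the suite's own helper, but the idea it implements is a plain element-wise tolerance check. A minimal standalone sketch of that idea (CompareFloats and atol are illustrative names, not this suite's API):
// Minimal sketch of an element-wise tolerance comparison; CompareFloats and
// atol are hypothetical names, not part of this test suite.
#include <cmath>
#include <cstdio>
#include <vector>
void CompareFloats(const std::vector<float> &ref,
                   const std::vector<float> &out, float atol = 1e-5f) {
  if (ref.size() != out.size()) {
    std::printf("size mismatch: %zu vs %zu\n", ref.size(), out.size());
    return;
  }
  for (size_t i = 0; i < ref.size(); ++i) {
    if (std::fabs(ref[i] - out[i]) > atol) {
      std::printf("mismatch at %zu: %f vs %f\n", i, ref[i], out[i]);
    }
  }
}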
template <typename T>
std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
std::stringstream ss;
......
...@@ -30,18 +30,19 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32)
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
......
add_subdirectory(detail)
add_subdirectory(allocation)
cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler)
cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memory
......
...@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU)
......
...@@ -12,8 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include <utility>
#include <vector>
...@@ -23,9 +22,11 @@
#endif
#include "glog/logging.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
...@@ -328,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
} // namespace legacy
namespace allocation {
LegacyMemMonitor GPUMemMonitor;
Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc;
}
void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place());
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation;
}
......
...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include <cstring> // for memcpy
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace memory {
...@@ -29,14 +30,23 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
#ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
// NOTE(zcd): Avoid using GpuMemcpySync as much as possible, because
// GpuMemcpySync issues the copy command to the default stream, which
// prevents commands from different streams from running concurrently.
// Reference:
// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
...@@ -51,8 +61,10 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
// FIXME(zjl): do we really need it?
if (num <= kMaxGpuAsyncCopyBytes) {
...@@ -68,15 +80,19 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
}
} else {
if (stream) {
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU");
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, stream);
} else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU");
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num);
}
...@@ -111,8 +127,10 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
}
...@@ -124,8 +142,10 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
if (stream) {
platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
} else {
platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU");
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
}
......
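The NOTE in this file can be made concrete with a small standalone sketch (hypothetical; assumes valid device pointers and, for real overlap, pinned host memory): copies issued on distinct non-default streams may overlap, while a synchronous default-stream copy serializes with both.
// Sketch: two async copies on independent streams may overlap; a legacy
// default-stream cudaMemcpy here would implicitly serialize with both.
#include <cuda_runtime.h>
void CopyOnTwoStreams(void *dst0, const void *src0, void *dst1,
                      const void *src1, size_t num) {
  cudaStream_t s0, s1;
  cudaStreamCreate(&s0);
  cudaStreamCreate(&s1);
  cudaMemcpyAsync(dst0, src0, num, cudaMemcpyDeviceToHost, s0);
  cudaMemcpyAsync(dst1, src1, num, cudaMemcpyDeviceToHost, s1);
  // cudaMemcpy(dst0, src0, num, cudaMemcpyDeviceToHost);  // would serialize
  cudaStreamSynchronize(s0);
  cudaStreamSynchronize(s1);
  cudaStreamDestroy(s0);
  cudaStreamDestroy(s1);
}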
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
#ifdef PADDLE_WITH_CUDA
...@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$
)DOC";
class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of acos operator");
AddOutput("Out", "Output of acos operator");
AddComment(R"DOC(
Arccosine Activation Operator.
$$out = \cos^{-1}(x)$$
)DOC");
}
};
class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of asin operator");
AddOutput("Out", "Output of asin operator");
AddComment(R"DOC(
Arcsine Activation Operator.
$$out = \sin^{-1}(x)$$
)DOC");
}
};
class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of atan operator");
AddOutput("Out", "Output of atan operator");
AddComment(R"DOC(
Arctangent Activation Operator.
$$out = \tan^{-1}(x)$$
)DOC");
}
};
class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
...@@ -543,7 +587,10 @@ namespace ops = paddle::operators;
__macro(SoftShrink, softshrink); \
__macro(Abs, abs); \
__macro(Cos, cos); \
__macro(Acos, acos); \
__macro(Sin, sin); \
__macro(Asin, asin); \
__macro(Atan, atan); \
__macro(Round, round); \
__macro(Log, log); \
__macro(Square, square); \
......
...@@ -40,8 +40,7 @@ namespace operators {
*/
static std::unordered_set<std::string> InplaceOpSet = {
"sigmoid", "exp", "relu", "tanh", "sqrt", "ceil",
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"};
static bool IsInplace(const std::string& op) {
bool inplace = InplaceOpSet.count(op);
...@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct Acos {
HOSTDEVICE T operator()(const T& val) const { return acos(val); }
};
template <>
struct Acos<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(acos(static_cast<float>(val)));
}
};
// Acos(x) = acos(x)
template <typename T>
struct AcosFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Acos<T>());
}
};
// acos'(x) = -1/sqrt(1-x^2)
template <typename T>
struct AcosGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
-dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
}
};
template <typename T>
struct Asin {
HOSTDEVICE T operator()(const T& val) const { return asin(val); }
};
template <>
struct Asin<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(asin(static_cast<float>(val)));
}
};
// Asin(x) = asin(x)
template <typename T>
struct AsinFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Asin<T>());
}
};
// asin'(x) = 1/sqrt(1-x^2)
template <typename T>
struct AsinGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
}
};
template <typename T>
struct Atan {
HOSTDEVICE T operator()(const T& val) const { return atan(val); }
};
template <>
struct Atan<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(atan(static_cast<float>(val)));
}
};
// Atan(x) = atan(x)
template <typename T>
struct AtanFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Atan<T>());
}
};
// atan'(x) = 1 / (1 + x^2)
template <typename T>
struct AtanGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
}
};
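The gradient comments on the three functors above follow from implicit differentiation; for instance, for $y = \sin^{-1}(x)$, $\sin y = x$ gives $\cos y \cdot y' = 1$, hence $y' = 1/\sqrt{1 - x^2}$. Collected (standard calculus, stated here only for reference):
\frac{d}{dx}\,\sin^{-1}(x) = \frac{1}{\sqrt{1 - x^2}}, \qquad
\frac{d}{dx}\,\cos^{-1}(x) = -\frac{1}{\sqrt{1 - x^2}}, \qquad
\frac{d}{dx}\,\tan^{-1}(x) = \frac{1}{1 + x^2}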
// round(x) = [x]
template <typename T>
struct RoundFunctor : public BaseActivationFunctor<T> {
...@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(atan, AtanFunctor, AtanGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, FloorFunctor, ZeroGradFunctor); \
__macro(cos, CosFunctor, CosGradFunctor); \
__macro(acos, AcosFunctor, AcosGradFunctor); \
__macro(sin, SinFunctor, SinGradFunctor); \
__macro(asin, AsinFunctor, AsinGradFunctor); \
__macro(round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \
......
include(operators)
register_operators(DEPS naive_executor)
cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
...@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
...@@ -26,14 +27,6 @@ namespace operators {
using StepScopeVar = std::vector<framework::Scope *>;
using LoDTensor = framework::LoDTensor;
namespace { // NOLINT
static std::string GetSkipEagerDeletionVarsDebugString(
const std::vector<std::string> &vars) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include <string>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace operators {
// OpVariant is a wrapper class of OpDesc and OperatorBase,
// so that their APIs can be accessed uniformly.
class OpVariant {
struct InputsVisitor
: public boost::static_visitor<const framework::VariableNameMap *> {
template <typename OpType>
const framework::VariableNameMap *operator()(const OpType *op) const {
return &(op->Inputs());
}
};
struct OutputsVisitor
: public boost::static_visitor<const framework::VariableNameMap *> {
template <typename OpType>
const framework::VariableNameMap *operator()(const OpType *op) const {
return &(op->Outputs());
}
};
struct AttributeMapVisitor
: public boost::static_visitor<const framework::AttributeMap *> {
const framework::AttributeMap *operator()(
const framework::OpDesc *op) const {
return &(op->GetAttrMap());
}
const framework::AttributeMap *operator()(
const framework::OperatorBase *op) const {
return &(op->Attrs());
}
};
struct RawPointerVisitor : public boost::static_visitor<const void *> {
template <typename OpType>
const void *operator()(const OpType *op) const {
return op;
}
};
public:
OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT
OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT
const framework::VariableNameMap &Inputs() const {
return *boost::apply_visitor(InputsVisitor(), op_);
}
const framework::VariableNameMap &Outputs() const {
return *boost::apply_visitor(OutputsVisitor(), op_);
}
const framework::AttributeMap &Attrs() const {
return *boost::apply_visitor(AttributeMapVisitor(), op_);
}
template <typename AttrType>
const AttrType &Attr(const std::string &name) const {
auto &attrs = Attrs();
auto it = attrs.find(name);
PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name);
return boost::get<AttrType>(it->second);
}
bool operator==(const OpVariant &other) const {
return RawPointer() == other.RawPointer();
}
const void *RawPointer() const {
return boost::apply_visitor(RawPointerVisitor(), op_);
}
int which() const { return static_cast<int>(op_.which()); }
struct Hasher {
size_t operator()(const OpVariant &op) const {
return reinterpret_cast<size_t>(op.RawPointer());
}
};
private:
const boost::variant<const framework::OperatorBase *,
const framework::OpDesc *>
op_;
};
static std::string GetDebugString(const std::vector<std::string> &names) {
if (names.empty()) return "";
std::string ret = names[0];
for (size_t i = 1; i < names.size(); ++i) {
ret += (" " + names[i]);
}
return ret;
}
// Set the skip variables of while_op and while_grad_op.
// These variables should be skipped when eager deletion is enabled,
// because:
// 1. while_grad_op needs some variables defined in while_op.
// 2. while_grad_op needs variables from the previous time step.
static void SetSkipVars(const OpVariant &op, std::vector<std::string> attr) {
auto &attrs = const_cast<framework::AttributeMap &>(op.Attrs());
VLOG(2) << "Prepare to skip " << attr.size()
<< " var(s): " << GetDebugString(attr);
attrs[kSkipEagerDeletionVars] = std::move(attr);
}
// Check whether the forward while_op matches the while_grad_op,
// since the program may contain many while_ops.
static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op,
const OpVariant &grad_op) {
return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) &&
fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs);
}
// Test whether a variable is skippable in the forward while_op.
// The variable is skippable in while_op when the variable used in
// while_grad is not from grad_block.
static bool IsSkippableVar(const std::string &name,
framework::BlockDesc *grad_block) {
return name != framework::kEmptyVarName && !grad_block->HasVar(name);
}
static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op,
const OpVariant &bwd_op) {
auto *grad_block = bwd_op.Attr<framework::BlockDesc *>(kStepBlock);
// Find all skippable variables in forward while_op
std::unordered_set<std::string> forward_skip_vars;
for (auto *op_desc : grad_block->AllOps()) {
for (auto &in_arg_name : op_desc->InputArgumentNames()) {
if (IsSkippableVar(in_arg_name, grad_block)) {
forward_skip_vars.insert(in_arg_name);
}
}
for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
if (IsSkippableVar(out_arg_name, grad_block)) {
forward_skip_vars.insert(out_arg_name);
}
}
}
SetSkipVars(fwd_op, std::vector<std::string>(forward_skip_vars.begin(),
forward_skip_vars.end()));
// Find all skippable variables in while_grad_op
// The skipped variables are those which would be used across time steps.
auto &fwd_input = fwd_op.Inputs().at(kX);
auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX));
PADDLE_ENFORCE_EQ(
fwd_input.size(), in_grads.size(),
"Backward input gradient number does not match forward input number.");
std::unordered_set<std::string> backward_skip_vars;
for (size_t i = 0; i < in_grads.size(); ++i) {
if (in_grads[i] == framework::kEmptyVarName) {
continue;
}
backward_skip_vars.insert(in_grads[i]);
backward_skip_vars.insert(framework::GradVarName(fwd_input[i]));
}
SetSkipVars(bwd_op, std::vector<std::string>(backward_skip_vars.begin(),
backward_skip_vars.end()));
}
// Find all while_ops and while_grad_ops in the graph or program.
// The while_grad_op and while_op may be located in different blocks,
// so we should traverse all blocks in the program to find them.
static void FindAllWhileAndWhileGradOp(std::vector<OpVariant> *while_ops,
std::vector<OpVariant> *while_grad_ops) {
PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size());
if (while_ops->empty()) return;
const auto *program =
while_ops->front().Attr<framework::BlockDesc *>(kStepBlock)->Program();
for (size_t i = 1; i < program->Size(); ++i) {
auto &block = program->Block(i);
for (size_t j = 0; j < block.OpSize(); ++j) {
auto *op = block.Op(j);
if (op->Type() == "while") {
while_ops->emplace_back(op);
} else if (op->Type() == "while_grad") {
while_grad_ops->emplace_back(op);
}
}
}
PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(),
"There are extra while_grad ops in the graph or program");
}
static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(
std::vector<OpVariant> *while_ops, std::vector<OpVariant> *while_grad_ops) {
FindAllWhileAndWhileGradOp(while_ops, while_grad_ops);
VLOG(2) << "Found while op num: " << while_ops->size()
<< ", while grad op num: " << while_grad_ops->size();
if (while_grad_ops->empty()) {
return;
}
std::unordered_set<OpVariant, OpVariant::Hasher> while_op_set(
while_ops->begin(), while_ops->end());
for (auto &bwd_op : *while_grad_ops) {
const OpVariant *matched_fwd_op = nullptr;
for (auto &fwd_op : while_op_set) {
if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) {
PADDLE_ENFORCE(matched_fwd_op == nullptr,
"Found multiple matched while ops");
matched_fwd_op = &fwd_op;
}
}
PADDLE_ENFORCE_NOT_NULL(matched_fwd_op,
"Cannot find matched forward while op.");
ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op);
while_op_set.erase(*matched_fwd_op);
}
}
void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
int block_id,
const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) {
// If block_id is not 0, return directly.
// This is because all while_ops and while_grad_ops in the whole program
// are processed when block_id is 0 (i.e., when Executor::Run() or the
// ParallelExecutor constructor is called).
// What's more, all while_ops and while_grad_ops must be processed when
// block_id is zero. Otherwise, while_op may run first and erase variables
// used in while_grad_op; at that moment, the while_grad_ops may not be
// constructed yet.
if (block_id != 0) return;
std::vector<OpVariant> fwd_ops, bwd_ops;
for (auto &op : all_ops) {
if (op->Type() == "while") {
fwd_ops.emplace_back(op.get());
} else if (op->Type() == "while_grad") {
bwd_ops.emplace_back(op.get());
}
}
PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
}
void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
const std::vector<framework::OperatorBase *> &while_ops,
const std::vector<framework::OperatorBase *> &while_grad_ops) {
std::vector<OpVariant> fwd_ops, bwd_ops;
fwd_ops.reserve(while_ops.size());
for (auto *op : while_ops) {
fwd_ops.emplace_back(op);
}
bwd_ops.reserve(while_grad_ops.size());
for (auto *op : while_grad_ops) {
bwd_ops.emplace_back(op);
}
PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops);
}
} // namespace operators
} // namespace paddle
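OpVariant above leans on the boost::variant static-visitor pattern to give OpDesc and OperatorBase a uniform surface. A minimal standalone sketch of that pattern, under the assumption that Boost is available (types A/B and NameVisitor are illustrative only, not Paddle types):
#include <iostream>
#include <string>
#include <boost/variant.hpp>
struct A { std::string Name() const { return "A"; } };
struct B { std::string Title() const { return "B"; } };
// One visitor reconciles the two unrelated interfaces.
struct NameVisitor : public boost::static_visitor<std::string> {
  std::string operator()(const A *a) const { return a->Name(); }
  std::string operator()(const B *b) const { return b->Title(); }
};
int main() {
  A a;
  B b;
  boost::variant<const A *, const B *> v = &a;
  std::cout << boost::apply_visitor(NameVisitor(), v) << "\n";  // prints A
  v = &b;
  std::cout << boost::apply_visitor(NameVisitor(), v) << "\n";  // prints B
  return 0;
}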
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
...@@ -14,19 +14,30 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace operators {
static constexpr char kStepBlock[] = "sub_block";
static constexpr char kCondition[] = "Condition";
static constexpr char kStepScopes[] = "StepScopes";
static constexpr char kX[] = "X";
static constexpr char kXGRAD[] = "X@GRAD";
static constexpr char kOutputs[] = "Out";
static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
int block_id,
const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops);
void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
const std::vector<framework::OperatorBase *> &while_ops,
const std::vector<framework::OperatorBase *> &while_grad_ops);
} // namespace operators
} // namespace paddle
...@@ -82,8 +82,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
Tensor track;
int* track_value =
track.mutable_data<int>(emission_dims, platform::CPUPlace());
auto ker =
jit::KernelFuncs<jit::CRFDecodingTuple<T>, platform::CPUPlace>::Cache()
.At(tag_num);
ker(static_cast<int>(seq_len), x, w, alpha_value, track_value, tag_num);
T max_score = -std::numeric_limits<T>::max();
int max_i = 0;
......
...@@ -20,7 +20,7 @@ namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string &type) {
if (type == "encode_center_size") {
return BoxCodeType::kEncodeCenterSize;
} else if (type == "decode_center_size") {
...@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
template <typename DeviceContext, typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
public:
void EncodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor *prior_box,
const framework::Tensor *prior_box_var,
const bool normalized,
const std::vector<float> variance, T *output) const {
int64_t row = target_box->dims()[0];
int64_t col = prior_box->dims()[0];
int64_t len = prior_box->dims()[1];
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
size_t offset = i * col * len + j * len;
T prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false);
T prior_box_height = prior_box_data[j * len + 3] -
...@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
target_box_data[i * len + 1] +
(normalized == false);
output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] =
...@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height));
}
}
if (prior_box_var) {
const T *prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
int prior_var_offset = j * len;
output[offset + k] /= prior_box_var_data[prior_var_offset + k];
}
}
}
} else if (!(variance.empty())) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
output[offset + k] /= static_cast<T>(variance[k]);
}
}
}
}
}
template <int axis, int var_size>
void DecodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor *prior_box,
const framework::Tensor *prior_box_var,
const bool normalized, std::vector<float> variance,
T *output) const {
int64_t row = target_box->dims()[0];
int64_t col = target_box->dims()[1];
int64_t len = target_box->dims()[2];
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
T var_data[4] = {1., 1., 1., 1.};
T *var_ptr = var_data;
size_t offset = i * col * len + j * len;
int prior_box_offset = axis == 0 ? j * len : i * len;
T prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] +
(normalized == false);
...@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T target_box_width = 0, target_box_height = 0;
int prior_var_offset = axis == 0 ? j * len : i * len;
if (var_size == 2) {
std::memcpy(var_ptr, prior_box_var->data<T>() + prior_var_offset,
4 * sizeof(T));
} else if (var_size == 1) {
var_ptr = reinterpret_cast<T *>(variance.data());
}
T box_var_x = *var_ptr;
T box_var_y = *(var_ptr + 1);
...@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
}
}
void Compute(const framework::ExecutionContext &context) const override {
auto *prior_box = context.Input<framework::Tensor>("PriorBox");
auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
auto *output_box = context.Output<framework::Tensor>("OutputBox");
std::vector<float> variance = context.Attr<std::vector<float>>("variance");
const int axis = context.Attr<int>("axis");
if (target_box->lod().size()) {
...@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
T *output = output_box->data<T>();
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
variance, output);
......
...@@ -110,8 +110,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
constexpr int simd_width = 16;
int C = c / simd_width;
auto multiply = jit::KernelFuncs<jit::NCHW16CMulNCTuple<T>,
platform::CPUPlace>::Cache()
.At(0);
#pragma omp parallel for collapse(2)
for (int ni = 0; ni < n; ni++) {
for (int ci = 0; ci < C; ci++) {
......
...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
...@@ -76,6 +77,63 @@ $$Out = \frac{scale*X}{ max_range }$$
}
};
class FakeChannelWiseDequantizeMaxAbsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("X"),
"Input(X) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
PADDLE_ENFORCE(ctx->HasInputs("Scales"),
"Input(Scales) of FakeChannelWiseDequantizeMaxAbsOp "
"should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FakeChannelWiseDequantizeMaxAbsOp should not be null.");
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class FakeChannelWiseDequantizeMaxAbsOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input with float-32/64 type is the "
"low precision tensor.");
AddInput("Scales",
"(Tensors) The scales in quantization stage. "
"Now, `Scales` is a vector with at most two tensors. "
"If Scales has two elements, the second tensor should only have "
"one value.")
.AsDuplicable();
AddOutput("Out",
"(Tensor) The output is the dequantized high "
"precision tensor.");
AddAttr<std::vector<int>>(
"quant_bits",
"Quantization bit numbers in quantization stage. "
"The size of `quant_bits` should be equal to the size of `Scales`.")
.SetDefault({8});
AddComment(R"DOC(
FakeChannelWiseDequantizeMaxAbsOp operator.
This calculation is the inverse operation of FakeChannelWiseQuantizeMaxAbsOp:
$$Out_c = \frac{X_c\prod_{i=1}^{n}Scales_{ic}}{\prod_{i=1}^{n}(2^{quant\_bits_i-1}-1)}$$
In the above formula, the channel index $c$ satisfies $0 \leq c \lt \ the\ channel\ number\ of\ X$.
Besides, the size of $quant\_bits$ should be equal to the size of $Scales$, which is denoted as $n$ in the formula.
Notes: In general, per-channel quantization is only applied to weights, while activations use per-layer quantization.
)DOC");
}
};
} // namespace operators
} // namespace paddle
...@@ -88,3 +146,11 @@ REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
ops::FakeDequantizeMaxAbsKernel<CPU, float>,
ops::FakeDequantizeMaxAbsKernel<CPU, double>);
REGISTER_OPERATOR(fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsOp,
ops::FakeChannelWiseDequantizeMaxAbsOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, float>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CPU, double>);
...@@ -55,3 +55,7 @@ using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_dequantize_max_abs,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, float>,
ops::FakeChannelWiseDequantizeMaxAbsKernel<CUDA, double>);
...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
...@@ -45,5 +46,42 @@ class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<framework::Tensor>("X");
auto scales = ctx.MultiInput<framework::Tensor>("Scales");
auto* out = ctx.Output<framework::Tensor>("Out");
PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
"The number of first scale values must equal the "
"first dimension of Input(X).");
auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
int max_range = std::pow(2, quant_bits[0] - 1) - 1;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
auto dequant = DequantizeFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
dequant(dev_ctx, &one_channel_in, &one_channel_scale,
static_cast<T>(max_range), &one_channel_out);
}
if (scales.size() == 2) {
PADDLE_ENFORCE_EQ(
scales[1]->numel(), 1,
"The second scale tensor should only have one value for now.");
max_range = std::pow(2, quant_bits[1] - 1) - 1;
dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out);
}
}
};
} // namespace operators
} // namespace paddle
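As a sanity check on the dequantize formula the kernel above applies per channel, a scalar walk-through (the values are made up; quant_bits = 8 gives max_range = 2^7 - 1 = 127):
#include <cmath>
#include <cstdio>
int main() {
  const int quant_bits = 8;
  const float max_range = std::pow(2.f, quant_bits - 1) - 1;  // 127
  const float scale_c = 0.5f;  // abs-max of channel c, saved at quantize time
  const float x_q = 64.f;      // one quantized value stored in channel c
  const float x = x_q * scale_c / max_range;  // ~0.2520, the recovered value
  std::printf("dequantized: %f\n", x);
  return 0;
}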
...@@ -134,6 +134,60 @@ $$Out = round(X/scale * range)$$
}
};
class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeChannelWiseQuantizeOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("OutScales"),
"Output(OutScales) of FakeChannelWiseQuantizeOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
ctx.GetPlace());
}
};
class FakeChannelWiseQuantizeAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddOutput("Out",
"(Tensor) Output of quantized low level tensor, "
"but also saved as float data type.");
AddOutput("OutScales", "(Tensor) Current channel wise scale");
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16.");
});
AddComment(R"DOC(
The scale of FakeChannelWiseQuantize operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c})$$
In the above three formulas, the range of c is as follows:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantizeRangeAbsMaxOp(const std::string& type,
...@@ -218,3 +272,10 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CPU, float>);
...@@ -174,5 +174,7 @@ namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
...@@ -63,6 +63,39 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scales = context.Output<framework::Tensor>("OutScales");
T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
out->mutable_data<T>(context.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto find_abs_max = FindAbsMaxFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel = in->Slice(i, i + 1);
const T* one_channel_data = one_channel.data<T>();
find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
&out_scales_data[i]);
}
auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
for (int64_t i = 0; i < in->dims()[0]; i++) {
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
&one_channel_out);
}
}
};
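Conversely, a scalar sketch of what FakeChannelWiseQuantizeAbsMaxKernel computes per channel: take the channel's abs-max as the scale, then scale and round each value into [-bin_cnt, bin_cnt] (the channel values below are made up for illustration):
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>
int main() {
  const int bit_length = 8;
  const float bin_cnt = std::pow(2.f, bit_length - 1) - 1;  // 127
  const std::vector<float> channel = {0.1f, -0.5f, 0.25f};
  float scale = 0.f;
  for (float v : channel) scale = std::max(scale, std::fabs(v));  // abs-max
  for (float v : channel) {
    const float q = std::round(v / scale * bin_cnt);  // quantized level
    std::printf("%f -> %f\n", v, q);
  }
  return 0;
}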
template <typename DeviceContext, typename T>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public:
......
...@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("W"),
"Input W of FusedEmbeddingSeqPoolOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Ids"),
...@@ -91,6 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) "
"Sparse update.")
.SetDefault(false);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
FusedEmbeddingSeqPool Operator. FusedEmbeddingSeqPool Operator.
......
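The two deleted lines above are the old `if (ctx->IsRuntime()) return;` early exit; instead the op is now tagged with the `framework::kAllKernelsMustComputeRuntimeShape` attribute (HashOp gets the same attribute later in this diff), which tells the executor that the runtime InferShape pass can be skipped because every kernel of this op resizes its outputs itself. A sketch of the pattern, using a hypothetical `MyOpMaker` for illustration and assuming the usual Paddle op headers:

// Hypothetical maker illustrating the marker attribute from this diff.
class MyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(LoDTensor) input");
    AddOutput("Out", "(LoDTensor) output");
    // Executor hint: skip runtime InferShape for this op; each kernel
    // computes and sets its output shapes inside Compute().
    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
        .SetDefault(true);
  }
};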
...@@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor { ...@@ -52,8 +52,9 @@ struct EmbeddingVSumFunctor {
out_width, jit::SeqPoolType::kSum); out_width, jit::SeqPoolType::kSum);
for (size_t i = 0; i != ids_lod.size() - 1; ++i) { for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
attr.index_height = ids_lod[i + 1] - ids_lod[i]; attr.index_height = ids_lod[i + 1] - ids_lod[i];
auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>, auto emb_seqpool =
platform::CPUPlace>(attr); jit::KernelFuncs<jit::EmbSeqPoolTuple<T>, platform::CPUPlace>::Cache()
.At(attr);
emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width, emb_seqpool(table, ids + ids_lod[i] * idx_width, output + i * out_width,
&attr); &attr);
} }
...@@ -120,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> { ...@@ -120,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
auto *ids = context.Input<LoDTensor>("Ids"); auto *ids = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out")); auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W")); auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
// runtime shape
d_table->set_height(table_dim[0]);
auto *ids_data = ids->data<int64_t>(); auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel(); int64_t ids_num = ids->numel();
...@@ -135,8 +138,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> { ...@@ -135,8 +138,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace()); T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
const T *d_output_data = d_output->data<T>(); const T *d_output_data = d_output->data<T>();
auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>, auto vbroadcast =
platform::CPUPlace>(out_width); jit::KernelFuncs<jit::VBroadcastTuple<T>, platform::CPUPlace>::Cache()
.At(out_width);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]); int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
const T *src = d_output_data + i * out_width; const T *src = d_output_data + i * out_width;
......
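The jit::Get calls in this and the following hunks all migrate to the same pattern: `jit::KernelFuncs<SomeTuple<T>, Place>::Cache().At(attr)`. The cache is thread-local and keyed by `JitCodeKey(attr)`, so the jitcode/more/refer pools are searched only on the first call per attribute. A minimal before/after sketch using the VBroadcast types from this hunk (variable names are illustrative):

// Before: every call re-searches the kernel pools.
auto vbroadcast_old =
    jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>, platform::CPUPlace>(
        out_width);

// After: the first call resolves the default best implementation and caches
// it; later calls with the same attr are a hash-map lookup.
auto vbroadcast =
    jit::KernelFuncs<jit::VBroadcastTuple<T>, platform::CPUPlace>::Cache().At(
        out_width);
vbroadcast(src, dst, h, out_width);  // the call signature is unchanged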
...@@ -196,11 +196,14 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -196,11 +196,14 @@ class FusionGRUKernel : public framework::OpKernel<T> {
jit::to_kerneltype(ctx.Attr<std::string>("activation"))); \ jit::to_kerneltype(ctx.Attr<std::string>("activation"))); \
jit::gru_t one_step; \ jit::gru_t one_step; \
auto ComputeH1 = \ auto ComputeH1 = \
jit::Get<jit::kGRUH1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \ jit::KernelFuncs<jit::GRUH1Tuple<T>, platform::CPUPlace>::Cache().At( \
attr); \
auto ComputeHtPart1 = \ auto ComputeHtPart1 = \
jit::Get<jit::kGRUHtPart1, jit::GRUTuples<T>, platform::CPUPlace>(attr); \ jit::KernelFuncs<jit::GRUHtPart1Tuple<T>, platform::CPUPlace>::Cache() \
.At(attr); \
auto ComputeHtPart2 = \ auto ComputeHtPart2 = \
jit::Get<jit::kGRUHtPart2, jit::GRUTuples<T>, platform::CPUPlace>(attr); \ jit::KernelFuncs<jit::GRUHtPart2Tuple<T>, platform::CPUPlace>::Cache() \
.At(attr); \
const T* x_data = x->data<T>(); \ const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \ const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \ const T* wh_data = wh->data<T>(); \
......
...@@ -258,9 +258,11 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -258,9 +258,11 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
one_step.wp = wp_data; \ one_step.wp = wp_data; \
one_step.checked = checked_cell_data; \ one_step.checked = checked_cell_data; \
auto ComputeC1H1 = \ auto ComputeC1H1 = \
jit::Get<jit::kLSTMC1H1, jit::LSTMTuples<T>, platform::CPUPlace>(attr); \ jit::KernelFuncs<jit::LSTMC1H1Tuple<T>, platform::CPUPlace>::Cache().At( \
attr); \
auto ComputeCtHt = \ auto ComputeCtHt = \
jit::Get<jit::kLSTMCtHt, jit::LSTMTuples<T>, platform::CPUPlace>(attr) jit::KernelFuncs<jit::LSTMCtHtTuple<T>, platform::CPUPlace>::Cache().At( \
attr)
// Wh GEMM // Wh GEMM
#define GEMM_WH_ADDON(bs, prev, out) \ #define GEMM_WH_ADDON(bs, prev, out) \
......
...@@ -82,9 +82,11 @@ template <typename T> ...@@ -82,9 +82,11 @@ template <typename T>
static void fc_relu(const T* x, const T* w, const T* b, T* y, static void fc_relu(const T* x, const T* w, const T* b, T* y,
const jit::matmul_attr_t& attr) { const jit::matmul_attr_t& attr) {
auto matmul = auto matmul =
jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr); jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
attr);
auto addbias_relu = auto addbias_relu =
jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(attr.n); jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
attr.n);
matmul(x, w, y, &attr); matmul(x, w, y, &attr);
T* dst = y; T* dst = y;
for (int i = 0; i < attr.m; ++i) { for (int i = 0; i < attr.m; ++i) {
......
...@@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> { ...@@ -98,7 +98,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
attr.type = jit::SeqPoolType::kSqrt; attr.type = jit::SeqPoolType::kSqrt;
} }
auto seqpool = auto seqpool =
jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>( jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(
attr); attr);
size_t n = ins.size(); size_t n = ins.size();
size_t dst_step_size = n * w; size_t dst_step_size = n * w;
......
...@@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> { ...@@ -94,19 +94,23 @@ class FusionSquaredMatSubKernel : public framework::OpKernel<T> {
int o_numel = attr.m * attr.n; int o_numel = attr.m * attr.n;
auto vsquare_x = auto vsquare_x =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.m * jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
attr.k); attr.m * attr.k);
auto vsquare_y = auto vsquare_y =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(attr.k * jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
attr.n); attr.k * attr.n);
auto vsquare_xy = auto vsquare_xy =
jit::Get<jit::kVSquare, jit::XYNTuples<T>, platform::CPUPlace>(o_numel); jit::KernelFuncs<jit::VSquareTuple<T>, platform::CPUPlace>::Cache().At(
o_numel);
auto vsub = auto vsub =
jit::Get<jit::kVSub, jit::XYZNTuples<T>, platform::CPUPlace>(o_numel); jit::KernelFuncs<jit::VSubTuple<T>, platform::CPUPlace>::Cache().At(
o_numel);
auto vscal = auto vscal =
jit::Get<jit::kVScal, jit::AXYNTuples<T>, platform::CPUPlace>(o_numel); jit::KernelFuncs<jit::VScalTuple<T>, platform::CPUPlace>::Cache().At(
o_numel);
auto matmul = auto matmul =
jit::Get<jit::kMatMul, jit::MatMulTuples<T>, platform::CPUPlace>(attr); jit::KernelFuncs<jit::MatMulTuple<T>, platform::CPUPlace>::Cache().At(
attr);
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
const T* y_data = y->data<T>(); const T* y_data = y->data<T>();
......
...@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel { ...@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->IsRuntime()) {
return;
}
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of HashOp should not be null."); "Input(X) of HashOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
...@@ -57,6 +54,8 @@ $$Out = scale * X$$ ...@@ -57,6 +54,8 @@ $$Out = scale * X$$
)DOC"); )DOC");
AddAttr<int>("num_hash", "").SetDefault(1); AddAttr<int>("num_hash", "").SetDefault(1);
AddAttr<int>("mod_by", "").SetDefault(100000); AddAttr<int>("mod_by", "").SetDefault(100000);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
} }
}; };
......
...@@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n") ...@@ -5,7 +5,7 @@ file(APPEND ${jit_file} "\#pragma once\n")
file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n") file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place) set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc)
......
...@@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { ...@@ -59,8 +59,6 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \
void BenchJITKernel_##name##_##dtype##_##place##_::Run() void BenchJITKernel_##name##_##dtype##_##place##_::Run()
#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU)
void RUN_ALL_BENCHMARK() { void RUN_ALL_BENCHMARK() {
for (auto p : g_all_benchmarks) { for (auto p : g_all_benchmarks) {
if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) {
...@@ -90,11 +88,11 @@ std::vector<int> TestSizes() { ...@@ -90,11 +88,11 @@ std::vector<int> TestSizes() {
return s; return s;
} }
template <typename KernelTuples, typename... Args> template <typename KernelTuple, typename... Args>
struct BenchFunc { struct BenchFunc {
// returns this function's average time // returns this function's average time
// TODO(TJ): clear cache every time // TODO(TJ): clear cache every time
double operator()(const typename KernelTuples::func_type tgt, Args... args) { double operator()(const typename KernelTuple::func_type tgt, Args... args) {
for (int i = 0; i < FLAGS_burning; ++i) { for (int i = 0; i < FLAGS_burning; ++i) {
tgt(args...); tgt(args...);
} }
...@@ -109,40 +107,17 @@ struct BenchFunc { ...@@ -109,40 +107,17 @@ struct BenchFunc {
namespace jit = paddle::operators::jit; namespace jit = paddle::operators::jit;
template <jit::KernelType KT, typename KernelTuples, typename PlaceType, template <typename KernelTuple, typename PlaceType, typename... Args>
typename... Args> void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { BenchFunc<KernelTuple, Args...> benchmark;
BenchFunc<KernelTuples, Args...> benchmark;
std::vector<std::pair<std::string, double>> infos; std::vector<std::pair<std::string, double>> infos;
// test refer auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
auto refer = jit::GetRefer<KT, KernelTuples>(); for (auto f : funcs) {
if (!refer) { infos.push_back(std::make_pair(f.first, benchmark(f.second, args...)));
LOG(FATAL) << "Refer can not be empty!";
} }
infos.push_back(std::make_pair("Refer", benchmark(refer, args...)));
// test jitcode
auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
if (jitcode) {
infos.push_back(std::make_pair("JitCode", benchmark(jitcode, args...)));
}
// test all impls in more
jit::KernelKey kkey(KT, PlaceType());
auto& pool = jit::KernelPool().Instance().AllKernels();
auto iter = pool.find(kkey);
if (iter != pool.end()) {
auto& impls = iter->second;
for (auto& impl : impls) {
auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
if (i && i->UseMe(attr)) {
auto more = i->GetFunc();
infos.push_back(
std::make_pair(i->ImplType(), benchmark(more, args...)));
}
}
}
// Test result from Get function // Test result from Get function
auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr); auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(attr);
if (!tgt) { if (!tgt) {
LOG(FATAL) << "Target can not be empty!"; LOG(FATAL) << "Target can not be empty!";
} }
...@@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { ...@@ -150,7 +125,8 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
// print // print
std::ostringstream loginfos; std::ostringstream loginfos;
loginfos << "Kernel Type " << jit::to_string(KT) << ": " << attr << ": "; loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": "
<< attr << ": ";
for (auto pair : infos) { for (auto pair : infos) {
loginfos << pair.first << " takes " << pair.second << " us; "; loginfos << pair.first << " takes " << pair.second << " us; ";
} }
...@@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { ...@@ -159,8 +135,9 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
using Tensor = paddle::framework::Tensor; using Tensor = paddle::framework::Tensor;
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchXYZNKernel() { void BenchKernelXYZN() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) { for (int d : TestSizes()) {
Tensor x, y, z; Tensor x, y, z;
x.Resize({d}); x.Resize({d});
...@@ -171,16 +148,16 @@ void BenchXYZNKernel() { ...@@ -171,16 +148,16 @@ void BenchXYZNKernel() {
T* z_data = z.mutable_data<T>(PlaceType()); T* z_data = z.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data); RandomVec<T>(d, x_data);
RandomVec<T>(d, y_data); RandomVec<T>(d, y_data);
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y.data<T>(), z_data,
y.data<T>(), z_data, d); d);
// test inplace // test inplace
BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data<T>(), z_data, BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), z_data, z_data, d);
z_data, d);
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchAXYNKernel() { void BenchKernelAXYN() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) { for (int d : TestSizes()) {
const T a = static_cast<T>(3); const T a = static_cast<T>(3);
Tensor x, y; Tensor x, y;
...@@ -189,26 +166,26 @@ void BenchAXYNKernel() { ...@@ -189,26 +166,26 @@ void BenchAXYNKernel() {
T* x_data = x.mutable_data<T>(PlaceType()); T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType()); T* y_data = y.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data); RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data, BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), y_data, d);
d);
// test inplace // test inplace
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), x_data, BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), x_data, d);
d);
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchXRNKernel() { void BenchKernelXRN() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) { for (int d : TestSizes()) {
Tensor x; Tensor x;
RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType())); RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
T res; T res;
BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data<T>(), &res, d); BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), &res, d);
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchXYNKernel() { void BenchKernelXYN() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) { for (int d : TestSizes()) {
Tensor x, y; Tensor x, y;
x.Resize({d}); x.Resize({d});
...@@ -216,12 +193,13 @@ void BenchXYNKernel() { ...@@ -216,12 +193,13 @@ void BenchXYNKernel() {
T* x_data = x.mutable_data<T>(PlaceType()); T* x_data = x.mutable_data<T>(PlaceType());
T* y_data = y.mutable_data<T>(PlaceType()); T* y_data = y.mutable_data<T>(PlaceType());
RandomVec<T>(d, x_data); RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data<T>(), y_data, d); BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y_data, d);
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchLSTMKernel() { void BenchKernelLSTM() {
using T = typename KernelTuple::data_type;
for (bool use_peephole : {true, false}) { for (bool use_peephole : {true, false}) {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
...@@ -252,13 +230,14 @@ void BenchLSTMKernel() { ...@@ -252,13 +230,14 @@ void BenchLSTMKernel() {
step.wp = wp_data; step.wp = wp_data;
step.checked = checked_data; step.checked = checked_data;
} }
BenchAllImpls<KT, jit::LSTMTuples<T>, PlaceType>(attr, &step, &attr); BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchGRUKernel() { void BenchKernelGRU() {
using T = typename KernelTuple::data_type;
for (int d : TestSizes()) { for (int d : TestSizes()) {
const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
auto place = PlaceType(); auto place = PlaceType();
...@@ -275,12 +254,13 @@ void BenchGRUKernel() { ...@@ -275,12 +254,13 @@ void BenchGRUKernel() {
step.gates = x_data; step.gates = x_data;
step.ht_1 = ht_1_data; step.ht_1 = ht_1_data;
step.ht = ht_data; step.ht = ht_data;
BenchAllImpls<KT, jit::GRUTuples<T>, PlaceType>(attr, &step, &attr); BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchSeqPoolKernel() { void BenchKernelSeqPool() {
using T = typename KernelTuple::data_type;
std::vector<jit::SeqPoolType> pool_types = { std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
for (auto type : pool_types) { for (auto type : pool_types) {
...@@ -294,15 +274,15 @@ void BenchSeqPoolKernel() { ...@@ -294,15 +274,15 @@ void BenchSeqPoolKernel() {
RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f); RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>(); const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType()); T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data, BenchAllImpls<KernelTuple, PlaceType>(attr, x_data, y_data, &attr);
y_data, &attr);
} }
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchEmbSeqPoolKernel() { void BenchKernelEmbSeqPool() {
using T = typename KernelTuple::data_type;
std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum}; std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
int64_t tbl_h = 1e4; int64_t tbl_h = 1e4;
for (int tbl_w : {10, 16, 256}) { for (int tbl_w : {10, 16, 256}) {
...@@ -324,16 +304,17 @@ void BenchEmbSeqPoolKernel() { ...@@ -324,16 +304,17 @@ void BenchEmbSeqPoolKernel() {
tbl_h - 1); tbl_h - 1);
const int64_t* idx_data = idx.data<int64_t>(); const int64_t* idx_data = idx.data<int64_t>();
T* o_data = out.mutable_data<T>(PlaceType()); T* o_data = out.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType>( BenchAllImpls<KernelTuple, PlaceType>(attr, table_data, idx_data,
attr, table_data, idx_data, o_data, &attr); o_data, &attr);
} }
} }
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchSgdKernel() { void BenchKernelSgd() {
using T = typename KernelTuple::data_type;
const T lr = 0.1; const T lr = 0.1;
auto UnDuplicatedRandomVec = [](int n, const int64_t lower, auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
const int64_t upper) -> std::vector<int64_t> { const int64_t upper) -> std::vector<int64_t> {
...@@ -364,15 +345,16 @@ void BenchSgdKernel() { ...@@ -364,15 +345,16 @@ void BenchSgdKernel() {
const T* grad_data = grad.data<T>(); const T* grad_data = grad.data<T>();
const int64_t* rows_data = rows.data(); const int64_t* rows_data = rows.data();
jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
BenchAllImpls<KT, jit::SgdTuples<T>, PlaceType>( BenchAllImpls<KernelTuple, PlaceType>(attr, &lr, param_data, grad_data,
attr, &lr, param_data, grad_data, rows_data, param_data, &attr); rows_data, param_data, &attr);
} }
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchMatMulKernel() { void BenchKernelMatMul() {
using T = typename KernelTuple::data_type;
for (int m : {1, 2, 3, 4}) { for (int m : {1, 2, 3, 4}) {
for (int n : TestSizes()) { for (int n : TestSizes()) {
for (int k : TestSizes()) { for (int k : TestSizes()) {
...@@ -386,15 +368,16 @@ void BenchMatMulKernel() { ...@@ -386,15 +368,16 @@ void BenchMatMulKernel() {
const T* b_data = b.data<T>(); const T* b_data = b.data<T>();
T* c_data = c.mutable_data<T>(PlaceType()); T* c_data = c.mutable_data<T>(PlaceType());
const jit::matmul_attr_t attr{m, n, k}; const jit::matmul_attr_t attr{m, n, k};
BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(attr, a_data, b_data, BenchAllImpls<KernelTuple, PlaceType>(attr, a_data, b_data, c_data,
c_data, &attr); &attr);
} }
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchSoftmaxKernel() { void BenchKernelSoftmax() {
using T = typename KernelTuple::data_type;
for (int bs : {1, 2, 10}) { for (int bs : {1, 2, 10}) {
for (int n : TestSizes()) { for (int n : TestSizes()) {
Tensor x, y; Tensor x, y;
...@@ -403,14 +386,14 @@ void BenchSoftmaxKernel() { ...@@ -403,14 +386,14 @@ void BenchSoftmaxKernel() {
RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f); RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>(); const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType()); T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n, BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs);
bs);
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchLayerNormKernel() { void BenchKernelLayerNorm() {
using T = typename KernelTuple::data_type;
const T epsilon = 9.99999975e-06; const T epsilon = 9.99999975e-06;
for (int n : {1, 2, 10}) { for (int n : {1, 2, 10}) {
for (int x_dim_0 : {1, 9, 17, 50}) { for (int x_dim_0 : {1, 9, 17, 50}) {
...@@ -439,16 +422,17 @@ void BenchLayerNormKernel() { ...@@ -439,16 +422,17 @@ void BenchLayerNormKernel() {
T* var_data = var.data<T>(); T* var_data = var.data<T>();
T* out_data = out.mutable_data<T>(PlaceType()); T* out_data = out.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::LayerNormTuples<T>, PlaceType>( BenchAllImpls<KernelTuple, PlaceType>(right, x_data, out_data,
right, x_data, out_data, mean_data, var_data, scale_data, bias_data, mean_data, var_data, scale_data,
left, epsilon, right); bias_data, left, epsilon, right);
} }
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchCRFDecodingKernel() { void BenchKernelCRFDecoding() {
using T = typename KernelTuple::data_type;
constexpr int state_trans_base_idx = 2; constexpr int state_trans_base_idx = 2;
for (int seq_len : {1, 11, 17, 50}) { for (int seq_len : {1, 11, 17, 50}) {
for (int tag_num : TestSizes()) { for (int tag_num : TestSizes()) {
...@@ -468,14 +452,15 @@ void BenchCRFDecodingKernel() { ...@@ -468,14 +452,15 @@ void BenchCRFDecodingKernel() {
T* alpha_data = alpha.mutable_data<T>(PlaceType()); T* alpha_data = alpha.mutable_data<T>(PlaceType());
int* track_data = track.mutable_data<int>(PlaceType()); int* track_data = track.mutable_data<int>(PlaceType());
BenchAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType>( BenchAllImpls<KernelTuple, PlaceType>(tag_num, seq_len, x_data, w_data,
tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num); alpha_data, track_data, tag_num);
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType> template <typename KernelTuple, typename PlaceType>
void BenchVBroadcastKernel() { void BenchKernelVBroadcast() {
using T = typename KernelTuple::data_type;
for (int64_t w : {1, 16, 64, 100, 256}) { for (int64_t w : {1, 16, 64, 100, 256}) {
Tensor x; Tensor x;
x.Resize({w}); x.Resize({w});
...@@ -485,78 +470,86 @@ void BenchVBroadcastKernel() { ...@@ -485,78 +470,86 @@ void BenchVBroadcastKernel() {
Tensor y; Tensor y;
y.Resize({h * w}); y.Resize({h * w});
T* y_data = y.mutable_data<T>(PlaceType()); T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>( BenchAllImpls<KernelTuple, PlaceType>(w, x_data, y_data,
w, x_data, y_data, static_cast<int64_t>(h), w); static_cast<int64_t>(h), w);
} }
} }
} }
using T = float; #define BenchKernelVMul BenchKernelXYZN
using CPUPlace = paddle::platform::CPUPlace; #define BenchKernelVAdd BenchKernelXYZN
#define BenchKernelVAddRelu BenchKernelXYZN
#define BenchKernelVSub BenchKernelXYZN
// xyzn #define BenchKernelVScal BenchKernelAXYN
BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); } #define BenchKernelVAddBias BenchKernelAXYN
BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }
// axyn #define BenchKernelVRelu BenchKernelXYN
BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); } #define BenchKernelVIdentity BenchKernelXYN
BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); } #define BenchKernelVSquare BenchKernelXYN
#define BenchKernelVExp BenchKernelXYN
#define BenchKernelVSigmoid BenchKernelXYN
#define BenchKernelVTanh BenchKernelXYN
#define BenchKernelVCopy BenchKernelXYN
// xrn #define BenchKernelHMax BenchKernelXRN
BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); } #define BenchKernelHSum BenchKernelXRN
BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }
// xyn #define BenchKernelLSTMCtHt BenchKernelLSTM
BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); } #define BenchKernelLSTMC1H1 BenchKernelLSTM
BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }
// lstm and peephole
BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }
// gru functions
BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
// seq pool function
BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
// embedding seq pool function
BENCH_FP32_CPU(kEmbSeqPool) {
BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
}
// sgd function #define BenchKernelGRUH1 BenchKernelGRU
BENCH_FP32_CPU(kSgd) { BenchSgdKernel<jit::kSgd, T, CPUPlace>(); } #define BenchKernelGRUHtPart1 BenchKernelGRU
#define BenchKernelGRUHtPart2 BenchKernelGRU
// matmul using CPUPlace = paddle::platform::CPUPlace;
BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
// softmax #define BENCH_FP32_CPU(name) \
BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); } BENCH_JITKERNEL(name, FP32, CPU) { \
BenchKernel##name<jit::name##Tuple<float>, CPUPlace>(); \
}
// layernorm // xyzn
BENCH_FP32_CPU(kLayerNorm) { BENCH_FP32_CPU(VMul);
BenchLayerNormKernel<jit::kLayerNorm, T, CPUPlace>(); BENCH_FP32_CPU(VAdd);
} BENCH_FP32_CPU(VAddRelu);
BENCH_FP32_CPU(VSub);
// crfdecoding // axyn
BENCH_FP32_CPU(kCRFDecoding) { BENCH_FP32_CPU(VScal);
BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>(); BENCH_FP32_CPU(VAddBias);
}
// vbroadcast function // xyn
BENCH_FP32_CPU(kVBroadcast) { BENCH_FP32_CPU(VRelu);
BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>(); BENCH_FP32_CPU(VIdentity);
} BENCH_FP32_CPU(VSquare);
BENCH_FP32_CPU(VExp);
BENCH_FP32_CPU(VSigmoid);
BENCH_FP32_CPU(VTanh);
BENCH_FP32_CPU(VCopy);
// xrn
BENCH_FP32_CPU(HMax);
BENCH_FP32_CPU(HSum);
// LSTM
BENCH_FP32_CPU(LSTMCtHt);
BENCH_FP32_CPU(LSTMC1H1);
// GRU
BENCH_FP32_CPU(GRUH1);
BENCH_FP32_CPU(GRUHtPart1);
BENCH_FP32_CPU(GRUHtPart2);
BENCH_FP32_CPU(LayerNorm);
BENCH_FP32_CPU(CRFDecoding);
BENCH_FP32_CPU(SeqPool);
BENCH_FP32_CPU(EmbSeqPool);
BENCH_FP32_CPU(MatMul);
BENCH_FP32_CPU(Softmax);
BENCH_FP32_CPU(Sgd);
BENCH_FP32_CPU(VBroadcast);
// Benchmark all jit kernels including jitcode, mkl and refer. // Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...] // To use this tool, run command: ./benchmark [options...]
......
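With the tuple types now carrying their own `kernel_type`, one generic macro replaces the per-kernel lambdas; for instance `BENCH_FP32_CPU(VMul)` expands (approximately) to:

// Approximate expansion of BENCH_FP32_CPU(VMul) after this change;
// BenchKernelVMul is #defined above as an alias of BenchKernelXYZN.
BENCH_JITKERNEL(VMul, FP32, CPU) {
  BenchKernelVMul<jit::VMulTuple<float>, CPUPlace>();
}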
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
* limitations under the License. */ * limitations under the License. */
#include "paddle/fluid/operators/jit/gen/act.h" #include "paddle/fluid/operators/jit/gen/act.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -81,7 +82,7 @@ void VActJitCode::genCode() { ...@@ -81,7 +82,7 @@ void VActJitCode::genCode() {
#define DECLARE_ACT_CREATOR(name) \ #define DECLARE_ACT_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \ class name##Creator : public JitCodeCreator<int> { \
public: \ public: \
bool UseMe(const int& attr) const override; \ bool CanBeUsed(const int& attr) const override; \
size_t CodeSize(const int& d) const override; \ size_t CodeSize(const int& d) const override; \
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \ std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \ return make_unique<name##JitCode>(attr, CodeSize(attr)); \
...@@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid); ...@@ -96,27 +97,27 @@ DECLARE_ACT_CREATOR(VSigmoid);
DECLARE_ACT_CREATOR(VTanh); DECLARE_ACT_CREATOR(VTanh);
// TODO(TJ): tuning use me // TODO(TJ): tuning use me
bool VReluCreator::UseMe(const int& d) const { bool VReluCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx); return platform::MayIUse(platform::avx);
} }
bool VSquareCreator::UseMe(const int& d) const { bool VSquareCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx); return platform::MayIUse(platform::avx);
} }
bool VIdentityCreator::UseMe(const int& d) const { bool VIdentityCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx); return platform::MayIUse(platform::avx);
} }
bool VExpCreator::UseMe(const int& d) const { bool VExpCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d < 32; return platform::MayIUse(platform::avx) && d < 32;
} }
bool VSigmoidCreator::UseMe(const int& d) const { bool VSigmoidCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx); return platform::MayIUse(platform::avx);
} }
bool VTanhCreator::UseMe(const int& d) const { bool VTanhCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx); return platform::MayIUse(platform::avx);
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
* limitations under the License. */ * limitations under the License. */
#include "paddle/fluid/operators/jit/gen/blas.h" #include "paddle/fluid/operators/jit/gen/blas.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() { ...@@ -142,7 +143,7 @@ void NCHW16CMulNCJitCode::genCode() {
class NCHW16CMulNCCreator : public JitCodeCreator<int> { class NCHW16CMulNCCreator : public JitCodeCreator<int> {
public: public:
bool UseMe(const int& attr) const override { bool CanBeUsed(const int& attr) const override {
return platform::MayIUse(platform::avx512f); return platform::MayIUse(platform::avx512f);
} }
size_t CodeSize(const int& d) const override { return 256 * 1024; } size_t CodeSize(const int& d) const override { return 256 * 1024; }
...@@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> { ...@@ -154,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
#define DECLARE_BLAS_CREATOR(name) \ #define DECLARE_BLAS_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \ class name##Creator : public JitCodeCreator<int> { \
public: \ public: \
bool UseMe(const int& attr) const override { \ bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx) && attr <= 1024; \ return platform::MayIUse(platform::avx) && attr <= 1024; \
} \ } \
size_t CodeSize(const int& d) const override { \ size_t CodeSize(const int& d) const override { \
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/embseqpool.h" #include "paddle/fluid/operators/jit/gen/embseqpool.h"
#include <stddef.h> // offsetof #include <stddef.h> // offsetof
#include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
...@@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() { ...@@ -121,7 +122,7 @@ void EmbSeqPoolJitCode::genCode() {
class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> { class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
public: public:
bool UseMe(const emb_seq_pool_attr_t& attr) const override { bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx) && return platform::MayIUse(platform::avx) &&
attr.table_width % YMM_FLOAT_BLOCK == 0; attr.table_width % YMM_FLOAT_BLOCK == 0;
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/gru.h" #include "paddle/fluid/operators/jit/gen/gru.h"
#include <stddef.h> // offsetof #include <stddef.h> // offsetof
#include <memory>
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -86,7 +87,7 @@ void GRUJitCode::genCode() { ...@@ -86,7 +87,7 @@ void GRUJitCode::genCode() {
class name##Creator : public JitCodeCreator<gru_attr_t> { \ class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \ public: \
/* TODO(TJ): enable more */ \ /* TODO(TJ): enable more */ \
bool UseMe(const gru_attr_t& attr) const override { \ bool CanBeUsed(const gru_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \ } \
size_t CodeSize(const gru_attr_t& attr) const override { \ size_t CodeSize(const gru_attr_t& attr) const override { \
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
* limitations under the License. */ * limitations under the License. */
#include "paddle/fluid/operators/jit/gen/hopv.h" #include "paddle/fluid/operators/jit/gen/hopv.h"
#include <memory>
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -76,7 +77,7 @@ void HOPVJitCode::genCode() { ...@@ -76,7 +77,7 @@ void HOPVJitCode::genCode() {
#define DECLARE_HOP_CREATOR(name) \ #define DECLARE_HOP_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \ class name##Creator : public JitCodeCreator<int> { \
public: \ public: \
bool UseMe(const int& attr) const override { \ bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx); \ return platform::MayIUse(platform::avx); \
} \ } \
size_t CodeSize(const int& d) const override { \ size_t CodeSize(const int& d) const override { \
......
...@@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { ...@@ -73,7 +73,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
virtual void genCode() = 0; virtual void genCode() = 0;
size_t getSize() const override { return CodeGenerator::getSize(); } size_t getSize() const override { return CodeGenerator::getSize(); }
const unsigned char* getCodeInternal() override { const unsigned char* getCodeInternal() const override {
const Xbyak::uint8* code = CodeGenerator::getCode(); const Xbyak::uint8* code = CodeGenerator::getCode();
return code; return code;
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/lstm.h" #include "paddle/fluid/operators/jit/gen/lstm.h"
#include <stddef.h> // offsetof #include <stddef.h> // offsetof
#include <memory>
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -114,7 +115,7 @@ void LSTMJitCode::genCode() { ...@@ -114,7 +115,7 @@ void LSTMJitCode::genCode() {
class name##Creator : public JitCodeCreator<lstm_attr_t> { \ class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \ public: \
/* TODO(TJ): enable more */ \ /* TODO(TJ): enable more */ \
bool UseMe(const lstm_attr_t& attr) const override { \ bool CanBeUsed(const lstm_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \ return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \ } \
size_t CodeSize(const lstm_attr_t& attr) const override { \ size_t CodeSize(const lstm_attr_t& attr) const override { \
......
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/jit/gen/matmul.h" #include "paddle/fluid/operators/jit/gen/matmul.h"
#include <stddef.h> // offsetof #include <stddef.h> // offsetof
#include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -98,7 +98,7 @@ void MatMulJitCode::genCode() { ...@@ -98,7 +98,7 @@ void MatMulJitCode::genCode() {
class MatMulCreator : public JitCodeCreator<matmul_attr_t> { class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
public: public:
bool UseMe(const matmul_attr_t& attr) const override { bool CanBeUsed(const matmul_attr_t& attr) const override {
return attr.m == 1 && platform::MayIUse(platform::avx512f) && return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
* limitations under the License. */ * limitations under the License. */
#include "paddle/fluid/operators/jit/gen/seqpool.h" #include "paddle/fluid/operators/jit/gen/seqpool.h"
#include <memory>
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones #include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() { ...@@ -57,7 +58,7 @@ void SeqPoolJitCode::genCode() {
class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> { class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
public: public:
bool UseMe(const seq_pool_attr_t& attr) const override { bool CanBeUsed(const seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx); return platform::MayIUse(platform::avx);
} }
size_t CodeSize(const seq_pool_attr_t& attr) const override { size_t CodeSize(const seq_pool_attr_t& attr) const override {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/jit/gen/sgd.h" #include "paddle/fluid/operators/jit/gen/sgd.h"
#include <stddef.h> // offsetof #include <stddef.h> // offsetof
#include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
...@@ -104,7 +105,7 @@ void SgdJitCode::genCode() { ...@@ -104,7 +105,7 @@ void SgdJitCode::genCode() {
class SgdCreator : public JitCodeCreator<sgd_attr_t> { class SgdCreator : public JitCodeCreator<sgd_attr_t> {
public: public:
bool UseMe(const sgd_attr_t& attr) const override { bool CanBeUsed(const sgd_attr_t& attr) const override {
return platform::MayIUse(platform::avx) && return platform::MayIUse(platform::avx) &&
attr.grad_width % YMM_FLOAT_BLOCK == 0; attr.grad_width % YMM_FLOAT_BLOCK == 0;
} }
......
...@@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() { ...@@ -69,7 +69,7 @@ void VBroadcastJitCode::genCode() {
class VBroadcastCreator : public JitCodeCreator<int64_t> { class VBroadcastCreator : public JitCodeCreator<int64_t> {
public: public:
bool UseMe(const int64_t& w) const override { bool CanBeUsed(const int64_t& w) const override {
return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0; return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
} }
size_t CodeSize(const int64_t& w) const override { size_t CodeSize(const int64_t& w) const override {
......
...@@ -31,7 +31,7 @@ namespace paddle { ...@@ -31,7 +31,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace jit { namespace jit {
// refer does not need useme; it would be the last one. // refer does not need CanBeUsed; it would be the last one.
void GenBase::dumpCode(const unsigned char* code) const { void GenBase::dumpCode(const unsigned char* code) const {
if (code) { if (code) {
static int counter = 0; static int counter = 0;
......
...@@ -31,9 +31,10 @@ class GenBase : public Kernel { ...@@ -31,9 +31,10 @@ class GenBase : public Kernel {
virtual ~GenBase() = default; virtual ~GenBase() = default;
virtual std::string name() const = 0; virtual std::string name() const = 0;
virtual size_t getSize() const = 0; virtual size_t getSize() const = 0;
virtual const unsigned char* getCodeInternal() = 0; virtual const unsigned char* getCodeInternal() const = 0;
const char* ImplType() const override { return "JitCode"; }
template <typename Func> template <typename Func>
Func getCode() { Func getCode() const {
const unsigned char* code = this->getCodeInternal(); const unsigned char* code = this->getCodeInternal();
if (FLAGS_dump_jitcode) { if (FLAGS_dump_jitcode) {
this->dumpCode(code); this->dumpCode(code);
...@@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator { ...@@ -65,7 +66,7 @@ class JitCodeCreator : public GenCreator {
virtual ~JitCodeCreator() = default; virtual ~JitCodeCreator() = default;
// condition when this jit code can be used. // condition when this jit code can be used.
virtual bool UseMe(const Attr& attr) const = 0; virtual bool CanBeUsed(const Attr& attr) const = 0;
// estimate this code size // estimate this code size
virtual size_t CodeSize(const Attr& attr) const = 0; virtual size_t CodeSize(const Attr& attr) const = 0;
......
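Making getCodeInternal()/getCode() const matters because the new helpers below hand out `const Kernel*`/`const GenBase*`: a const pointer can now still yield a callable. A small sketch, assuming an XYN-style signature:

using XYNFunc = void (*)(const float*, float*, int);  // XYN-style signature

// Legal after this change: getCode() is a const member, so it can be
// invoked through the const GenBase* returned by the candidate lookups.
XYNFunc ExtractFunc(const paddle::operators::jit::GenBase* jitcode) {
  return jitcode->getCode<XYNFunc>();
}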
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <unordered_map>
#include <utility> // for std::move
#include <vector> #include <vector>
#include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_base.h"
...@@ -27,35 +29,34 @@ namespace paddle { ...@@ -27,35 +29,34 @@ namespace paddle {
namespace operators { namespace operators {
namespace jit { namespace jit {
template <KernelType KT, typename KernelTuples, typename PlaceType> template <typename KernelTuple, typename PlaceType>
inline typename std::enable_if< inline typename std::enable_if<
std::is_same<typename KernelTuples::data_type, float>::value && std::is_same<typename KernelTuple::data_type, float>::value &&
std::is_same<PlaceType, platform::CPUPlace>::value, std::is_same<PlaceType, platform::CPUPlace>::value,
typename KernelTuples::func_type>::type const Kernel*>::type
GetJitCode(const typename KernelTuples::attr_type& attr) { GetJitCode(const typename KernelTuple::attr_type& attr) {
using Func = typename KernelTuples::func_type; using Attr = typename KernelTuple::attr_type;
using Attr = typename KernelTuples::attr_type; int64_t key = JitCodeKey<Attr>(attr);
size_t key = JitCodeKey<Attr>(attr); auto& codes = JitCodePool<KernelTuple::kernel_type>::Instance();
auto& codes = JitCodePool<KT>().Instance();
if (codes.Has(key)) { if (codes.Has(key)) {
return codes.AllKernels().at(key)->template getCode<Func>(); return codes.AllKernels().at(key).get();
} }
// creator is not related to attr, so KernelKey can be used as the key // creator is not related to attr, so KernelKey can be used as the key
KernelKey kkey(KT, PlaceType()); KernelKey kkey(KernelTuple::kernel_type, PlaceType());
// pool: (KernelKey(type, place), vector<GenCreatorPtr>) // pool: (KernelKey(type, place), vector<GenCreatorPtr>)
auto& creator_map = JitCodeCreatorPool().Instance().AllCreators(); auto& creator_map = JitCodeCreatorPool::Instance().AllCreators();
auto iter = creator_map.find(kkey); auto iter = creator_map.find(kkey);
if (iter != creator_map.end()) { if (iter != creator_map.end()) {
auto& creators = iter->second; auto& creators = iter->second;
for (auto& cur : creators) { for (auto& cur : creators) {
auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get()); auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
if (i && i->UseMe(attr)) { if (i && i->CanBeUsed(attr)) {
auto p = i->CreateJitCode(attr); auto p = i->CreateJitCode(attr);
if (p) { if (p) {
auto f = p->template getCode<Func>(); auto res = p.get();
codes.Insert(key, std::move(p)); codes.Insert(key, std::move(p));
return f; return res;
} }
} }
} }
...@@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) { ...@@ -63,87 +64,153 @@ GetJitCode(const typename KernelTuples::attr_type& attr) {
return nullptr; return nullptr;
} }
template <KernelType KT, typename KernelTuples, typename PlaceType> template <typename KernelTuple, typename PlaceType>
inline typename std::enable_if< inline typename std::enable_if<
!std::is_same<typename KernelTuples::data_type, float>::value || !std::is_same<typename KernelTuple::data_type, float>::value ||
!std::is_same<PlaceType, platform::CPUPlace>::value, !std::is_same<PlaceType, platform::CPUPlace>::value,
typename KernelTuples::func_type>::type const Kernel*>::type
GetJitCode(const typename KernelTuples::attr_type& attr) { GetJitCode(const typename KernelTuple::attr_type& attr) {
return nullptr; return nullptr;
} }
// Refer code is not related to attr; it is just for the cast // Refer code is not related to attr; it is just for the cast
// Refer is always on CPUPlace // Refer is always on CPUPlace
template <KernelType KT, typename KernelTuples> template <typename KernelTuple>
inline typename KernelTuples::func_type GetRefer() { inline const Kernel* GetReferKernel() {
auto& ref_pool = ReferKernelPool().Instance().AllKernels(); auto& ref_pool = ReferKernelPool::Instance().AllKernels();
KernelKey kkey(KT, platform::CPUPlace()); KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
auto ref_iter = ref_pool.find(kkey); auto ref_iter = ref_pool.find(kkey);
PADDLE_ENFORCE(ref_iter != ref_pool.end(), PADDLE_ENFORCE(ref_iter != ref_pool.end(),
"Every Kernel should have reference function."); "Every Kernel should have reference function.");
auto& ref_impls = ref_iter->second; auto& ref_impls = ref_iter->second;
for (auto& impl : ref_impls) { for (auto& impl : ref_impls) {
auto i = dynamic_cast<const ReferKernel<KernelTuples>*>(impl.get()); auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
if (i) { if (i) {
return i->GetFunc(); return i;
} }
} }
return nullptr; return nullptr;
} }
template <KernelType KT, typename KernelTuples, template <typename KernelTuple>
typename PlaceType = platform::CPUPlace> inline typename KernelTuple::func_type GetReferFunc() {
typename KernelTuples::func_type Get( auto ker = GetReferKernel<KernelTuple>();
const typename KernelTuples::attr_type& attr) { auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
auto jitfunc = GetJitCode<KT, KernelTuples, PlaceType>(attr); PADDLE_ENFORCE(p, "The Refer kernel should exist");
if (jitfunc) { return p->GetFunc();
return jitfunc; }
// Return all Kernels that can be used
template <typename KernelTuple, typename PlaceType>
std::vector<const Kernel*> GetAllCandidateKernels(
const typename KernelTuple::attr_type& attr) {
// the search order should be jitcode > more > refer
std::vector<const Kernel*> res;
auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
if (jitker) {
res.emplace_back(jitker);
} }
// pool: (KernelKey(type, place), vector<KernelPtr>) // more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
KernelKey kkey(KT, PlaceType()); KernelKey kkey(KernelTuple::kernel_type, PlaceType());
auto& pool = KernelPool().Instance().AllKernels(); auto& pool = KernelPool::Instance().AllKernels();
auto iter = pool.find(kkey); auto iter = pool.find(kkey);
if (iter != pool.end()) { if (iter != pool.end()) {
auto& impls = iter->second; auto& impls = iter->second;
for (auto& impl : impls) { for (auto& impl : impls) {
auto i = dynamic_cast<const KernelMore<KernelTuples>*>(impl.get()); auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
if (i && i->UseMe(attr)) { if (i && i->CanBeUsed(attr)) {
return i->GetFunc(); res.emplace_back(i);
} }
} }
} }
// The last implementation should be reference function on CPUPlace. // The last implementation should be reference function on CPUPlace.
return GetRefer<KT, KernelTuples>(); auto ref = GetReferKernel<KernelTuple>();
PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
res.emplace_back(ref);
return res;
}
template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
std::vector<std::pair<std::string, typename KernelTuple::func_type>>
GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
using Func = typename KernelTuple::func_type;
auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
std::vector<std::pair<std::string, Func>> res;
for (auto k : kers) {
std::string name = k->ImplType();
if (name == "JitCode") {
auto i = dynamic_cast<const GenBase*>(k);
PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
} else {
auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
PADDLE_ENFORCE(i, "kernel cast can not fail.");
res.emplace_back(std::make_pair(name, i->GetFunc()));
}
}
return res;
}
template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
const typename KernelTuple::attr_type& attr) {
auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
std::vector<typename KernelTuple::func_type> res;
for (auto& i : funcs) {
res.emplace_back(i.second);
}
return res;
}
template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
typename KernelTuple::func_type GetDefaultBestFunc(
const typename KernelTuple::attr_type& attr) {
auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
PADDLE_ENFORCE_GE(funcs.size(), 1UL);
// A runtime benchmark of this attr could be run here to pick the best one.
// For now, just take the first one as the default best,
// since the candidates are searched in order and tuned offline.
return funcs[0];
} }
template <KernelType KT, typename KernelTuples, typename PlaceType> template <typename KernelTuple, typename PlaceType>
class KernelFuncs { class KernelFuncs {
public: public:
KernelFuncs() = default; KernelFuncs() = default;
static KernelFuncs& Cache() { static KernelFuncs& Cache() {
static thread_local KernelFuncs<KT, KernelTuples, PlaceType> g_func_cache; static thread_local KernelFuncs<KernelTuple, PlaceType> g_func_cache;
return g_func_cache; return g_func_cache;
} }
bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } // the exposed interface to use
typename KernelTuple::func_type At(
void Insert(int key, typename KernelTuples::func_type func) { const typename KernelTuple::attr_type& attr) {
funcs_.emplace(key, func); // Maybe this is not good enough; not all kernels have jitcode
} int64_t key = JitCodeKey<typename KernelTuple::attr_type>(attr);
typename KernelTuples::func_type At(int key) {
if (Has(key)) { if (Has(key)) {
return funcs_.at(key); return funcs_.at(key);
} }
auto func = Get<KT, KernelTuples, PlaceType>(key); // If this attr is not in the cache, get the default best
auto func = GetDefaultBestFunc<KernelTuple, PlaceType>(attr);
Insert(key, func); Insert(key, func);
return func; return func;
} }
typename KernelTuple::func_type operator[](
const typename KernelTuple::attr_type& attr) {
return At(attr);
}
protected:
bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); }
void Insert(int64_t key, typename KernelTuple::func_type func) {
funcs_.emplace(key, func);
}
private: private:
std::unordered_map<int, typename KernelTuples::func_type> funcs_; std::unordered_map<int64_t, typename KernelTuple::func_type> funcs_;
DISABLE_COPY_AND_ASSIGN(KernelFuncs); DISABLE_COPY_AND_ASSIGN(KernelFuncs);
}; };
......
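Taken together, the reworked helper exposes three levels of lookup: GetAllCandidateKernels (raw `const Kernel*`s, jitcode first, refer last), GetAllCandidateFuncsWithTypes / GetAllCandidateFuncs (callable function pointers, with or without impl names), and GetDefaultBestFunc behind the thread-local KernelFuncs cache. A hedged usage sketch with the VAdd tuple declared below (the buffers x/y/z and size n are assumed set up by the caller):

namespace jit = paddle::operators::jit;
using Place = paddle::platform::CPUPlace;

void Example(const float* x, const float* y, float* z, int n) {
  // Common path: cached default best implementation for this attr.
  auto vadd = jit::KernelFuncs<jit::VAddTuple<float>, Place>::Cache().At(n);
  vadd(x, y, z, n);

  // Test/benchmark path: iterate every usable implementation by name.
  auto funcs =
      jit::GetAllCandidateFuncsWithTypes<jit::VAddTuple<float>, Place>(n);
  for (auto& impl : funcs) {
    // impl.first is "JitCode", a more-impl name (e.g. mkl), or "Refer".
    impl.second(x, y, z, n);
  }
}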
...@@ -62,26 +62,55 @@ typedef enum { ...@@ -62,26 +62,55 @@ typedef enum {
kSqrt, kSqrt,
} SeqPoolType; } SeqPoolType;
// x, y, z, n
template <typename T> template <typename T>
struct XYZNTuples { struct XYZNTuple {
typedef T data_type; typedef T data_type;
typedef int attr_type; typedef int attr_type;
typedef void (*func_type)(const T*, const T*, T*, int); typedef void (*func_type)(const T*, const T*, T*, int);
}; };
// a, x, y, n
template <typename T> template <typename T>
struct AXYNTuples : public XYZNTuples<T> {}; struct AXYNTuple : public XYZNTuple<T> {};
// x, y, n
template <typename T> template <typename T>
struct XYNTuples { struct XYNTuple {
typedef T data_type; typedef T data_type;
typedef int attr_type; typedef int attr_type;
typedef void (*func_type)(const T*, T*, int); typedef void (*func_type)(const T*, T*, int);
}; };
// x, return and int // x, returned value, n
template <typename T> template <typename T>
struct XRNTuples : public XYNTuples<T> {}; struct XRNTuple : public XYNTuple<T> {};
#define DECLARE_KERNELTUPLE(kernel_tuple, type) \
template <typename T> \
struct type##Tuple : public kernel_tuple<T> { \
static constexpr KernelType kernel_type = k##type; \
}
// Tuple should be corresponding to the KernelType
DECLARE_KERNELTUPLE(XYZNTuple, VMul);
DECLARE_KERNELTUPLE(XYZNTuple, VAdd);
DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu);
DECLARE_KERNELTUPLE(XYZNTuple, VSub);
DECLARE_KERNELTUPLE(AXYNTuple, VScal);
DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
DECLARE_KERNELTUPLE(XYNTuple, VRelu);
DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
DECLARE_KERNELTUPLE(XYNTuple, VSquare);
DECLARE_KERNELTUPLE(XYNTuple, VExp);
DECLARE_KERNELTUPLE(XYNTuple, VSigmoid);
DECLARE_KERNELTUPLE(XYNTuple, VTanh);
DECLARE_KERNELTUPLE(XYNTuple, VCopy);
DECLARE_KERNELTUPLE(XRNTuple, HMax);
DECLARE_KERNELTUPLE(XRNTuple, HSum);
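
Expanding DECLARE_KERNELTUPLE(XYZNTuple, VMul), for example, produces the struct below, which is what ties each tuple to its KernelType at compile time:

  template <typename T>
  struct VMulTuple : public XYZNTuple<T> {
    static constexpr KernelType kernel_type = kVMul;
  };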
typedef struct {
  void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
...@@ -122,21 +151,31 @@ typedef struct rnn_attr_s gru_attr_t;
typedef struct lstm_attr_s lstm_attr_t;

template <typename T>
struct LSTMTuple {
  typedef T data_type;
  typedef lstm_attr_t attr_type;
  typedef void (*func_type)(lstm_t*, const lstm_attr_t*);
};

template <typename T>
struct GRUTuple {
  typedef T data_type;
  typedef gru_attr_t attr_type;
  typedef void (*func_type)(gru_t*, const gru_attr_t*);
};

DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt);
DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1);

DECLARE_KERNELTUPLE(GRUTuple, GRUH1);
DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1);
DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2);

#undef DECLARE_KERNELTUPLE

template <typename T>
struct VBroadcastTuple {
  static constexpr KernelType kernel_type = kVBroadcast;
  typedef T data_type;
  typedef int64_t attr_type;
  typedef void (*func_type)(const T*, T*, int64_t, int64_t);
...@@ -151,7 +190,8 @@ typedef struct seq_pool_attr_s {
} seq_pool_attr_t;

template <typename T>
struct SeqPoolTuple {
  static constexpr KernelType kernel_type = kSeqPool;
  typedef T data_type;
  typedef seq_pool_attr_t attr_type;
  typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
...@@ -176,7 +216,8 @@ typedef struct emb_seq_pool_attr_s {
} emb_seq_pool_attr_t;

template <typename T>
struct EmbSeqPoolTuple {
  static constexpr KernelType kernel_type = kEmbSeqPool;
  typedef T data_type;
  typedef emb_seq_pool_attr_t attr_type;
  typedef void (*func_type)(const T*, const int64_t*, T*,
...@@ -198,7 +239,8 @@ typedef struct sgd_attr_s {
} sgd_attr_t;

template <typename T>
struct SgdTuple {
  static constexpr KernelType kernel_type = kSgd;
  typedef T data_type;
  typedef sgd_attr_t attr_type;
  typedef void (*func_type)(const T*, const T*, const T*, const int64_t*, T*,
...@@ -214,21 +256,24 @@ typedef struct matmul_attr_s {
} matmul_attr_t;

template <typename T>
struct MatMulTuple {
  static constexpr KernelType kernel_type = kMatMul;
  typedef T data_type;
  typedef matmul_attr_t attr_type;
  typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*);
};

template <typename T>
struct CRFDecodingTuple {
  static constexpr KernelType kernel_type = kCRFDecoding;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const int, const T*, const T*, T*, int*, int);
};

template <typename T>
struct LayerNormTuple {
  static constexpr KernelType kernel_type = kLayerNorm;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(T*, T*, T*, T*, const T*, const T*, int,
...@@ -236,7 +281,8 @@ struct LayerNormTuples {
};

template <typename T>
struct SoftmaxTuple {
  static constexpr KernelType kernel_type = kSoftmax;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, T*, int, int);
...@@ -244,7 +290,8 @@ struct SoftmaxTuples {
// nChw16c = nChw16c .* NC
template <typename T>
struct NCHW16CMulNCTuple {
  static constexpr KernelType kernel_type = kNCHW16CMulNC;
  typedef T data_type;
  typedef int attr_type;
  typedef void (*func_type)(const T*, const T*, T*, int, int);
...@@ -255,28 +302,29 @@ class Kernel {
 public:
  Kernel() = default;
  virtual ~Kernel() = default;
  virtual const char* ImplType() const = 0;
  DISABLE_COPY_AND_ASSIGN(Kernel);
};

template <typename KernelTuple>
class KernelMore : public Kernel {
 public:
  using T = typename KernelTuple::data_type;
  using Func = typename KernelTuple::func_type;
  using Attr = typename KernelTuple::attr_type;
  virtual Func GetFunc() const { return func; }
  // Whether this kernel can be used for the given attr; if it reports true,
  // using it must not fail.
  virtual bool CanBeUsed(const Attr& attr) const = 0;

 protected:
  Func func{nullptr};
};

template <typename KernelTuple>
class ReferKernel : public KernelMore<KernelTuple> {
 public:
  // Refer code can always be used
  bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override {
    return true;
  }
  const char* ImplType() const override { return "Refer"; }
......
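
As a sketch of what an implementation looks like under the renamed interface (a hypothetical kernel, not part of this commit):

  // Hypothetical example: a float VAdd variant plugged into KernelMore.
  static void MyVAddImpl(const float* x, const float* y, float* z, int n) {
    for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
  }

  class MyVAddKernel : public KernelMore<VAddTuple<float>> {
   public:
    MyVAddKernel() { this->func = MyVAddImpl; }
    // Report usable only above a size threshold; the framework prefers this
    // implementation over Refer whenever CanBeUsed() is true.
    bool CanBeUsed(const int& d) const override { return d > 16; }
    const char* ImplType() const override { return "MyImpl"; }
  };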
...@@ -13,6 +13,7 @@
 * limitations under the License. */

#include "paddle/fluid/operators/jit/kernel_key.h"
#include <xxhash.h>  // XXH64: 13.8 GB/s
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
...@@ -20,71 +21,46 @@ namespace operators {
namespace jit {

template <>
int64_t JitCodeKey<int>(const int& d) {
  return d;
}

template <>
int64_t JitCodeKey<int64_t>(const int64_t& d) {
  return d;
}
// The previous hand-rolled key packing (act_type_convert plus bit shifts,
// with its refine/benchmark TODO) is removed; struct attributes are now
// hashed with XXH64 instead.

template <>
int64_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
  return XXH64(&attr, sizeof(gru_attr_t), 0);
}

template <>
int64_t JitCodeKey<lstm_attr_t>(const lstm_attr_t& attr) {
  int keys[5] = {
      attr.d, static_cast<int>(attr.act_gate), static_cast<int>(attr.act_cand),
      static_cast<int>(attr.act_cell), static_cast<int>(attr.use_peephole)};
  return XXH64(keys, sizeof(int) * 5, 0);
}

template <>
int64_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
  int keys[2] = {attr.w, static_cast<int>(attr.type)};
  return XXH64(keys, sizeof(int) * 2, 0);
}

template <>
int64_t JitCodeKey<matmul_attr_t>(const matmul_attr_t& attr) {
  return XXH64(&attr, sizeof(int) * 3, 0);  // m, n, k
}

template <>
int64_t JitCodeKey<emb_seq_pool_attr_t>(const emb_seq_pool_attr_t& attr) {
  return attr.table_width;
}

template <>
int64_t JitCodeKey<sgd_attr_t>(const sgd_attr_t& attr) {
  return attr.grad_width;
}
......
...@@ -46,7 +46,7 @@ struct KernelKey {

// Every JitCode should have a method to get the key from its attributes.
template <typename Attr>
int64_t JitCodeKey(const Attr& attr);

}  // namespace jit
}  // namespace operators
......
...@@ -17,6 +17,7 @@
#include <memory>  // for unique_ptr
#include <string>
#include <unordered_map>
#include <utility>  // for move
#include <vector>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/operators/jit/kernel_base.h"
...@@ -30,7 +31,7 @@ namespace jit {

template <KernelType KT>
class JitCodePool {
  typedef std::unique_ptr<GenBase> GenBasePtr;
  typedef std::unordered_map<int64_t, GenBasePtr> JitCodeMap;

 public:
  JitCodePool() = default;
...@@ -41,9 +42,9 @@ class JitCodePool {
  const JitCodeMap& AllKernels() { return codes_; }

  bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); }

  void Insert(int64_t key, GenBasePtr value) {
    codes_.emplace(key, std::move(value));
  }
......
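
A short sketch of how a generated kernel would be cached under the new int64_t keys (hedged: assumes the pool's thread-local Instance() accessor and a GenBase-derived `code` object built elsewhere):

  auto& pool = JitCodePool<kVMul>::Instance();
  int64_t key = JitCodeKey<int>(8);  // VMul's attr is its vector length
  if (!pool.Has(key)) {
    pool.Insert(key, std::move(code));  // code: std::unique_ptr<GenBase>
  }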
...@@ -161,7 +161,7 @@ void CRFDecoding(const int seq_len, const float* x, const float* w,
  }
}

bool CRFDecodingKernel::CanBeUsed(const int& d) const {
#ifdef __AVX512F__
  constexpr int block = ZMM_FLOAT_BLOCK;
#else
......
...@@ -26,11 +26,11 @@ namespace intrinsic {
void CRFDecoding(const int seq_len, const float* x, const float* w,
                 float* alpha, int* track, int tag_num);

class CRFDecodingKernel : public KernelMore<CRFDecodingTuple<float>> {
 public:
  CRFDecodingKernel() { this->func = CRFDecoding; }
  bool CanBeUsed(
      const typename CRFDecodingTuple<float>::attr_type&) const override;
  const char* ImplType() const override { return "Intrinsic"; }
};
......
...@@ -153,7 +153,7 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
  }
}

bool LayerNormKernel::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
}
......
...@@ -27,10 +27,11 @@ void LayerNorm(float* x, float* out, float* mean, float* var,
                const float* scale, const float* bias, int height,
                const float epsilon, int right);

class LayerNormKernel : public KernelMore<LayerNormTuple<float>> {
 public:
  LayerNormKernel() { this->func = LayerNorm; }
  bool CanBeUsed(
      const typename LayerNormTuple<float>::attr_type&) const override;
  const char* ImplType() const override { return "Intrinsic"; }
};
......
...@@ -23,6 +23,8 @@ namespace jit {
namespace more {
namespace mix {

using CPUPlace = platform::CPUPlace;

void VSigmoid(const T* x, T* y, int n) {
  const float min = SIGMOID_THRESHOLD_MIN;
  const float max = SIGMOID_THRESHOLD_MAX;
...@@ -30,7 +32,7 @@ void VSigmoid(const T* x, T* y, int n) {
    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
    y[i] = static_cast<T>(0) - y[i];
  }
  auto compute = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
  compute(y, y, n);
  for (int i = 0; i < n; ++i) {
    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
...@@ -39,9 +41,9 @@ void VSigmoid(const T* x, T* y, int n) {

void VTanh(const T* x, T* y, int n) {
  const T a = 2, b = -1;
  auto compute_scal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_addbias = KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_sigmoid = KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(n);
  compute_scal(&a, x, y, n);
  compute_sigmoid(y, y, n);
  compute_scal(&a, y, y, n);
...@@ -49,16 +51,12 @@ void VTanh(const T* x, T* y, int n) {
}

void Softmax(const T* x, T* y, int n, int bs) {
  auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_vaddbias =
      KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_vexp = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);

  for (int i = 0; i < bs; ++i) {
    T scalar;
...@@ -76,13 +74,13 @@ void Softmax(const T* x, T* y, int n, int bs) {

void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
  if (type == kVSigmoid) {
    return KernelFuncs<VSigmoidTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVRelu) {
    return KernelFuncs<VReluTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVTanh) {
    return KernelFuncs<VTanhTuple<T>, CPUPlace>::Cache().At(d);
  } else if (type == kVIdentity) {
    return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
  }
  PADDLE_THROW("Not support type: %s", type);
  return nullptr;
...@@ -98,9 +96,9 @@ void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) {
  const int d = attr->d;
  const int d2 = d * 2;
  const int d3 = d * 3;
  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
  auto vadd_d2 = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d2);
  auto act_gate_d = getActFunc(attr->act_gate, d);
  auto act_gate_d2 = getActFunc(attr->act_gate, d2);
  auto act_gate_d3 = getActFunc(attr->act_gate, d3);
...@@ -140,8 +138,8 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) {
  int d = attr->d;
  int d2 = d * 2;
  int d3 = d * 3;
  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
  auto vadd_d = KernelFuncs<VAddTuple<T>, CPUPlace>::Cache().At(d);
  auto act_gate_d = getActFunc(attr->act_gate, d);
  auto act_cand_d = getActFunc(attr->act_cand, d);
  auto act_cell_d = getActFunc(attr->act_cell, d);
...@@ -169,7 +167,7 @@ void GRUH1(gru_t* step, const gru_attr_t* attr) {
  int d2 = d * 2;
  auto act_gate = getActFunc(attr->act_gate, d);
  auto act_cand = getActFunc(attr->act_cand, d);
  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(d);
  act_gate(gates, gates, d);
  act_cand(gates + d2, gates + d2, d);
  vmul_d(gates, gates + d2, ht, d);
...@@ -182,7 +180,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) {
  T* ht = reinterpret_cast<T*>(step->ht);
  const T* ht_1 = reinterpret_cast<const T*>(step->ht_1);
  auto act_gate = getActFunc(attr->act_gate, attr->d);
  auto vmul_d = KernelFuncs<VMulTuple<T>, CPUPlace>::Cache().At(attr->d);
  act_gate(gates + attr->d, gates + attr->d, attr->d);
  vmul_d(ht_1, gates + attr->d, ht, attr->d);
}
...@@ -206,21 +204,21 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) {
}

// TODO(TJ): tuning me
bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; }

bool VTanhKernel::CanBeUsed(const int& d) const { return true; }

bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; }

bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }

bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; }

bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }

bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }

bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; }

}  // namespace mix
}  // namespace more
...@@ -230,16 +228,16 @@ bool GRUHtPart2Kernel::UseMe(const gru_attr_t& attr) const { return true; }
namespace mix = paddle::operators::jit::more::mix;

#define REGISTER_MORE_KERNEL(func) \
  REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel)

REGISTER_MORE_KERNEL(VSigmoid);
REGISTER_MORE_KERNEL(VTanh);
REGISTER_MORE_KERNEL(Softmax);
REGISTER_MORE_KERNEL(LSTMCtHt);
REGISTER_MORE_KERNEL(LSTMC1H1);
REGISTER_MORE_KERNEL(GRUH1);
REGISTER_MORE_KERNEL(GRUHtPart1);
REGISTER_MORE_KERNEL(GRUHtPart2);

#undef REGISTER_MORE_KERNEL
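
With the key now derived from the kernel name, REGISTER_MORE_KERNEL(VSigmoid) expands to the single line below, so the KernelType tag and the class name can no longer drift apart:

  REGISTER_JITKERNEL_MORE(kVSigmoid, mix, mix::VSigmoidKernel);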
...@@ -34,27 +34,27 @@ void GRUH1(gru_t* step, const gru_attr_t* attr);
void GRUHtPart1(gru_t* step, const gru_attr_t* attr);
void GRUHtPart2(gru_t* step, const gru_attr_t* attr);

#define DECLARE_MORE_KERNEL(name)                                             \
  class name##Kernel : public KernelMore<name##Tuple<T>> {                    \
   public:                                                                    \
    name##Kernel() { this->func = name; }                                     \
    bool CanBeUsed(const typename name##Tuple<T>::attr_type&) const override; \
    const char* ImplType() const override { return "Mixed"; }                 \
  }

// XYN
DECLARE_MORE_KERNEL(VSigmoid);
DECLARE_MORE_KERNEL(VTanh);

// XRN
DECLARE_MORE_KERNEL(Softmax);

DECLARE_MORE_KERNEL(LSTMCtHt);
DECLARE_MORE_KERNEL(LSTMC1H1);

DECLARE_MORE_KERNEL(GRUH1);
DECLARE_MORE_KERNEL(GRUHtPart1);
DECLARE_MORE_KERNEL(GRUHtPart2);

#undef DECLARE_MORE_KERNEL
......
...@@ -130,104 +130,105 @@ void ASum<double>(const double* x, double* res, int n) {

// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <>
bool VMulKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx512f) && d > 512;
}

template <>
bool VAddKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx) && d > 512;
}

template <>
bool VScalKernel<float>::CanBeUsed(const int& d) const {
  return platform::MayIUse(platform::avx512f) && d > 512;
}

template <>
bool VExpKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
}

template <>
bool VSquareKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
}

template <>
bool VCopyKernel<float>::CanBeUsed(const int& d) const {
  return d > 15;
}

template <>
bool VBroadcastKernel<float>::CanBeUsed(const int64_t& d) const {
  return d > 127;
}

template <>
bool VBroadcastKernel<double>::CanBeUsed(const int64_t& attr) const {
  return true;
}

template <>
bool VSigmoidKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
}

template <>
bool VTanhKernel<float>::CanBeUsed(const int& d) const {
  return d > 7;
}

template <>
bool SeqPoolKernel<float>::CanBeUsed(const seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool SeqPoolKernel<double>::CanBeUsed(const seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool EmbSeqPoolKernel<float>::CanBeUsed(const emb_seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool EmbSeqPoolKernel<double>::CanBeUsed(
    const emb_seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool SgdKernel<float>::CanBeUsed(const sgd_attr_t& attr) const {
  return true;
}

template <>
bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
  return true;
}

template <>
bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
  return platform::MayIUse(platform::avx);
}

template <>
bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
  return true;
}

template <>
bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
  // tuned on avx2
  return platform::MayIUse(platform::avx) && d < 60;
}

#define AWALYS_USE_ME_WITH_DOUBLE(func)                      \
  template <>                                                \
  bool func##Kernel<double>::CanBeUsed(const int& d) const { \
    return true;                                             \
  }
...@@ -250,23 +251,23 @@ AWALYS_USE_ME_WITH_DOUBLE(Softmax);
namespace mkl = paddle::operators::jit::more::mkl;

#define REGISTER_MKL_KERNEL(func)                                 \
  REGISTER_JITKERNEL_MORE(k##func, mkl, mkl::func##Kernel<float>, \
                          mkl::func##Kernel<double>)

REGISTER_MKL_KERNEL(MatMul);
REGISTER_MKL_KERNEL(VMul);
REGISTER_MKL_KERNEL(VAdd);
REGISTER_MKL_KERNEL(VScal);
REGISTER_MKL_KERNEL(VExp);
REGISTER_MKL_KERNEL(VSquare);
REGISTER_MKL_KERNEL(VCopy);
REGISTER_MKL_KERNEL(VBroadcast);
REGISTER_MKL_KERNEL(VSigmoid);
REGISTER_MKL_KERNEL(VTanh);
REGISTER_MKL_KERNEL(SeqPool);
REGISTER_MKL_KERNEL(EmbSeqPool);
REGISTER_MKL_KERNEL(Softmax);
REGISTER_MKL_KERNEL(Sgd);

#undef REGISTER_MKL_KERNEL
...@@ -175,41 +175,38 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
  }
}

#define DECLARE_MKL_KERNEL(name)                                              \
  template <typename T>                                                       \
  class name##Kernel : public KernelMore<name##Tuple<T>> {                    \
   public:                                                                    \
    name##Kernel() { this->func = name<T>; }                                  \
    bool CanBeUsed(const typename name##Tuple<T>::attr_type&) const override; \
    const char* ImplType() const override { return "MKL"; }                   \
  }

// ABCMNK
DECLARE_MKL_KERNEL(MatMul);

// XYZN
DECLARE_MKL_KERNEL(VMul);
DECLARE_MKL_KERNEL(VAdd);

// AXYN
DECLARE_MKL_KERNEL(VScal);

// XYN
DECLARE_MKL_KERNEL(VExp);
DECLARE_MKL_KERNEL(VSigmoid);
DECLARE_MKL_KERNEL(VTanh);
DECLARE_MKL_KERNEL(VSquare);
DECLARE_MKL_KERNEL(VCopy);

// others
DECLARE_MKL_KERNEL(SeqPool);
DECLARE_MKL_KERNEL(EmbSeqPool);
DECLARE_MKL_KERNEL(Softmax);
DECLARE_MKL_KERNEL(Sgd);
DECLARE_MKL_KERNEL(VBroadcast);

#undef DECLARE_MKL_KERNEL
......
...@@ -17,51 +17,43 @@
namespace refer = paddle::operators::jit::refer;

#define REGISTER_REFER_KERNEL(func)                             \
  REGISTER_JITKERNEL_REFER(k##func, refer::func##Kernel<float>, \
                           refer::func##Kernel<double>)

REGISTER_REFER_KERNEL(VMul);
REGISTER_REFER_KERNEL(VAdd);
REGISTER_REFER_KERNEL(VAddRelu);
REGISTER_REFER_KERNEL(VSub);

REGISTER_REFER_KERNEL(VScal);
REGISTER_REFER_KERNEL(VAddBias);

REGISTER_REFER_KERNEL(VRelu);
REGISTER_REFER_KERNEL(VCopy);
REGISTER_REFER_KERNEL(VIdentity);
REGISTER_REFER_KERNEL(VSquare);
REGISTER_REFER_KERNEL(VExp);
REGISTER_REFER_KERNEL(VSigmoid);
REGISTER_REFER_KERNEL(VTanh);

REGISTER_REFER_KERNEL(LSTMCtHt);
REGISTER_REFER_KERNEL(LSTMC1H1);

REGISTER_REFER_KERNEL(GRUH1);
REGISTER_REFER_KERNEL(GRUHtPart1);
REGISTER_REFER_KERNEL(GRUHtPart2);

REGISTER_REFER_KERNEL(CRFDecoding);
REGISTER_REFER_KERNEL(LayerNorm);
REGISTER_REFER_KERNEL(NCHW16CMulNC);
REGISTER_REFER_KERNEL(SeqPool);
REGISTER_REFER_KERNEL(MatMul);
REGISTER_REFER_KERNEL(HMax);
REGISTER_REFER_KERNEL(HSum);
REGISTER_REFER_KERNEL(Softmax);
REGISTER_REFER_KERNEL(EmbSeqPool);
REGISTER_REFER_KERNEL(Sgd);
REGISTER_REFER_KERNEL(VBroadcast);

#undef REGISTER_REFER_KERNEL
...@@ -490,60 +490,54 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
  }
}

#define DECLARE_REFER_KERNEL(name)                          \
  template <typename T>                                     \
  class name##Kernel : public ReferKernel<name##Tuple<T>> { \
   public:                                                  \
    name##Kernel() { this->func = name<T>; }                \
  }

// const T* x, const T* y, T* z, int n
DECLARE_REFER_KERNEL(VMul);
DECLARE_REFER_KERNEL(VAdd);
DECLARE_REFER_KERNEL(VAddRelu);
DECLARE_REFER_KERNEL(VSub);

// const T* a, const T* x, T* y, int n
DECLARE_REFER_KERNEL(VScal);
DECLARE_REFER_KERNEL(VAddBias);

// const T* x, T* y, int n
DECLARE_REFER_KERNEL(VRelu);
DECLARE_REFER_KERNEL(VIdentity);
DECLARE_REFER_KERNEL(VExp);
DECLARE_REFER_KERNEL(VSigmoid);
DECLARE_REFER_KERNEL(VTanh);
DECLARE_REFER_KERNEL(VSquare);
DECLARE_REFER_KERNEL(VCopy);

// lstm_t*, const lstm_attr_t*
DECLARE_REFER_KERNEL(LSTMCtHt);
DECLARE_REFER_KERNEL(LSTMC1H1);

// gru_t*, const gru_attr_t*
DECLARE_REFER_KERNEL(GRUH1);
DECLARE_REFER_KERNEL(GRUHtPart1);
DECLARE_REFER_KERNEL(GRUHtPart2);

DECLARE_REFER_KERNEL(HMax);
DECLARE_REFER_KERNEL(HSum);

// others
DECLARE_REFER_KERNEL(CRFDecoding);
DECLARE_REFER_KERNEL(LayerNorm);
DECLARE_REFER_KERNEL(NCHW16CMulNC);
DECLARE_REFER_KERNEL(SeqPool);
DECLARE_REFER_KERNEL(MatMul);
DECLARE_REFER_KERNEL(Softmax);
DECLARE_REFER_KERNEL(EmbSeqPool);
DECLARE_REFER_KERNEL(Sgd);
DECLARE_REFER_KERNEL(VBroadcast);

#undef DECLARE_REFER_KERNEL
......
...@@ -17,6 +17,7 @@
#include <memory>
#include <tuple>
#include <type_traits>
#include <utility>  // for std::move
#include "paddle/fluid/operators/jit/kernel_base.h"
#include "paddle/fluid/operators/jit/kernel_pool.h"
#include "paddle/fluid/platform/place.h"
...@@ -49,7 +50,7 @@ struct JitKernelRegistrarFunctor<Pool, PlaceType, false, I, KernelImpls...> {
  void operator()(KernelType kt) const {
    KernelKey kkey(kt, PlaceType());
    Pool::Instance().Insert(kkey,
                            std::move(make_unique<const KERNEL_IMPL_TYPE>()));
    constexpr auto size = std::tuple_size<std::tuple<KernelImpls...>>::value;
    JitKernelRegistrarFunctor<Pool, PlaceType, I + 1 == size, I + 1,
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include <algorithm>
#include <iostream>
#include <random>
#include <string>
#include <vector>
...@@ -64,17 +65,47 @@ std::vector<int> TestSizes() {
namespace jit = paddle::operators::jit;
using CPUPlace = paddle::platform::CPUPlace;
template <typename KernelTuple, typename PlaceType, typename Tester,
          typename... Args>
void TestAllImpls(const typename KernelTuple::attr_type& attr,
                  const Tester& verifier, const Args&... args) {
  auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
  for (auto f : funcs) {
    VLOG(10) << "Test Kernel " << f.first;
    verifier(f.second, args...);
  }
}
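
TestAllImpls runs one verifier over every candidate implementation, each paired with its ImplType name. A hedged sketch of a call site, assuming a size d and vectors x and yref already filled against the refer result:

  auto verifier = [](typename jit::VReluTuple<float>::func_type tgt,
                     const std::vector<float>& x,
                     const std::vector<float>& yref) {
    EXPECT_TRUE(tgt != nullptr);
    std::vector<float> y(x.size());
    tgt(x.data(), y.data(), x.size());
    ExpectEQ<float>(y.data(), yref.data(), yref.size());
  };
  TestAllImpls<jit::VReluTuple<float>, CPUPlace>(d, verifier, x, yref);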
template <typename KernelTuple, typename PlaceType>
void TestKernelXYZN() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  for (int d : TestSizes()) {
    auto ref = jit::GetReferFunc<KernelTuple>();
    EXPECT_TRUE(ref != nullptr);

    std::vector<T> x(d), y(d), zref(d);
    RandomVec<T>(d, x.data());
    RandomVec<T>(d, y.data());

    std::vector<T> xinp(d), yinp(d);  // inplace test
    std::copy(x.begin(), x.end(), xinp.begin());
    std::copy(y.begin(), y.end(), yinp.begin());

    const T* x_data = x.data();
    const T* y_data = y.data();
    T* zref_data = zref.data();
    T* xinp_data = xinp.data();
    T* yinp_data = yinp.data();

    // test refer code inplace
    ref(x_data, y_data, zref_data, d);
    ref(x_data, yinp_data, yinp_data, d);
    ref(xinp_data, y_data, xinp_data, d);
    ExpectEQ<T>(xinp_data, zref_data, d);
    ExpectEQ<T>(yinp_data, zref_data, d);

    auto verifier = [](const typename KernelTuple::func_type tgt,
                       const std::vector<T>& x, const std::vector<T>& y,
                       const std::vector<T>& zref) {
      EXPECT_TRUE(tgt != nullptr);
...@@ -98,13 +129,35 @@ struct TestFuncWithRefer<jit::XYZNTuples<T>, std::vector<T>, std::vector<T>,
      std::copy(y.begin(), y.end(), ztgt.begin());
      tgt(x_data, ztgt_data, ztgt_data, d);
      ExpectEQ<T>(ztgt_data, zref_data, d);
    };
    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, y, zref);
  }
}
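
Each TestKernel* template is then instantiated once per kernel tuple and place; a hedged sketch of the gtest wiring (the actual TEST macros live later in the file, outside this hunk):

  TEST(JITKernel, kVAdd) {
    TestKernelXYZN<jit::VAddTuple<float>, CPUPlace>();
    TestKernelXYZN<jit::VAddTuple<double>, CPUPlace>();
  }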
template <typename KernelTuple, typename PlaceType>
void TestKernelAXYN() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  for (int d : TestSizes()) {
    auto ref = jit::GetReferFunc<KernelTuple>();
    EXPECT_TRUE(ref != nullptr);

    const T a = static_cast<T>(3);
    std::vector<T> x(d), yref(d);
    std::vector<T> xinp(d);  // inplace test
    RandomVec<T>(d, x.data());
    std::copy(x.begin(), x.end(), xinp.begin());

    const T* x_data = x.data();
    T* yref_data = yref.data();
    T* xinp_data = xinp.data();
    // test refer code inplace
    ref(&a, x_data, yref_data, d);
    ref(&a, xinp_data, xinp_data, d);
    ExpectEQ<T>(xinp_data, yref_data, d);

    auto verifier = [](const typename KernelTuple::func_type tgt, const T a,
                       const std::vector<T>& x, const std::vector<T>& yref) {
      EXPECT_TRUE(tgt != nullptr);
      EXPECT_EQ(yref.size(), x.size());
...@@ -120,66 +173,32 @@ struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>,
      std::copy(x.begin(), x.end(), ytgt.begin());
      tgt(&a, ytgt_data, ytgt_data, d);
      ExpectEQ<T>(ytgt_data, yref_data, d);
    };
    TestAllImpls<KernelTuple, PlaceType>(d, verifier, a, x, yref);
  }
}
template <typename KernelTuple, typename PlaceType>
void TestKernelXYN() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  for (int d : TestSizes()) {
    auto ref = jit::GetReferFunc<KernelTuple>();
    EXPECT_TRUE(ref != nullptr);

    std::vector<T> x(d), yref(d);
    std::vector<T> xinp(d);  // inplace test
    RandomVec<T>(d, x.data());
    std::copy(x.begin(), x.end(), xinp.begin());

    const T* x_data = x.data();
    T* yref_data = yref.data();
    T* xinp_data = xinp.data();
    // test refer code inplace
    ref(x_data, yref_data, d);
    ref(xinp_data, xinp_data, d);
    ExpectEQ<T>(xinp_data, yref_data, d);

    auto verifier = [](const typename KernelTuple::func_type tgt,
                       const std::vector<T>& x, const std::vector<T>& yref) {
      EXPECT_TRUE(tgt != nullptr);
      EXPECT_EQ(yref.size(), x.size());
...@@ -195,18 +214,86 @@ struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
      std::copy(x.begin(), x.end(), ytgt.begin());
      tgt(ytgt_data, ytgt_data, d);
      ExpectEQ<T>(ytgt_data, yref_data, d);
    };
    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, yref);
  }
}
template <typename KernelTuple, typename PlaceType>
void TestKernelXRN() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  auto last_acc = FLAGS_acc;
  FLAGS_acc = 1e-4;
  for (int d : TestSizes()) {
    auto ref = jit::GetReferFunc<KernelTuple>();
    EXPECT_TRUE(ref != nullptr);
    std::vector<T> x(d);
    RandomVec<T>(d, x.data());
    T ref_res;
    ref(x.data(), &ref_res, d);

    auto verifier = [](const typename KernelTuple::func_type tgt,
                       const std::vector<T>& x, const T ref_res) {
      EXPECT_TRUE(tgt != nullptr);
      T tgt_res;
      tgt(x.data(), &tgt_res, x.size());
      ExpectEQ<T>(&tgt_res, &ref_res, 1);
    };
    TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, ref_res);
  }
  FLAGS_acc = last_acc;
}
template <typename KernelTuple, typename PlaceType>
void TestKernelLSTM() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  for (int d : test_sizes) {
    for (bool use_peephole : {true, false}) {
      for (auto& act_gate : all_acts) {
        for (auto& act_cand : all_acts) {
          for (auto& act_cell : all_acts) {
            const jit::lstm_attr_t attr(
                d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand),
                jit::to_kerneltype(act_cell), use_peephole);
            auto ref = jit::GetReferFunc<KernelTuple>();
            EXPECT_TRUE(ref != nullptr);
            std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
            std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
            RandomVec<T>(4 * d, xsrc.data());
            RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
            RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
            // x could be changed after compute, so copy to save src
            std::vector<T> x(xsrc.size());
            std::copy(xsrc.begin(), xsrc.end(), x.begin());
            const T* ct_1_data = ct_1.data();
            const T* wp_data = wp.data();
            T* x_data = x.data();
            T* checked_data = checked.data();
            T* ct_ref_data = ct_ref.data();
            T* ht_ref_data = ht_ref.data();
            jit::lstm_t step;
            step.gates = x_data;
            step.ct_1 = ct_1_data;
            step.ct = ct_ref_data;
            step.ht = ht_ref_data;
            if (use_peephole) {
              step.wp = wp_data;
              step.checked = checked_data;
            }
            ref(&step, &attr);
            VLOG(10) << attr;

            auto verifier = [](
                const typename KernelTuple::func_type tgt,
                const std::vector<T>& xsrc, const std::vector<T>& wp,
                const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
                const std::vector<T>& ht_ref,
                const typename KernelTuple::attr_type& attr) {
              EXPECT_TRUE(tgt != nullptr);
              EXPECT_EQ(ct_ref.size(), ht_ref.size());
              EXPECT_EQ(ct_1.size(), ht_ref.size());
...@@ -215,7 +302,8 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
              // x could be changed after compute, so copy to save src
              int d = ht_ref.size();
              std::vector<T> x(xsrc.size()), ct(ct_ref.size()),
                  ht(ht_ref.size());
              std::vector<T> checked(2 * d);
              std::copy(xsrc.begin(), xsrc.end(), x.begin());
...@@ -241,17 +329,50 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
              tgt(&step, &attr);
              ExpectEQ<T>(ct_data, ct_ref_data, d);
              ExpectEQ<T>(ht_data, ht_ref_data, d);
            };
            TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, wp, ct_1,
                                                 ct_ref, ht_ref, attr);
          }
        }
      }
    }
  }
}
template <typename KernelTuple, typename PlaceType>
void TestKernelGRU() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  for (int d : test_sizes) {
    for (auto& act_gate : all_acts) {
      for (auto& act_cand : all_acts) {
        const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
                                   jit::to_kerneltype(act_cand));
        auto ref = jit::GetReferFunc<KernelTuple>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
        RandomVec<T>(3 * d, xsrc.data());
        RandomVec<T>(d, ht_1.data());
        // x could be changed after compute, so copy to save src
        std::vector<T> x(xsrc.size());
        std::copy(xsrc.begin(), xsrc.end(), x.begin());
        const T* ht_1_data = ht_1.data();
        T* x_data = x.data();
        T* ht_ref_data = ht_ref.data();
        jit::gru_t step;
        step.gates = x_data;
        step.ht_1 = ht_1_data;
        step.ht = ht_ref_data;
        ref(&step, &attr);
        VLOG(10) << attr;

        auto verifier = [](const typename KernelTuple::func_type tgt,
                           const std::vector<T>& xsrc,
                           const std::vector<T>& ht_1,
                           const std::vector<T>& ht_ref,
                           const typename KernelTuple::attr_type& attr) {
          EXPECT_TRUE(tgt != nullptr);
          EXPECT_EQ(ht_1.size(), ht_ref.size());
          EXPECT_EQ(xsrc.size(), 3 * ht_ref.size());
...@@ -270,131 +391,125 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
          step.ht = ht_data;
          tgt(&step, &attr);
          ExpectEQ<T>(ht_data, ht_ref_data, d);
        };
        TestAllImpls<KernelTuple, PlaceType>(attr, verifier, xsrc, ht_1, ht_ref,
                                             attr);
      }
    }
  }
}
template <typename T>
struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
typename jit::SeqPoolTuples<T>::attr_type> {
void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
const std::vector<T>& x, const std::vector<T>& yref,
const typename jit::SeqPoolTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
int w = yref.size();
std::vector<T> y(w);
const T* x_data = x.data();
const T* yref_data = yref.data();
T* y_data = y.data();
tgt(x_data, y_data, &attr);
ExpectEQ<T>(y_data, yref_data, w);
  }
};
template <typename T>
struct TestFuncWithRefer<jit::EmbSeqPoolTuples<T>, std::vector<T>,
std::vector<int64_t>, std::vector<T>,
typename jit::EmbSeqPoolTuples<T>::attr_type> {
void operator()(const typename jit::EmbSeqPoolTuples<T>::func_type tgt,
const std::vector<T>& table, const std::vector<int64_t>& idx,
const std::vector<T>& oref,
const typename jit::EmbSeqPoolTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(table.size(),
static_cast<size_t>(attr.table_height * attr.table_width));
EXPECT_EQ(idx.size(),
static_cast<size_t>(attr.index_height * attr.index_width));
EXPECT_EQ(oref.size(),
static_cast<size_t>(attr.table_width * attr.index_width));
const T* table_data = table.data();
const int64_t* idx_data = idx.data();
const T* oref_data = oref.data();
int o_w = oref.size();
std::vector<T> out(o_w);
T* o_data = out.data();
tgt(table_data, idx_data, o_data, &attr);
ExpectEQ<T>(o_data, oref_data, o_w);
  }
};
}
template <typename T>
struct TestFuncWithRefer<jit::SgdTuples<T>, T, std::vector<T>, std::vector<T>,
                         std::vector<int64_t>, std::vector<T>,
                         typename jit::SgdTuples<T>::attr_type> {
  void operator()(const typename jit::SgdTuples<T>::func_type tgt, const T lr,
                  const std::vector<T>& param, const std::vector<T>& grad,
                  const std::vector<int64_t>& rows, const std::vector<T>& oref,
                  const typename jit::SgdTuples<T>::attr_type& attr) {

template <typename KernelTuple, typename PlaceType>
void TestKernelNCHW16CMulNC() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  const int n = 3, c = 16 * 4, h = 10, w = 10;
  auto ref = jit::GetReferFunc<KernelTuple>();
  EXPECT_TRUE(ref != nullptr);
  int sz = n * c * h * w;
std::vector<T> x(sz), y(n * c), zref(sz);
std::vector<T> ztgt(sz), zjit(sz);
RandomVec<T>(sz, x.data());
RandomVec<T>(n * c, y.data());
const T* x_data = x.data();
const T* y_data = y.data();
T* zref_data = zref.data();
T* ztgt_data = ztgt.data();
T* zjit_data = zjit.data();
constexpr int simd_width = ZMM_FLOAT_BLOCK;
int C = c / simd_width;
auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(0);
auto funcs = jit::GetAllCandidateFuncs<KernelTuple, PlaceType>(0);
EXPECT_GT(funcs.size(), 0UL);
auto jitcode = funcs[0];
  EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(param.size(),
static_cast<size_t>(attr.param_height * attr.param_width));
EXPECT_EQ(grad.size(),
static_cast<size_t>(attr.grad_height * attr.grad_width));
EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
EXPECT_EQ(param.size(), oref.size());
const T* param_data = param.data();
const T* grad_data = grad.data();
const int64_t* rows_data = rows.data();
const T* oref_data = oref.data();
    std::vector<T> out(oref.size());
    T* o_data = out.data();
    tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
    // only the selected rows should be equal
    for (size_t i = 0; i < rows.size(); ++i) {
      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
  if (std::is_same<T, float>::value &&
      paddle::platform::MayIUse(paddle::platform::avx512f)) {
    EXPECT_TRUE(jitcode != nullptr);
  }
for (int ni = 0; ni < n; ni++) {
for (int ci = 0; ci < C; ci++) {
auto ptr_x =
x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
auto ptr_zref =
zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
auto ptr_ztgt =
ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
      ref(ptr_x, ptr_y, ptr_zref, h, w);
      tgt(ptr_x, ptr_y, ptr_ztgt, h, w);
      if (jitcode) {
        auto ptr_zjit =
            zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
        jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
    // inplace
    std::copy(param.begin(), param.end(), out.begin());
    tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
    for (size_t i = 0; i < rows.size(); ++i) {
      ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
                  oref_data + rows[i] * attr.grad_width, attr.grad_width);
      }
    }
};
template <typename T>
struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
std::vector<T>,
typename jit::MatMulTuples<T>::attr_type> {
void operator()(const typename jit::MatMulTuples<T>::func_type tgt,
const std::vector<T>& a, const std::vector<T>& b,
const std::vector<T>& cref,
const typename jit::MatMulTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
std::vector<T> c(cref.size());
const T* a_data = a.data();
const T* b_data = b.data();
const T* cref_data = cref.data();
T* c_data = c.data();
tgt(a_data, b_data, c_data, &attr);
ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
  }
};
  ExpectEQ<T>(ztgt_data, zref_data, sz);
if (jitcode) {
ExpectEQ<T>(zjit_data, zref_data, sz);
}
}
template <typename T>
struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
                         std::vector<T>, std::vector<T>, std::vector<T>,
                         std::vector<T>, std::vector<T>, int, float, int> {
  void operator()(const typename jit::LayerNormTuples<T>::func_type tgt,
                  std::vector<T>& x, std::vector<T>& outref,  // NOLINT
                  std::vector<T>& mean, std::vector<T>& var,  // NOLINT
                  const std::vector<T>& scale, const std::vector<T>& bias,
                  int left, const float epsilon, int right) {

template <typename KernelTuple, typename PlaceType>
void TestKernelLayerNorm() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  const T epsilon = 9.99999975e-06;
  for (int n : {1, 2, 10}) {
    for (int x_dim_0 : {1, 9, 17, 50}) {
      int left = n * x_dim_0;
      for (int x_dim_1 : TestSizes()) {
int right = x_dim_1;
auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr);
int sz = left * right;
std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
outref(sz);
RandomVec<T>(sz, x.data());
RandomVec<T>(left, mean.data());
RandomVec<T>(left, var.data());
RandomVec<T>(right, scale.data());
RandomVec<T>(right, bias.data());
const T* scale_data = scale.data();
const T* bias_data = bias.data();
T* x_data = x.data();
T* mean_data = mean.data();
T* var_data = var.data();
T* outref_data = outref.data();
ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
left, epsilon, right);
auto verifier = [](
const typename KernelTuple::func_type tgt, const std::vector<T>& x_,
const std::vector<T>& outref_, const std::vector<T>& mean_,
const std::vector<T>& var_, const std::vector<T>& scale,
const std::vector<T>& bias, const int& left, const float& epsilon,
const typename KernelTuple::attr_type& right) {
          EXPECT_TRUE(tgt != nullptr);
std::vector<T> outtgt(outref_.size());
std::vector<T> x(x_.size());
std::vector<T> mean(mean_.size());
std::vector<T> var(var_.size());
std::vector<T> outref(outref_.size());
std::copy(x_.begin(), x_.end(), x.begin());
std::copy(mean_.begin(), mean_.end(), mean.begin());
std::copy(var_.begin(), var_.end(), var.begin());
std::copy(outref_.begin(), outref_.end(), outref.begin());
          EXPECT_EQ(x.size(), static_cast<size_t>(left * right));
          EXPECT_EQ(outref.size(), static_cast<size_t>(left * right));
          EXPECT_EQ(mean.size(), static_cast<size_t>(left));
          EXPECT_EQ(var.size(), static_cast<size_t>(left));
          EXPECT_EQ(scale.size(), static_cast<size_t>(right));
          EXPECT_EQ(bias.size(), static_cast<size_t>(right));
    std::vector<T> outtgt(outref.size());
          const T* scale_data = scale.data();
          const T* bias_data = bias.data();
          T* x_data = x.data();
...@@ -402,263 +517,69 @@ struct TestFuncWithRefer<jit::LayerNormTuples<T>, std::vector<T>,
          T* var_data = var.data();
          T* outref_data = outref.data();
          T* outtgt_data = outtgt.data();
    tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, left,
        epsilon, right);
          tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data,
              left, epsilon, right);
          ExpectEQ<T>(outtgt_data, outref_data, left * right);
};
TestAllImpls<KernelTuple, PlaceType>(right, verifier, x, outref, mean,
var, scale, bias, left, epsilon,
right);
  }
};
      }
}
}
template <typename T>
struct TestFuncWithRefer<jit::CRFDecodingTuples<T>, int, std::vector<T>,
                         std::vector<T>, std::vector<T>, std::vector<int>,
                         int> {
  void operator()(const typename jit::CRFDecodingTuples<T>::func_type tgt,
                  const int seq_len, const std::vector<T>& x,
                  const std::vector<T>& w, std::vector<T>& alpharef,  // NOLINT
                  std::vector<int>& trackref, int tag_num) {  // NOLINT

template <typename KernelTuple, typename PlaceType>
void TestKernelCRFDecoding() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  constexpr int state_trans_base_idx = 2;
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
  for (int seq_len : {1, 11, 17, 50}) {
for (int tag_num : test_sizes) {
auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr);
int x_sz = seq_len * tag_num;
int w_sz = (tag_num + state_trans_base_idx) * tag_num;
std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
std::vector<int> trackref(x_sz);
RandomVec<T>(x_sz, x.data());
RandomVec<T>(w_sz, w.data());
ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
trackref.data(), tag_num);
auto verifier = [](
const typename KernelTuple::func_type tgt, const int& seq_len,
const std::vector<T>& x, const std::vector<T>& w,
const std::vector<T>& alpharef, const std::vector<int>& trackref,
const typename KernelTuple::attr_type& tag_num) {
        constexpr int state_trans_base_idx = 2;
        EXPECT_TRUE(tgt != nullptr);
        EXPECT_EQ(x.size(), static_cast<size_t>(seq_len * tag_num));
        EXPECT_EQ(w.size(), static_cast<size_t>(
                                (tag_num + state_trans_base_idx) * tag_num));
        EXPECT_EQ(alpharef.size(), static_cast<size_t>(seq_len * tag_num));
        EXPECT_EQ(trackref.size(), static_cast<size_t>(seq_len * tag_num));
        std::vector<T> alphatgt(alpharef.size());
        std::vector<int> tracktgt(trackref.size());
        memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int));
        memcpy(trackref.data(), tracktgt.data(), tag_num * sizeof(int));
        tgt(seq_len, (const T*)x.data(), (const T*)w.data(), alphatgt.data(),
            tracktgt.data(), tag_num);
        ExpectEQ<T>(alpharef.data(), alphatgt.data(), seq_len * tag_num);
        ExpectEQ<int>(trackref.data(), tracktgt.data(), seq_len * tag_num);
  }
};
      };
      TestAllImpls<KernelTuple, PlaceType>(tag_num, verifier, seq_len, x, w,
                                           alpharef, trackref, tag_num);
template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
typename... Args>
void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
TestFuncWithRefer<KernelTuples, Args...> test;
// test jitcode
auto jitcode = jit::GetJitCode<KT, KernelTuples, PlaceType>(attr);
if (jitcode) {
VLOG(10) << "Test Jitcode Kernel ";
test(jitcode, args...);
}
// test all impls in more
jit::KernelKey kkey(KT, PlaceType());
auto& pool = jit::KernelPool().Instance().AllKernels();
auto iter = pool.find(kkey);
if (iter != pool.end()) {
auto& impls = iter->second;
for (auto& impl : impls) {
auto i = dynamic_cast<const jit::KernelMore<KernelTuples>*>(impl.get());
if (i && i->UseMe(attr)) {
auto more = i->GetFunc();
VLOG(10) << "Test More Kernel : " << i->ImplType();
test(more, args...);
}
}
}
// test result from Get function
// VLOG(10) << "Test Get function ";
auto tgt = jit::Get<KT, KernelTuples, PlaceType>(attr);
test(tgt, args...);
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelXYZNTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(d), y(d), zref(d);
RandomVec<T>(d, x.data());
RandomVec<T>(d, y.data());
std::vector<T> xinp(d), yinp(d); // inplace test
std::copy(x.begin(), x.end(), xinp.begin());
std::copy(y.begin(), y.end(), yinp.begin());
const T* x_data = x.data();
const T* y_data = y.data();
T* zref_data = zref.data();
T* xinp_data = xinp.data();
T* yinp_data = yinp.data();
// test refer code inplace
ref(x_data, y_data, zref_data, d);
ref(x_data, yinp_data, yinp_data, d);
ref(xinp_data, y_data, xinp_data, d);
ExpectEQ<T>(xinp_data, zref_data, d);
ExpectEQ<T>(yinp_data, zref_data, d);
TestAllImpls<KT, jit::XYZNTuples<T>, PlaceType, std::vector<T>,
std::vector<T>, std::vector<T>>(d, x, y, zref);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelAXYNTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
EXPECT_TRUE(ref != nullptr);
const T a = static_cast<T>(3);
std::vector<T> x(d), yref(d);
std::vector<T> xinp(d); // inplace test
RandomVec<T>(d, x.data());
std::copy(x.begin(), x.end(), xinp.begin());
const T* x_data = x.data();
T* yref_data = yref.data();
T* xinp_data = xinp.data();
// test refer code inplace
ref(&a, x_data, yref_data, d);
ref(&a, xinp_data, xinp_data, d);
ExpectEQ<T>(xinp_data, yref_data, d);
TestAllImpls<KT, jit::AXYNTuples<T>, PlaceType, T, std::vector<T>,
std::vector<T>>(d, a, x, yref);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelXRNTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
auto last_acc = FLAGS_acc;
FLAGS_acc = 1e-4;
for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(d);
RandomVec<T>(d, x.data());
T ref_res;
ref(x.data(), &ref_res, d);
TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
ref_res);
}
FLAGS_acc = last_acc;
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelXYNTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(d), yref(d);
std::vector<T> xinp(d); // inplace test
RandomVec<T>(d, x.data());
std::copy(x.begin(), x.end(), xinp.begin());
const T* x_data = x.data();
T* yref_data = yref.data();
T* xinp_data = xinp.data();
// test refer code inplace
ref(x_data, yref_data, d);
ref(xinp_data, xinp_data, d);
ExpectEQ<T>(xinp_data, yref_data, d);
TestAllImpls<KT, jit::XYNTuples<T>, PlaceType, std::vector<T>,
std::vector<T>>(d, x, yref);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelLSTMTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
auto test_sizes = TestSizes();
test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
for (int d : test_sizes) {
for (bool use_peephole : {true, false}) {
for (auto& act_gate : all_acts) {
for (auto& act_cand : all_acts) {
for (auto& act_cell : all_acts) {
const jit::lstm_attr_t attr(
d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand),
jit::to_kerneltype(act_cell), use_peephole);
auto ref = jit::GetRefer<KT, jit::LSTMTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
RandomVec<T>(4 * d, xsrc.data());
RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
// x could be changed after compute, so copy to save src
std::vector<T> x(xsrc.size());
std::copy(xsrc.begin(), xsrc.end(), x.begin());
const T* ct_1_data = ct_1.data();
const T* wp_data = wp.data();
T* x_data = x.data();
T* checked_data = checked.data();
T* ct_ref_data = ct_ref.data();
T* ht_ref_data = ht_ref.data();
jit::lstm_t step;
step.gates = x_data;
step.ct_1 = ct_1_data;
step.ct = ct_ref_data;
step.ht = ht_ref_data;
if (use_peephole) {
step.wp = wp_data;
step.checked = checked_data;
}
ref(&step, &attr);
VLOG(10) << attr;
TestAllImpls<KT, jit::LSTMTuples<T>, PlaceType, std::vector<T>,
std::vector<T>, std::vector<T>, std::vector<T>,
std::vector<T>>(attr, xsrc, wp, ct_1, ct_ref, ht_ref,
attr);
}
}
}
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelGRUTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
auto test_sizes = TestSizes();
test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
for (int d : test_sizes) {
for (auto& act_gate : all_acts) {
for (auto& act_cand : all_acts) {
const jit::gru_attr_t attr(d, jit::to_kerneltype(act_gate),
jit::to_kerneltype(act_cand));
auto ref = jit::GetRefer<KT, jit::GRUTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
RandomVec<T>(3 * d, xsrc.data());
RandomVec<T>(d, ht_1.data());
// x could be changed after compute, so copy to save src
std::vector<T> x(xsrc.size());
std::copy(xsrc.begin(), xsrc.end(), x.begin());
const T* ht_1_data = ht_1.data();
T* x_data = x.data();
T* ht_ref_data = ht_ref.data();
jit::gru_t step;
step.gates = x_data;
step.ht_1 = ht_1_data;
step.ht = ht_ref_data;
ref(&step, &attr);
VLOG(10) << attr;
TestAllImpls<KT, jit::GRUTuples<T>, PlaceType, std::vector<T>,
std::vector<T>, std::vector<T>>(attr, xsrc, ht_1, ht_ref,
attr);
}
    }
  }
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelSeqPoolTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);

template <typename KernelTuple, typename PlaceType>
void TestKernelSeqPool() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
  auto test_sizes = TestSizes();
...@@ -668,7 +589,7 @@ void TestKernelSeqPoolTuples() {
    jit::seq_pool_attr_t attr(w, type);
    for (int h : test_sizes) {
      attr.h = h;
      auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
      auto ref = jit::GetReferFunc<KernelTuple>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> x(h * w), yref(w);
      RandomVec<T>(h * w, x.data());
...@@ -676,16 +597,86 @@ void TestKernelSeqPoolTuples() {
      T* yref_data = yref.data();
      ref(x_data, yref_data, &attr);
      VLOG(10) << attr;
      TestAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType, std::vector<T>,
                   std::vector<T>>(attr, x, yref, attr);
      auto verifier = [](const typename KernelTuple::func_type tgt,
                         const std::vector<T>& x, const std::vector<T>& yref,
const typename KernelTuple::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
int w = yref.size();
std::vector<T> y(w);
const T* x_data = x.data();
const T* yref_data = yref.data();
T* y_data = y.data();
tgt(x_data, y_data, &attr);
ExpectEQ<T>(y_data, yref_data, w);
};
TestAllImpls<KernelTuple, PlaceType>(attr, verifier, x, yref, attr);
}
}
}
}
template <typename KernelTuple, typename PlaceType>
void TestKernelEmbSeqPool() {
using T = typename KernelTuple::data_type;
VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
int64_t tbl_h = 1e4;
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum}; // only support sum yet
auto test_sizes = TestSizes();
test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
for (int tbl_w : test_sizes) {
std::vector<T> table(tbl_h * tbl_w);
RandomVec<T>(tbl_h * tbl_w, table.data());
const T* table_data = table.data();
for (auto type : pool_types) {
for (int idx_w : {1, 2, 10, 16}) {
for (int idx_h : {1, 2, 9, 13, 16}) {
auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr);
std::vector<int64_t> idx(idx_h * idx_w);
RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
int64_t out_w = tbl_w * idx_w;
std::vector<T> oref(out_w);
const int64_t* idx_data = idx.data();
T* o_data = oref.data();
jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
type);
ref(table_data, idx_data, o_data, &attr);
auto verifier = [](const typename KernelTuple::func_type tgt,
const std::vector<T>& table,
const std::vector<int64_t>& idx,
const std::vector<T>& oref,
const typename KernelTuple::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(table.size(), static_cast<size_t>(attr.table_height *
attr.table_width));
EXPECT_EQ(idx.size(), static_cast<size_t>(attr.index_height *
attr.index_width));
EXPECT_EQ(oref.size(),
static_cast<size_t>(attr.table_width * attr.index_width));
const T* table_data = table.data();
const int64_t* idx_data = idx.data();
const T* oref_data = oref.data();
int o_w = oref.size();
std::vector<T> out(o_w);
T* o_data = out.data();
tgt(table_data, idx_data, o_data, &attr);
ExpectEQ<T>(o_data, oref_data, o_w);
};
TestAllImpls<KernelTuple, PlaceType>(attr, verifier, table, idx, oref,
attr);
}
      }
    }
  }
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelMatMulTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);

template <typename KernelTuple, typename PlaceType>
void TestKernelMatMul() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  auto last_acc = FLAGS_acc;
  // export MKL_CBWR=AVX would make MKL force to use AVX
  // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic
...@@ -693,7 +684,7 @@ void TestKernelMatMulTuples() {
  for (int m : {1, 2, 3, 4}) {
    for (int n : {1, 2, 3, 4}) {
      for (int k : TestSizes()) {
        auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
        auto ref = jit::GetReferFunc<KernelTuple>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> a(m * k), b(k * n), c(m * n);
        RandomVec<T>(m * k, a.data());
...@@ -703,20 +694,36 @@ void TestKernelMatMulTuples() {
        T* c_data = c.data();
        const jit::matmul_attr_t attr{m, n, k};
        ref(a_data, b_data, c_data, &attr);
        TestAllImpls<KT, jit::MatMulTuples<T>, PlaceType, std::vector<T>,
                     std::vector<T>, std::vector<T>>(attr, a, b, c, attr);
        auto verifier = [](const typename KernelTuple::func_type tgt,
                           const std::vector<T>& a, const std::vector<T>& b,
const std::vector<T>& cref,
const typename KernelTuple::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(a.size(), static_cast<size_t>(attr.m * attr.k));
EXPECT_EQ(b.size(), static_cast<size_t>(attr.k * attr.n));
EXPECT_EQ(cref.size(), static_cast<size_t>(attr.m * attr.n));
std::vector<T> c(cref.size());
const T* a_data = a.data();
const T* b_data = b.data();
const T* cref_data = cref.data();
T* c_data = c.data();
tgt(a_data, b_data, c_data, &attr);
ExpectEQ<T>(c_data, cref_data, attr.m * attr.n);
};
TestAllImpls<KernelTuple, PlaceType>(attr, verifier, a, b, c, attr);
      }
    }
  }
  FLAGS_acc = last_acc;
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelSoftmaxTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);

template <typename KernelTuple, typename PlaceType>
void TestKernelSoftmax() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  for (int bs : {1, 2, 10}) {
    for (int n : TestSizes()) {
      auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
      auto ref = jit::GetReferFunc<KernelTuple>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> x(bs * n), y(bs * n);
      RandomVec<T>(bs * n, x.data());
...@@ -730,51 +737,33 @@ void TestKernelSoftmaxTuples() {
      ref(xinp_data, xinp_data, n, bs);
      ExpectEQ<T>(xinp_data, y_data, n * bs);
      auto verifier = [](const typename KernelTuple::func_type tgt,
                         const std::vector<T>& x, const std::vector<T>& yref,
                         int n, int bs) {
        EXPECT_TRUE(tgt != nullptr);
        EXPECT_EQ(yref.size(), x.size());
        EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
        const T* x_data = x.data();
        const T* yref_data = yref.data();
        std::vector<T> ytgt(n * bs);
        T* ytgt_data = ytgt.data();
        // test normal
        tgt(x_data, ytgt_data, n, bs);
        ExpectEQ<T>(ytgt_data, yref_data, n * bs);
        // test inplace x
        std::copy(x.begin(), x.end(), ytgt.begin());
        tgt(ytgt_data, ytgt_data, n, bs);
        ExpectEQ<T>(ytgt_data, yref_data, n * bs);
      };
      TestAllImpls<KernelTuple, PlaceType>(n, verifier, x, y, n, bs);
      TestAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType, std::vector<T>,
                   std::vector<T>>(n, x, y, n, bs);
    }
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelEmbSeqPoolTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  int64_t tbl_h = 1e4;
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum};  // only support sum yet
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  for (int tbl_w : test_sizes) {
    std::vector<T> table(tbl_h * tbl_w);
    RandomVec<T>(tbl_h * tbl_w, table.data());
    const T* table_data = table.data();
    for (auto type : pool_types) {
for (int idx_w : {1, 2, 10, 16}) {
for (int idx_h : {1, 2, 9, 13, 16}) {
auto ref = jit::GetRefer<KT, jit::EmbSeqPoolTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<int64_t> idx(idx_h * idx_w);
RandomVec<int64_t>(idx_h * idx_w, idx.data(), 0, tbl_h - 1);
int64_t out_w = tbl_w * idx_w;
std::vector<T> oref(out_w);
const int64_t* idx_data = idx.data();
T* o_data = oref.data();
jit::emb_seq_pool_attr_t attr(tbl_h, tbl_w, idx_h, idx_w, out_w,
type);
ref(table_data, idx_data, o_data, &attr);
TestAllImpls<KT, jit::EmbSeqPoolTuples<T>, PlaceType, std::vector<T>,
std::vector<int64_t>, std::vector<T>>(attr, table, idx,
oref, attr);
}
}
    }
  }
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelSgdTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);

template <typename KernelTuple, typename PlaceType>
void TestKernelSgd() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  const T lr = 0.1;
  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                  const int64_t upper) -> std::vector<int64_t> {
...@@ -802,7 +791,7 @@ void TestKernelSgdTuples() {
      RandomVec<T>(rows_size * grad_w, grad.data());
      const int64_t* rows_data = rows.data();
      const T* grad_data = grad.data();
      auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
      auto ref = jit::GetReferFunc<KernelTuple>();
      EXPECT_TRUE(ref != nullptr);
      jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
      ref(&lr, param_data, grad_data, rows_data, out_data, &attr);
...@@ -818,227 +807,488 @@ void TestKernelSgdTuples() {
                    grad_w);
      }
      TestAllImpls<KT, jit::SgdTuples<T>, PlaceType, T, std::vector<T>,
                   std::vector<T>, std::vector<int64_t>, std::vector<T>>(
          attr, lr, param, grad, rows, param_out, attr);
    }
  }
}
      auto verifier = [](
          const typename KernelTuple::func_type tgt, const T lr,
          const std::vector<T>& param, const std::vector<T>& grad,
          const std::vector<int64_t>& rows, const std::vector<T>& oref,
          const typename KernelTuple::attr_type& attr) {
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelNCHW16CMulNCTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
const int n = 3, c = 16 * 4, h = 10, w = 10;
auto ref = jit::GetRefer<KT, jit::NCHW16CMulNCTuples<T>>();
EXPECT_TRUE(ref != nullptr);
int sz = n * c * h * w;
std::vector<T> x(sz), y(n * c), zref(sz);
std::vector<T> ztgt(sz), zjit(sz);
RandomVec<T>(sz, x.data());
RandomVec<T>(n * c, y.data());
const T* x_data = x.data();
const T* y_data = y.data();
T* zref_data = zref.data();
T* ztgt_data = ztgt.data();
T* zjit_data = zjit.data();
constexpr int simd_width = ZMM_FLOAT_BLOCK;
int C = c / simd_width;
auto tgt = jit::Get<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
auto jitcode = jit::GetJitCode<KT, jit::NCHW16CMulNCTuples<T>, PlaceType>(0);
        EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(param.size(),
static_cast<size_t>(attr.param_height * attr.param_width));
EXPECT_EQ(grad.size(),
static_cast<size_t>(attr.grad_height * attr.grad_width));
EXPECT_EQ(rows.size(), static_cast<size_t>(attr.selected_rows_size));
EXPECT_EQ(param.size(), oref.size());
const T* param_data = param.data();
const T* grad_data = grad.data();
const int64_t* rows_data = rows.data();
const T* oref_data = oref.data();
        std::vector<T> out(oref.size());
        T* o_data = out.data();
        tgt(&lr, param_data, grad_data, rows_data, o_data, &attr);
        // only the selected rows should be equal
        for (size_t i = 0; i < rows.size(); ++i) {
          ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
                      oref_data + rows[i] * attr.grad_width, attr.grad_width);
  if (std::is_same<T, float>::value &&
      paddle::platform::MayIUse(paddle::platform::avx512f)) {
    EXPECT_TRUE(jitcode != nullptr);
  }
  for (int ni = 0; ni < n; ni++) {
    for (int ci = 0; ci < C; ci++) {
      auto ptr_x =
x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
auto ptr_zref =
zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
auto ptr_ztgt =
ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
ref(ptr_x, ptr_y, ptr_zref, h, w);
tgt(ptr_x, ptr_y, ptr_ztgt, h, w);
if (jitcode) {
auto ptr_zjit =
zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
jitcode(ptr_x, ptr_y, ptr_zjit, h, w);
}
}
}
ExpectEQ<T>(ztgt_data, zref_data, sz);
if (jitcode) {
ExpectEQ<T>(zjit_data, zref_data, sz);
} }
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestKernelLayerNormTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
const T epsilon = 9.99999975e-06;
for (int n : {1, 2, 10}) {
for (int x_dim_0 : {1, 9, 17, 50}) {
int left = n * x_dim_0;
for (int x_dim_1 : TestSizes()) {
int right = x_dim_1;
auto ref = jit::GetRefer<KT, jit::LayerNormTuples<T>>();
EXPECT_TRUE(ref != nullptr);
int sz = left * right;
std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
outref(sz);
RandomVec<T>(sz, x.data());
RandomVec<T>(left, mean.data());
RandomVec<T>(left, var.data());
RandomVec<T>(right, scale.data());
RandomVec<T>(right, bias.data());
const T* scale_data = scale.data();
const T* bias_data = bias.data();
T* x_data = x.data();
T* mean_data = mean.data();
T* var_data = var.data();
T* outref_data = outref.data();
ref(x_data, outref_data, mean_data, var_data, scale_data, bias_data,
left, epsilon, right);
        TestAllImpls<KT, jit::LayerNormTuples<T>, PlaceType, std::vector<T>,
                     std::vector<T>, std::vector<T>, std::vector<T>,
                     std::vector<T>, std::vector<T>, int, float>(
            right, x, outref, mean, var, scale, bias, left, epsilon, right);
      }
    }
  }
}
        // inplace
        std::copy(param.begin(), param.end(), out.begin());
        tgt(&lr, o_data, grad_data, rows_data, o_data, &attr);
        for (size_t i = 0; i < rows.size(); ++i) {
          ExpectEQ<T>(o_data + rows[i] * attr.grad_width,
                      oref_data + rows[i] * attr.grad_width, attr.grad_width);
        }
      };
      TestAllImpls<KernelTuple, PlaceType>(attr, verifier, lr, param, grad,
                                           rows, param_out, attr);
    }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestKernelCRFDecodingTuples() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
constexpr int state_trans_base_idx = 2;
auto test_sizes = TestSizes();
test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
for (int seq_len : {1, 11, 17, 50}) {
for (int tag_num : test_sizes) {
auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
EXPECT_TRUE(ref != nullptr);
int x_sz = seq_len * tag_num;
int w_sz = (tag_num + state_trans_base_idx) * tag_num;
std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
std::vector<int> trackref(x_sz);
RandomVec<T>(x_sz, x.data());
RandomVec<T>(w_sz, w.data());
ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
trackref.data(), tag_num);
TestAllImpls<KT, jit::CRFDecodingTuples<T>, PlaceType, int,
std::vector<T>, std::vector<T>, std::vector<T>,
std::vector<int>, int>(tag_num, seq_len, x, w, alpharef,
trackref, tag_num);
} }
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelVBroadcastTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);

template <typename KernelTuple, typename PlaceType>
void TestKernelVBroadcast() {
  using T = typename KernelTuple::data_type;
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  for (int w : TestSizes()) {
    std::vector<T> x(w);
    RandomVec<T>(w, x.data());
    const T* x_data = x.data();
    for (int64_t h : {1, 2, 6}) {
      auto ref = jit::GetRefer<KT, jit::VBroadcastTuples<T>>();
      auto ref = jit::GetReferFunc<KernelTuple>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> y(w * h);
      T* y_data = y.data();
      ref(x_data, y_data, h, w);
      TestAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType, std::vector<T>,
                   std::vector<T>, int64_t>(static_cast<int64_t>(w), x, y, h,
                                            static_cast<int64_t>(w));
      auto verifier = [](const typename KernelTuple::func_type tgt,
                         const std::vector<T>& x, const std::vector<T>& yref,
                         const int64_t& h,
const typename KernelTuple::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size(), static_cast<size_t>(attr));
EXPECT_EQ(yref.size(), x.size() * h);
std::vector<T> y(yref.size());
const T* x_data = x.data();
const T* yref_data = yref.data();
T* y_data = y.data();
tgt(x_data, y_data, h, attr);
ExpectEQ<T>(y_data, yref_data, yref.size());
};
TestAllImpls<KernelTuple, PlaceType>(static_cast<int64_t>(w), verifier, x,
y, h, static_cast<int64_t>(w));
    }
  }
}
#define TEST_CPU_KERNEL(test_tuple, kernel_type)                    \
  TEST(JITKernel, kernel_type) {                                    \
    TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>();    \
    TestKernel##test_tuple<jit::kernel_type, double, CPUPlace>();   \
  }

// test pool
TEST(JITKernel_pool, jitcreator) {
  const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators();
#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
EXPECT_EQ(jitcreators.size(), 0UL);
#else
EXPECT_EQ(jitcreators.size(), 25UL);
#endif
}
TEST(JITKernel_pool, jitpool) {
  // the jitcode pool is keyed by the attr
const auto& kers = jit::JitCodePool<jit::kVAdd>().Instance().AllKernels();
EXPECT_EQ(kers.size(), 0UL);
jit::GetAllCandidateKernels<jit::VAddTuple<float>, CPUPlace>(3);
  // after calling GetAllCandidateKernels, the jitcode is created automatically
#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
EXPECT_EQ(kers.size(), 0UL);
#else
EXPECT_EQ(kers.size(), 1UL);
#endif
}
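// A hedged illustration (not part of this commit) of the attr keying noted
// above: repeating an attr is expected to reuse the cached jitcode, while a
// different attr is expected to add a second entry.
//   jit::GetAllCandidateKernels<jit::VAddTuple<float>, CPUPlace>(3);  // creates key 3
//   jit::GetAllCandidateKernels<jit::VAddTuple<float>, CPUPlace>(3);  // reuses key 3
//   jit::GetAllCandidateKernels<jit::VAddTuple<float>, CPUPlace>(5);  // creates key 5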
TEST(JITKernel_pool, more) {
const auto& kers = jit::KernelPool::Instance().AllKernels();
#if defined(__APPLE__) || defined(__OSX__)
EXPECT_EQ(kers.size(), 10UL);
#else
#ifdef PADDLE_WITH_MKLML
EXPECT_EQ(kers.size(), 21UL);
#else
EXPECT_EQ(kers.size(), 8UL);
#endif
#endif
}
TEST(JITKernel_pool, refer) {
const auto& kers = jit::ReferKernelPool::Instance().AllKernels();
EXPECT_EQ(kers.size(), 29UL);
}
// test helper
TEST(JITKernel_helper, GetAllCandidateKernels) {
auto fp_kers =
jit::GetAllCandidateKernels<jit::VExpTuple<float>, CPUPlace>(10);
#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
EXPECT_GE(fp_kers.size(), 1UL); // refer
#else
#ifdef PADDLE_WITH_MKLML
EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer
#else
EXPECT_GE(fp_kers.size(), 2UL); // jitcode, refer
#endif
#endif
auto db_kers =
jit::GetAllCandidateKernels<jit::VExpTuple<double>, CPUPlace>(10);
#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
EXPECT_GE(db_kers.size(), 1UL); // refer
#else
#ifdef PADDLE_WITH_MKLML
EXPECT_GE(db_kers.size(), 2UL); // mkl, refer
#else
EXPECT_GE(db_kers.size(), 1UL); // refer
#endif
#endif
}
TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) {
auto fp_kers =
jit::GetAllCandidateFuncsWithTypes<jit::VExpTuple<float>, CPUPlace>(10);
#if defined(__APPLE__) || defined(__OSX__)
EXPECT_GE(fp_kers.size(), 1UL); // refer
#else
#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32)
EXPECT_GE(fp_kers.size(), 2UL); // jitcode/mkl, refer
#else
EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer
#endif
#endif
auto db_kers =
jit::GetAllCandidateFuncsWithTypes<jit::VExpTuple<double>, CPUPlace>(10);
#if defined(__APPLE__) || defined(__OSX__) || !defined(PADDLE_WITH_MKLML)
EXPECT_GE(db_kers.size(), 1UL); // refer
#else
EXPECT_GE(db_kers.size(), 2UL); // mkl, refer
#endif
}
TEST(JITKernel_helper, KernelFuncs) {
auto f1 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache().At(3);
auto f2 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache()[3];
EXPECT_TRUE(f1 != nullptr);
EXPECT_TRUE(f1 == f2);
auto f3 = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache()[5];
#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__)
EXPECT_TRUE(f2 == f3);
#else
EXPECT_TRUE(f2 != f3);
#endif
}
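// A hedged usage sketch of the cache exercised above: At(attr) and
// operator[] return the same callable for the same attr, and the result can
// be invoked directly (XYZN-style kernels take x, y, z and the length).
//   auto vadd = jit::KernelFuncs<jit::VAddTuple<float>, CPUPlace>::Cache().At(3);
//   float x[3] = {1, 2, 3}, y[3] = {4, 5, 6}, z[3];
//   vadd(x, y, z, 3);  // expected: z = {5, 7, 9}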
TEST(JITKernel_helper, GetAllCandidateFuncs) {
auto funcs = jit::GetAllCandidateFuncs<jit::VExpTuple<float>, CPUPlace>(10);
auto kers = jit::GetAllCandidateKernels<jit::VExpTuple<float>, CPUPlace>(10);
EXPECT_EQ(funcs.size(), kers.size());
std::vector<float> x(10), tgt(10);
RandomVec<float>(10, x.data());
auto best = jit::GetDefaultBestFunc<jit::VExpTuple<float>, CPUPlace>(10);
best(x.data(), tgt.data(), 10);
for (auto f : funcs) {
std::vector<float> y(10);
f(x.data(), y.data(), 10);
ExpectEQ<float>(y.data(), tgt.data(), 10);
} }
}
TEST_CPU_KERNEL(XYZNTuples, kVMul);
TEST_CPU_KERNEL(XYZNTuples, kVAdd);
TEST_CPU_KERNEL(XYZNTuples, kVAddRelu);
TEST_CPU_KERNEL(XYZNTuples, kVSub);

TEST(JITKernel_helper, pack_weights) {
  const int N = 8 * 60, K = 2;
  float src[K][N], yref[K][N], y[K * N];
  float* x = &(src[0][0]);
float* ref = &(yref[0][0]);
for (int i = 0; i < N * K; ++i) {
*(x + i) = static_cast<float>(i);
}
int block = 0;
std::vector<int> groups;
if (paddle::platform::MayIUse(paddle::platform::avx512f)) {
block = ZMM_FLOAT_BLOCK;
groups.push_back(30);
} else {
block = YMM_FLOAT_BLOCK;
groups.insert(groups.end(), {14, 14, 14, 14, 4});
}
TEST_CPU_KERNEL(AXYNTuples, kVScal);
TEST_CPU_KERNEL(AXYNTuples, kVAddBias);
  int offset = 0;
  int acc = 0;
for (int g : groups) {
g = g * block;
for (int k = 0; k < K; ++k) {
for (int i = 0; i < g; ++i) {
*(ref + offset) = src[k][i + acc];
offset++;
}
}
acc += g;
}
TEST_CPU_KERNEL(XRNTuples, kHMax);
TEST_CPU_KERNEL(XRNTuples, kHSum);
  jit::pack_weights<float>(x, y, N, K);
  ExpectEQ<float>(y, ref, N * K);
}
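// Illustrative trace of the packing checked above (toy sizes, not the
// N = 8 * 60, K = 2 used by the test): with block = 4 and a single group
// g = 1, i.e. g * block = 4 packed columns,
//   src row0 = a0 a1 a2 a3 ...,  src row1 = b0 b1 b2 b3 ...
// is laid out group by group with all K row slices back to back:
//   y = a0 a1 a2 a3 | b0 b1 b2 b3 | <next group of row0> | ...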
TEST_CPU_KERNEL(XYNTuples, kVRelu);
TEST_CPU_KERNEL(XYNTuples, kVIdentity);
TEST_CPU_KERNEL(XYNTuples, kVSquare);
TEST_CPU_KERNEL(XYNTuples, kVExp);
TEST_CPU_KERNEL(XYNTuples, kVSigmoid);
TEST_CPU_KERNEL(XYNTuples, kVTanh);
TEST_CPU_KERNEL(XYNTuples, kVCopy);

TEST(JITKernel_helper, attr) {
  std::ostringstream out;
  // KernelTypes
  out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding)
      << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1)
      << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2)
      << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax)
<< jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1)
<< jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul)
<< jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool)
<< jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd)
<< jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu)
<< jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy)
<< jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity)
<< jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu)
<< jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd)
<< jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare)
<< jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh);
EXPECT_EQ(out.str().size(), 234);
// SeqPoolTypes
out.str("");
out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg)
<< jit::to_string(jit::kSqrt);
EXPECT_EQ(out.str().size(), 13);
EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu);
EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity);
EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp);
EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid);
EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh);
out.str("");
out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
EXPECT_EQ(out.str().size(), 89);
out.str("");
out << jit::gru_attr_t(8, jit::kVIdentity, jit::kVSigmoid);
EXPECT_EQ(out.str().size(), 52);
out.str("");
out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum);
EXPECT_EQ(out.str().size(), 44);
out.str("");
out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg);
EXPECT_EQ(out.str().size(), 93);
out.str("");
out << jit::sgd_attr_t(1, 2, 3, 4, 5);
EXPECT_EQ(out.str().size(), 81);
out.str("");
out << jit::matmul_attr_t(1, 2, 3);
EXPECT_EQ(out.str().size(), 14);
}
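// The activation names used by the LSTM/GRU tests go through the same
// mapping checked above; for instance, consistent with those tests:
//   EXPECT_EQ(jit::to_kerneltype("tanh"), jit::kVTanh);
//   EXPECT_EQ(jit::to_kerneltype("sigmoid"), jit::kVSigmoid);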
TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt);
TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1);
TEST_CPU_KERNEL(GRUTuples, kGRUH1);
TEST_CPU_KERNEL(GRUTuples, kGRUHtPart1);
TEST_CPU_KERNEL(GRUTuples, kGRUHtPart2);
TEST_CPU_KERNEL(NCHW16CMulNCTuples, kNCHW16CMulNC);
TEST_CPU_KERNEL(SeqPoolTuples, kSeqPool);
TEST_CPU_KERNEL(MatMulTuples, kMatMul);
TEST_CPU_KERNEL(SoftmaxTuples, kSoftmax);
TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
TEST_CPU_KERNEL(SgdTuples, kSgd);
TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);

// test keys
TEST(JITKernel_key, int) {
  EXPECT_TRUE(jit::JitCodeKey<int>(2) == jit::JitCodeKey<int>(2));
  EXPECT_TRUE(jit::JitCodeKey<int>(2) == jit::JitCodeKey<int64_t>(2));
  EXPECT_TRUE(jit::JitCodeKey<int>(2) != jit::JitCodeKey<int>(3));
}

TEST(JITKernel_key, gru) {
  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
  jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity);
  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);
  auto key5 = jit::JitCodeKey<jit::gru_attr_t>(attr5);
  EXPECT_TRUE(key1 == key2);
  EXPECT_TRUE(key2 != key3);
  EXPECT_TRUE(key2 != key4);
  EXPECT_TRUE(key2 != key5);
  EXPECT_TRUE(key3 != key4);
  EXPECT_TRUE(key3 != key5);
  EXPECT_TRUE(key4 != key5);
}
TEST(JITKernel_key, lstm) {
  jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr2(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh);
  auto key1 = jit::JitCodeKey<jit::lstm_attr_t>(attr1);
  auto key2 = jit::JitCodeKey<jit::lstm_attr_t>(attr2);
  auto key3 = jit::JitCodeKey<jit::lstm_attr_t>(attr3);
  auto key4 = jit::JitCodeKey<jit::lstm_attr_t>(attr4);
  EXPECT_TRUE(key1 != key2);
  EXPECT_TRUE(key2 == key3);
  EXPECT_TRUE(key3 != key4);
}

TEST(JITKernel_key, lstm) {
  jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh);
  jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
  jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true);
  auto key1 = jit::JitCodeKey<jit::lstm_attr_t>(attr1);
  auto key2 = jit::JitCodeKey<jit::lstm_attr_t>(attr2);
  auto key3 = jit::JitCodeKey<jit::lstm_attr_t>(attr3);
  auto key4 = jit::JitCodeKey<jit::lstm_attr_t>(attr4);
  auto key5 = jit::JitCodeKey<jit::lstm_attr_t>(attr5);
  auto key6 = jit::JitCodeKey<jit::lstm_attr_t>(attr6);
  EXPECT_TRUE(key1 == key2);
  EXPECT_TRUE(key2 != key3);
  EXPECT_TRUE(key2 != key4);
  EXPECT_TRUE(key2 != key5);
  EXPECT_TRUE(key3 != key4);
  EXPECT_TRUE(key3 != key5);
  EXPECT_TRUE(key4 != key5);
  EXPECT_TRUE(key5 == key6);
}
TEST(JITKernel_key, gru) {
  jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr2(9, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh);
  jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity);
  auto key1 = jit::JitCodeKey<jit::gru_attr_t>(attr1);
  auto key2 = jit::JitCodeKey<jit::gru_attr_t>(attr2);
  auto key3 = jit::JitCodeKey<jit::gru_attr_t>(attr3);
  auto key4 = jit::JitCodeKey<jit::gru_attr_t>(attr4);

TEST(JITKernel_key, seq_pool) {
  jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1);
  jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3);
  jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3);
  jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3);
  auto key1 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr1);
  auto key2 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr2);
  auto key3 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr3);
  auto key4 = jit::JitCodeKey<jit::seq_pool_attr_t>(attr4);
EXPECT_TRUE(key1 == key2);
EXPECT_TRUE(key2 != key3);
EXPECT_TRUE(key2 != key4);
EXPECT_TRUE(key3 != key4);
}
TEST(JITKernel_key, matmul) {
jit::matmul_attr_t attr1(1, 2, 3);
jit::matmul_attr_t attr2(1, 2, 3);
jit::matmul_attr_t attr3(1, 3, 3);
jit::matmul_attr_t attr4(2, 3, 4);
auto key1 = jit::JitCodeKey<jit::matmul_attr_t>(attr1);
auto key2 = jit::JitCodeKey<jit::matmul_attr_t>(attr2);
auto key3 = jit::JitCodeKey<jit::matmul_attr_t>(attr3);
auto key4 = jit::JitCodeKey<jit::matmul_attr_t>(attr4);
EXPECT_TRUE(key1 == key2);
EXPECT_TRUE(key2 != key3);
EXPECT_TRUE(key2 != key4);
EXPECT_TRUE(key3 != key4);
}
TEST(JITKernel_key, emb_seq_pool) {
jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, jit::SeqPoolType::kSum);
jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg);
jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum);
jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum);
auto key1 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr1);
auto key2 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr2);
auto key3 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr3);
auto key4 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr4);
auto key5 = jit::JitCodeKey<jit::emb_seq_pool_attr_t>(attr5);
EXPECT_TRUE(key1 == key2);
EXPECT_TRUE(key2 == key3);
EXPECT_TRUE(key2 != key4);
EXPECT_TRUE(key2 != key5);
EXPECT_TRUE(key4 != key5);
}
TEST(JITKernel_key, sgd) {
jit::sgd_attr_t attr1(1, 2, 3, 4, 5);
jit::sgd_attr_t attr2(1, 2, 3, 4, 5);
jit::sgd_attr_t attr3(9, 8, 7, 4, 6);
jit::sgd_attr_t attr4(1, 2, 3, 6, 5);
jit::sgd_attr_t attr5(10, 9, 8, 7, 6);
auto key1 = jit::JitCodeKey<jit::sgd_attr_t>(attr1);
auto key2 = jit::JitCodeKey<jit::sgd_attr_t>(attr2);
auto key3 = jit::JitCodeKey<jit::sgd_attr_t>(attr3);
auto key4 = jit::JitCodeKey<jit::sgd_attr_t>(attr4);
auto key5 = jit::JitCodeKey<jit::sgd_attr_t>(attr5);
  EXPECT_TRUE(key1 != key2);
  EXPECT_TRUE(key1 == key2);
  EXPECT_TRUE(key2 == key3);
  EXPECT_TRUE(key3 != key4);
  EXPECT_TRUE(key3 != key5);
  EXPECT_TRUE(key4 != key5);
}
// TODO(TJ): add more test about key and pool
// test kernels
#define TestKernelVMul TestKernelXYZN
#define TestKernelVAdd TestKernelXYZN
#define TestKernelVAddRelu TestKernelXYZN
#define TestKernelVSub TestKernelXYZN
#define TestKernelVScal TestKernelAXYN
#define TestKernelVAddBias TestKernelAXYN
#define TestKernelVRelu TestKernelXYN
#define TestKernelVIdentity TestKernelXYN
#define TestKernelVSquare TestKernelXYN
#define TestKernelVExp TestKernelXYN
#define TestKernelVSigmoid TestKernelXYN
#define TestKernelVTanh TestKernelXYN
#define TestKernelVCopy TestKernelXYN
#define TestKernelHMax TestKernelXRN
#define TestKernelHSum TestKernelXRN
#define TestKernelLSTMCtHt TestKernelLSTM
#define TestKernelLSTMC1H1 TestKernelLSTM
#define TestKernelGRUH1 TestKernelGRU
#define TestKernelGRUHtPart1 TestKernelGRU
#define TestKernelGRUHtPart2 TestKernelGRU
#define TEST_CPU_KERNEL(kernel_type) \
TEST(JITKernel, kernel_type) { \
TestKernel##kernel_type<jit::kernel_type##Tuple<float>, CPUPlace>(); \
TestKernel##kernel_type<jit::kernel_type##Tuple<double>, CPUPlace>(); \
}
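// Expansion sketch for one instantiation below, assuming the aliases above:
//   TEST_CPU_KERNEL(VMul) becomes
//     TEST(JITKernel, VMul) {
//       TestKernelVMul<jit::VMulTuple<float>, CPUPlace>();   // alias of TestKernelXYZN
//       TestKernelVMul<jit::VMulTuple<double>, CPUPlace>();  // alias of TestKernelXYZN
//     }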
TEST_CPU_KERNEL(VMul);
TEST_CPU_KERNEL(VAdd);
TEST_CPU_KERNEL(VAddRelu);
TEST_CPU_KERNEL(VSub);
TEST_CPU_KERNEL(VScal);
TEST_CPU_KERNEL(VAddBias);
TEST_CPU_KERNEL(VRelu);
TEST_CPU_KERNEL(VIdentity);
TEST_CPU_KERNEL(VSquare);
TEST_CPU_KERNEL(VExp);
TEST_CPU_KERNEL(VSigmoid);
TEST_CPU_KERNEL(VTanh);
TEST_CPU_KERNEL(VCopy);
TEST_CPU_KERNEL(HMax);
TEST_CPU_KERNEL(HSum);
TEST_CPU_KERNEL(LSTMCtHt);
TEST_CPU_KERNEL(LSTMC1H1);
TEST_CPU_KERNEL(GRUH1);
TEST_CPU_KERNEL(GRUHtPart1);
TEST_CPU_KERNEL(GRUHtPart2);
TEST_CPU_KERNEL(NCHW16CMulNC);
TEST_CPU_KERNEL(LayerNorm);
TEST_CPU_KERNEL(CRFDecoding);
TEST_CPU_KERNEL(SeqPool);
TEST_CPU_KERNEL(EmbSeqPool);
TEST_CPU_KERNEL(MatMul);
TEST_CPU_KERNEL(Softmax);
TEST_CPU_KERNEL(Sgd);
TEST_CPU_KERNEL(VBroadcast);
...@@ -230,8 +230,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_EQ(bias->numel(), right);
      auto ker =
          jit::Get<jit::kLayerNorm, jit::LayerNormTuples<T>, platform::CPUPlace>(
              right);
      auto ker =
          jit::KernelFuncs<jit::LayerNormTuple<T>, platform::CPUPlace>::Cache()
              .At(right);
      ker(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
          scale->data<T>(), bias->data<T>(), static_cast<int>(left),
          static_cast<const float>(epsilon), right);
...
...@@ -30,17 +30,16 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
    return;
  }
  if (relu) {
    auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>,
                                    platform::CPUPlace>::Cache()
                       .At(N);
    auto compute =
        jit::KernelFuncs<jit::VAddReluTuple<T>, platform::CPUPlace>::Cache().At(
            N);
    for (int i = 0; i < M; i++) {
      T* dst = Y + i * N;
      compute(B, dst, dst, N);
    }
  } else {
    auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>,
                                    platform::CPUPlace>::Cache()
                       .At(N);
    auto compute =
        jit::KernelFuncs<jit::VAddTuple<T>, platform::CPUPlace>::Cache().At(N);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
...
...@@ -256,8 +256,8 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
        static_cast<int>(input.numel() / input.dims()[0]),
        jit::SeqPoolType::kSum);
    auto seqpool =
        jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
            attr);
    auto seqpool =
        jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache()
            .At(attr);
    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
      attr.h = static_cast<int>(lod[i + 1] - lod[i]);
      seqpool(src, dst, &attr);
...
...@@ -82,8 +82,7 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
  const int kClassDim = 1;
  // 2D data. Batch x C
  auto compute_softmax =
      jit::KernelFuncs<jit::kSoftmax, jit::SoftmaxTuples<float>,
                       platform::CPUPlace>::Cache()
          .At(in_dims[kClassDim]);
  auto compute_softmax =
      jit::KernelFuncs<jit::SoftmaxTuple<float>, platform::CPUPlace>::Cache()
          .At(in_dims[kClassDim]);
  compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
}
...
...@@ -29,7 +29,6 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine.h" #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
...@@ -42,44 +41,75 @@ static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
  for (int i = 0; i < dims.size(); ++i) {
    int k = dims[i];
    k = k == 0 ? 1 : k;
    sp.push_back(k);
    sp.emplace_back(k);
  }
  return sp;
}
static framework::DDim Shape2Ddim(const ngraph::Shape& shape) {
std::vector<int64_t> dims;
for (size_t i = 0; i < shape.size(); ++i) {
int64_t k = shape[i];
dims.emplace_back(k);
}
return framework::make_ddim(dims);
}
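// A minimal round-trip sketch for the two helpers above; Ddim2Shape promotes
// zero dims to 1, so the round trip is not lossless:
//   framework::DDim d = framework::make_ddim({0, 3});
//   ngraph::Shape s = Ddim2Shape(d);       // {1, 3}
//   framework::DDim back = Shape2Ddim(s);  // {1, 3}, not {0, 3}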
static std::map<framework::proto::VarType::Type, ngraph::element::Type>
    pd2ng_type_map = {
        {framework::proto::VarType::FP32, ngraph::element::f32},
        {framework::proto::VarType::FP64, ngraph::element::f64},
        {framework::proto::VarType::INT32, ngraph::element::i32},
        {framework::proto::VarType::INT64, ngraph::element::i64},
        {framework::proto::VarType::BOOL, ngraph::element::boolean},
};
        {framework::proto::VarType::BOOL, ngraph::element::boolean}};
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>> NgraphEngine::func_cache_ = {};
static std::map<ngraph::element::Type, framework::proto::VarType::Type>
    ng2pd_type_map = {
        {ngraph::element::f32, framework::proto::VarType::FP32},
        {ngraph::element::f64, framework::proto::VarType::FP64},
        {ngraph::element::i32, framework::proto::VarType::INT32},
        {ngraph::element::i64, framework::proto::VarType::INT64},
        {ngraph::element::boolean, framework::proto::VarType::BOOL}};
std::vector<std::string> NgraphEngine::feed_vars = {};
std::vector<std::string> NgraphEngine::fetch_vars = {};
framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
std::unordered_map<std::string,
                   std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
    NgraphEngine::t_in_cache_ = {};
std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ = std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
ngraph::runtime::Backend::create("CPU"); ngraph::runtime::Backend::create("CPU");
static std::vector<std::vector<int>> NgraphOpIntervals( static std::vector<std::vector<int>> NgraphOpIntervals(
framework::BlockDesc* block) { std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
NgraphEngine::feed_vars.clear();
NgraphEngine::fetch_vars.clear();
std::vector<std::vector<int>> intervals; std::vector<std::vector<int>> intervals;
auto ops = block->AllOps();
int size = ops.size(); int size = ops->size();
int left = 0; int left = 0;
while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { while (left < size && ops->at(left)->Type() != framework::kFeedOpType) {
++left; ++left;
} }
if (left == size) { if (left == size) {
return intervals; return intervals;
} }
while (left < size && ops.at(left)->Type() == framework::kFeedOpType) {
while (left < size && ops->at(left)->Type() == framework::kFeedOpType) {
for (auto& var_name_item : ops->at(left)->Outputs()) {
for (auto& var_name : var_name_item.second) {
NgraphEngine::feed_vars.emplace_back(var_name);
}
}
++left; ++left;
} }
int right = left; int right = left;
while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { while (right < size && ops->at(right)->Type() != framework::kFetchOpType) {
++right; ++right;
} }
if (right == size) { if (right == size) {
...@@ -87,85 +117,124 @@ static std::vector<std::vector<int>> NgraphOpIntervals( ...@@ -87,85 +117,124 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
} }
if (left >= right) return intervals; if (left >= right) return intervals;
int index = right;
while (index < size && ops->at(index)->Type() == framework::kFetchOpType) {
for (auto& var_name_item : ops->at(index)->Inputs()) {
for (auto& var_name : var_name_item.second) {
NgraphEngine::fetch_vars.emplace_back(var_name);
}
}
++index;
}
// (left, right - 1) represents indices between feed and fetch // (left, right - 1) represents indices between feed and fetch
int pivot = left; int pivot = left;
while (pivot < right) { while (pivot < right) {
auto op_type = ops.at(pivot)->Type(); auto op_type = ops->at(pivot)->Type();
if (NgraphBridge::isRegister(op_type)) { if (NgraphBridge::isRegister(op_type)) {
++pivot; ++pivot;
} else { } else {
int start = pivot, end = start; int start = pivot, end = start;
while (pivot < right && while (pivot < right &&
(!NgraphBridge::isRegister(ops.at(pivot)->Type()))) { (!NgraphBridge::isRegister(ops->at(pivot)->Type()))) {
++pivot; ++pivot;
++end; ++end;
} }
std::vector<int> interval = {start, end}; std::vector<int> interval = {start, end};
intervals.push_back(interval); intervals.emplace_back(interval);
} }
} // end while } // end while
return intervals; return intervals;
} }
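NgraphOpIntervals above scans the block once: it skips the leading feed ops (recording their output names), locates the trailing fetch ops (recording their input names), and between them collects every maximal run of ops for which NgraphBridge::isRegister returns false. That run-collection step reduces to a generic scan, sketched below; CollectRuns and pred are illustrative names, not part of the codebase.

#include <functional>
#include <vector>

// Collect maximal [start, end) runs of indices in [left, right) for which
// `pred` holds -- the same shape as the pivot loop in NgraphOpIntervals.
std::vector<std::vector<int>> CollectRuns(int left, int right,
                                          const std::function<bool(int)>& pred) {
  std::vector<std::vector<int>> runs;
  for (int i = left; i < right;) {
    if (!pred(i)) {
      ++i;
      continue;
    }
    int start = i;
    while (i < right && pred(i)) ++i;
    runs.push_back({start, i});
  }
  return runs;
}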
static void SubstituteNgraphOp(framework::BlockDesc* block, static void SubstituteNgraphOp(
std::string block_str, std::vector<std::unique_ptr<framework::OperatorBase>>* ops,
std::vector<int> interval) { std::string engine_key, std::string block_str, std::vector<int> interval) {
framework::ProgramDesc program; framework::OpDesc ng_op_desc(nullptr);
block->RemoveOp(interval.at(0), interval.at(1)); ng_op_desc.SetType("ngraph_engine");
auto* ng_op = block->InsertOp(interval.at(0)); ng_op_desc.SetAttr("interval", interval);
ng_op->SetType("ngraph_engine"); ng_op_desc.SetAttr("engine_key", engine_key);
ng_op->SetAttr("interval", interval); ng_op_desc.SetAttr("graph", block_str);
ng_op->SetAttr("graph", block_str);
ops->erase(ops->begin() + interval[0], ops->begin() + interval[1]);
ops->insert(ops->begin() + interval[0],
framework::OpRegistry::CreateOp(ng_op_desc));
} }
// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 std::string SerializedBlock(const std::vector<framework::OpDesc*>& op_descs) {
void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { framework::proto::BlockDesc block_proto;
#ifdef PADDLE_WITH_NGRAPH framework::BlockDesc block_desc(nullptr, &block_proto);
VLOG(4) << "use_ngraph=True"; block_desc.Proto()->set_parent_idx(-1);
for (size_t bid = 0; bid < program.Size(); ++bid) { block_desc.Proto()->set_idx(0);
// TODO(baojun-nervana): Remove the const_cast
auto* block = for (auto* op_desc : op_descs) {
const_cast<framework::ProgramDesc&>(program).MutableBlock(bid); auto* op = block_desc.AppendOp();
std::string block_str = block->Proto()->SerializeAsString(); *op->Proto() = *op_desc->Proto();
auto intervals = NgraphOpIntervals(block);
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
SubstituteNgraphOp(block, block_str, *it);
} }
return block_desc.Proto()->SerializeAsString();
}
std::string GenerateEngineKey(const framework::BlockDesc& bdesc) {
framework::proto::BlockDesc block_proto;
framework::BlockDesc block_desc(nullptr, &block_proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
for (auto& op_desc : bdesc.AllOps()) {
auto* op = block_desc.AppendOp();
*op->Proto() = *op_desc->Proto();
}
auto engine_key = std::to_string(
std::hash<std::string>()(block_desc.Proto()->SerializeAsString()));
return engine_key;
}
std::string GenerateEngineKey(const std::vector<std::string>& engine_inputs,
const std::vector<std::string>& engine_outputs,
int size) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
}
for (auto name : engine_outputs) {
engine_hash_key += name;
}
engine_hash_key += std::to_string(size);
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
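Both GenerateEngineKey overloads above derive a compact key by hashing one concatenated string. Note that plain concatenation of names means, for example, {"ab", "c"} and {"a", "bc"} yield the same raw string; a separator between names would avoid that, though the code here accepts the (remote) collision risk. A standalone sketch of the same scheme, with a separator added as a safeguard (the separator is this sketch's addition, not the diff's):

#include <functional>
#include <string>
#include <vector>

// Hash a list of names plus a size into a short string key, mirroring
// GenerateEngineKey; the '\x01' separator is an added safeguard.
std::string MakeEngineKey(const std::vector<std::string>& inputs,
                          const std::vector<std::string>& outputs, int size) {
  std::string raw;
  for (const auto& name : inputs) raw += name + '\x01';
  for (const auto& name : outputs) raw += name + '\x01';
  raw += std::to_string(size);
  return std::to_string(std::hash<std::string>()(raw));
}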
void NgraphEngine::FuseNgraphOps(
const framework::BlockDesc& block_desc,
std::vector<std::unique_ptr<framework::OperatorBase>>* ops) {
NgraphEngine::p_bdesc = &block_desc;
auto intervals = NgraphOpIntervals(ops);
std::string engine_key =
GenerateEngineKey(feed_vars, fetch_vars, ops->size());
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
SubstituteNgraphOp(ops, engine_key, "", *it);
} }
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
} }
NgraphEngine::NgraphEngine(const framework::Scope& scope, NgraphEngine::NgraphEngine(const framework::Scope& scope,
const platform::Place& place, const platform::Place& place,
const std::string& serialized_graph, const framework::ExecutionContext& ctx)
const std::vector<int>& interval)
: scope_(scope), place_(place) { : scope_(scope), place_(place) {
std::string serialized_graph = ctx.Attr<std::string>("graph");
auto interval = ctx.Attr<std::vector<int>>("interval");
std::string engine_key = ctx.Attr<std::string>("engine_key");
var_in_node_map_ = std::make_shared< var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>(); std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared< var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>(); std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + serialized_graph; GetNgFunction(engine_key, interval);
framework::proto::BlockDesc bdesc;
bdesc.ParseFromString(serialized_graph);
framework::BlockDesc block(nullptr, &bdesc);
Prepare(block, interval);
BuildNgIO();
GetNgFunction();
} }
void NgraphEngine::Prepare(const framework::BlockDesc& block, void NgraphEngine::Prepare(const std::vector<int>& interval) {
const std::vector<int>& interval) { for (auto& var : p_bdesc->AllVars()) {
for (auto& var : block.AllVars()) {
if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
var->GetType() == framework::proto::VarType::LOD_TENSOR || var->GetType() == framework::proto::VarType::LOD_TENSOR ||
var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
...@@ -192,108 +261,57 @@ void NgraphEngine::Prepare(const framework::BlockDesc& block, ...@@ -192,108 +261,57 @@ void NgraphEngine::Prepare(const framework::BlockDesc& block,
} }
} }
auto ops_desc = block.AllOps(); std::vector<paddle::framework::OpDesc*> ops_desc;
int idx = interval[0]; for (auto op_desc : p_bdesc->AllOps()) {
while (idx < interval[1]) { ops_desc.emplace_back(op_desc);
auto op_desc = ops_desc.at(idx);
auto op = framework::OpRegistry::CreateOp(*op_desc);
fused_ops_.push_back(std::move(op));
++idx;
} }
while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { for (auto op_desc : ops_desc) {
auto op_desc = ops_desc.at(idx); if (op_desc->Type().find("_grad") != std::string::npos) {
for (auto& var_name_item : op_desc->Inputs()) { this->is_test_ = false;
for (auto& var_name : var_name_item.second) { break;
post_op_inputs_.insert(var_name);
}
}
++idx;
} }
while (idx < static_cast<int>(ops_desc.size()) &&
ops_desc.at(idx)->Type() == framework::kFetchOpType) {
std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0];
fetches_.insert(fetch_target_name);
++idx;
} }
if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && if (interval[0] > 0 &&
ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType &&
interval[1] < static_cast<int>(ops_desc.size()) &&
ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
ng_op_state_ = OpState::FULL; this->op_state_ = OpState::FULL;
} }
for (auto* op_desc : ops_desc) { if (this->op_state_ == OpState::FULL) {
if (op_desc->Type().find("_grad") != std::string::npos) { this->op_state_ = this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN;
ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN } else {
: OpState::PARTIAL_TRAIN; this->op_state_ =
break; this->is_test_ ? OpState::PARTIAL_TEST : OpState::PARTIAL_TRAIN;
}
} }
if (ng_op_state_ != OpState::FULL_TRAIN && int idx = interval[0];
ng_op_state_ != OpState::PARTIAL_TRAIN) { while (idx < interval[1]) {
ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST this->fused_ops_.emplace_back(
: OpState::PARTIAL_TEST; framework::OpRegistry::CreateOp(*(ops_desc[idx])));
++idx;
} }
} while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
auto op_desc = ops_desc.at(idx);
void NgraphEngine::GetNgInputShape( for (auto& var_name_item : op_desc->Inputs()) {
std::shared_ptr<framework::OperatorBase> op) {
framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
op->RuntimeInferShape(scope_, place_, ctx);
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name); this->post_op_inputs_.insert(var_name);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
// auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var));
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
} }
} }
++idx;
} }
}
void NgraphEngine::BuildNgNodes() { BuildNgIO(ops_desc, interval);
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Outputs()) {
for (auto& var_name : var_name_item.second) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_.at(var_name);
auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
}
}
}
NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) {
ngb.BuildNgNode(op);
}
} }
void NgraphEngine::BuildNgIO() { void NgraphEngine::BuildNgIO(const std::vector<framework::OpDesc*>& ops_desc,
const std::vector<int>& interval) {
std::unordered_set<std::string> inputs; std::unordered_set<std::string> inputs;
std::unordered_set<std::string> outputs; std::unordered_set<std::string> outputs;
for (int i = interval[0]; i < interval[1]; ++i) {
for (auto& op : fused_ops_) { auto op = ops_desc[i];
for (auto& var_name_item : op->Inputs()) { for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
inputs.insert(var_name); inputs.insert(var_name);
...@@ -302,15 +320,11 @@ void NgraphEngine::BuildNgIO() { ...@@ -302,15 +320,11 @@ void NgraphEngine::BuildNgIO() {
std::find(var_in_.begin(), var_in_.end(), var_name) == std::find(var_in_.begin(), var_in_.end(), var_name) ==
var_in_.end()) { var_in_.end()) {
// fill var_in here to keep lhs and rhs order // fill var_in here to keep lhs and rhs order
var_in_.push_back(var_name); this->var_in_.emplace_back(var_name);
} }
} }
} }
if (op->Type() != "fill_constant") {
GetNgInputShape(op);
}
for (auto& var_name_item : op->Outputs()) { for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet", "op %s has more than 1 output - Not handling yet",
...@@ -322,172 +336,278 @@ void NgraphEngine::BuildNgIO() { ...@@ -322,172 +336,278 @@ void NgraphEngine::BuildNgIO() {
} }
// var_out.clear(); // var_out.clear();
for (auto& op : fused_ops_) { for (int i = interval[0]; i < interval[1]; ++i) {
auto op = ops_desc[i];
for (auto& var_name_item : op->Outputs()) { for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet", "op %s has more than 1 output - Not handling yet",
op->Type()); op->Type());
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) { switch (this->op_state_) {
case OpState::PARTIAL_TEST: case OpState::PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) { find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
var_out_.push_back(var_name); fetch_vars.end()) {
this->var_out_.emplace_back(var_name);
} }
break; break;
case OpState::FULL_TEST: case OpState::FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) { if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
var_out_.push_back(var_name); fetch_vars.end()) {
this->var_out_.emplace_back(var_name);
} }
break; break;
case OpState::PARTIAL_TRAIN: case OpState::PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() || if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
fetch_vars.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() || post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) { persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name); this->var_out_.emplace_back(var_name);
} }
break; break;
case OpState::FULL_TRAIN: case OpState::FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() || if (find(fetch_vars.begin(), fetch_vars.end(), var_name) !=
fetch_vars.end() ||
persistables_.find(var_name) != persistables_.end()) { persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name); this->var_out_.emplace_back(var_name);
} }
break; break;
default: default:
var_out_.push_back(var_name); this->var_out_.emplace_back(var_name);
} }
} }
} }
} }
for (size_t i = 0; i < var_in_.size(); ++i) {
auto var_name = var_in_[i];
if (persistables_.find(var_name) == persistables_.end()) {
var_in_updates_.emplace_back(i);
}
}
} }
void NgraphEngine::BuildNgFunction() { void NgraphEngine::GetNgInputShape() {
for (auto& var_name : var_in_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
auto ng_type = var_type_map_[var_name];
auto prm = std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
}
void NgraphEngine::BuildNgNodes() {
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Outputs()) {
for (auto& var_name : var_name_item.second) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_[var_name];
auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
}
}
}
NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) {
ngb.BuildNgNode(op);
}
}
void NgraphEngine::RunInferShape() {
for (auto& op : fused_ops_) {
framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
op->RuntimeInferShape(scope_, place_, ctx);
}
}
void NgraphEngine::BuildNgFunction(const std::vector<int>& interval) {
Prepare(interval);
RunInferShape();
GetNgInputShape();
BuildNgNodes(); BuildNgNodes();
ngraph_function_ = nullptr; ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs; ngraph::NodeVector func_outputs;
ngraph::ParameterVector func_inputs; ngraph::ParameterVector func_inputs;
for (auto& vo : var_out_) { for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo)); func_outputs.emplace_back(var_node_map_->at(vo));
} }
for (auto& vi : var_in_) { for (auto& vi : var_in_) {
std::shared_ptr<ngraph::op::Parameter> prm = std::shared_ptr<ngraph::op::Parameter> prm =
std::dynamic_pointer_cast<ngraph::op::Parameter>( std::dynamic_pointer_cast<ngraph::op::Parameter>(
var_in_node_map_->at(vi)); var_in_node_map_->at(vi));
func_inputs.push_back(prm); func_inputs.emplace_back(prm);
} }
ngraph_function_ = ngraph_function_ =
std::make_shared<ngraph::Function>(func_outputs, func_inputs); std::make_shared<ngraph::Function>(func_outputs, func_inputs);
} }
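BuildNgFunction ends by wrapping the collected output nodes and input parameters into an ngraph::Function, exactly as a hand-written nGraph program would. A toy f(a, b) = a + b built with the same constructor, assuming the nGraph API version this file targets (ngraph.hpp already included):

// Two 2x2 float parameters, one Add node, wrapped as a callable function.
auto a = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32,
                                                 ngraph::Shape{2, 2});
auto b = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32,
                                                 ngraph::Shape{2, 2});
auto sum = std::make_shared<ngraph::op::Add>(a, b);  // one output node
auto f = std::make_shared<ngraph::Function>(ngraph::NodeVector{sum},
                                            ngraph::ParameterVector{a, b});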
void NgraphEngine::GetNgFunction() { void NgraphEngine::GetNgFunction(std::string engine_key,
bool cache_on = true; const std::vector<int>& interval) {
if (cache_on) { bool use_cache = true;
std::string input_shape_str; if (use_cache) {
for (auto& var_name : var_in_) { this->func_cache_key_ = "";
auto shape = var_node_map_->at(var_name)->get_shape(); for (int i = 0; i < std::min(static_cast<int>(feed_vars.size()), 10); ++i) {
for (size_t i = 0; i < shape.size(); ++i) { auto* var = scope_.FindVar(feed_vars[i]);
input_shape_str += std::to_string(shape.at(i)); if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto dims = tensor_pd->dims();
for (int j = 0; j < dims.size(); ++j) {
func_cache_key_ += std::to_string(dims[j]);
} }
} }
func_cache_key_ = input_shape_str + func_cache_key_; }
if (func_cache_.find(func_cache_key_) != func_cache_.end()) { func_cache_key_ += std::to_string(interval[0]) + "_" +
ngraph_function_ = func_cache_.at(func_cache_key_); std::to_string(interval[1]) + engine_key;
func_cache_key_ = std::to_string(std::hash<std::string>()(func_cache_key_));
if (engine_cache.find(func_cache_key_) != engine_cache.end()) {
if (engine_cache[func_cache_key_].persistables.size() == 0) {
engine_cache.clear();
t_in_cache_.clear();
} else { } else {
BuildNgFunction(); auto var_name = engine_cache[func_cache_key_].persistables.begin();
func_cache_[func_cache_key_] = ngraph_function_; framework::Variable* var = scope_.FindVar(*var_name);
if (var != pre_var_ptr) {
engine_cache.clear();
t_in_cache_.clear();
}
pre_var_ptr = var;
}
}
if (engine_cache.find(func_cache_key_) == engine_cache.end()) {
BuildNgFunction(interval);
engine_cache[func_cache_key_].ngraph_function = this->ngraph_function_;
engine_cache[func_cache_key_].persistables = this->persistables_;
engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_;
engine_cache[func_cache_key_].var_in = this->var_in_;
engine_cache[func_cache_key_].var_out = this->var_out_;
engine_cache[func_cache_key_].is_test = this->is_test_;
} }
} else { } else {
BuildNgFunction(); BuildNgFunction(interval);
} }
} }
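GetNgFunction makes the cache key shape-sensitive: the dims of up to the first ten feed variables are folded into the key, so the same sub-graph compiles once per input-shape combination, while the pre_var_ptr check flushes both caches when the tracked persistable variable (and hence the scope) changes. The shape part of the key, isolated as a hedged sketch (feed_dims is a stand-in for the tensors fetched from the scope above):

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical shape-key builder: concatenate the dims of at most the first
// ten feed tensors, as the loop in GetNgFunction does.
std::string ShapeKey(const std::vector<std::vector<int64_t>>& feed_dims) {
  std::string key;
  int n = std::min(static_cast<int>(feed_dims.size()), 10);
  for (int i = 0; i < n; ++i)
    for (int64_t d : feed_dims[i]) key += std::to_string(d);
  return key;
}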
void NgraphEngine::Run(const framework::Scope& scope, void NgraphEngine::Run(const framework::Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in; std::shared_ptr<ngraph::Function> ng_func;
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out; const std::set<std::string>* p_persistables;
const std::vector<size_t>* p_var_in_updates;
const std::vector<std::string>* p_var_in;
const std::vector<std::string>* p_var_out;
bool is_test;
bool use_cache = true;
if (use_cache) {
PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(),
"Cannot find cached data to run ngraph function");
ng_func = engine_cache[func_cache_key_].ngraph_function;
p_persistables = &(engine_cache[func_cache_key_].persistables);
p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates);
p_var_in = &(engine_cache[func_cache_key_].var_in);
p_var_out = &(engine_cache[func_cache_key_].var_out);
is_test = engine_cache[func_cache_key_].is_test;
} else {
ng_func = ngraph_function_;
p_persistables = &this->persistables_;
p_var_in_updates = &this->var_in_updates_;
p_var_in = &this->var_in_;
p_var_out = &this->var_out_;
is_test = this->is_test_;
}
std::vector<std::shared_ptr<ngraph::runtime::Tensor>>* p_t_in;
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in = {};
auto m_parameters = ng_func->get_parameters();
auto m_results = ng_func->get_results();
if (is_test && use_cache &&
t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) {
p_t_in = &(t_in_cache_[func_cache_key_]);
for (size_t i = 0; i < p_var_in_updates->size(); ++i) {
int index = p_var_in_updates->at(i);
auto vi = p_var_in->at(index);
auto sp = m_parameters[index]->get_shape();
auto ng_type = m_parameters[index]->get_element_type();
std::shared_ptr<ngraph::runtime::Tensor> ti;
auto* var = scope.FindVar(vi);
if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
ti = backend_->create_tensor(ng_type, sp, pd_arr);
(*p_t_in)[index] = ti;
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
}
}
} else {
if (is_test && use_cache) {
p_t_in = &(t_in_cache_[func_cache_key_]);
} else {
p_t_in = &t_in;
}
for (size_t i = 0; i < var_in_.size(); ++i) { for (size_t i = 0; i < p_var_in->size(); ++i) {
auto vi = var_in_.at(i); auto vi = p_var_in->at(i);
auto sp = var_node_map_->at(vi)->get_shape(); auto sp = m_parameters[i]->get_shape();
auto ng_type = m_parameters[i]->get_element_type();
std::shared_ptr<ngraph::runtime::Tensor> ti; std::shared_ptr<ngraph::runtime::Tensor> ti;
auto* var = scope.FindVar(vi); auto* var = scope.FindVar(vi);
if (var && var->IsType<framework::LoDTensor>()) { if (var && var->IsType<framework::LoDTensor>()) {
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
"Ensure ngraph tensor layout align with paddle tensor"); "Ensure ngraph tensor layout align with paddle tensor");
auto ng_type = var_type_map_.at(vi); ti = backend_->create_tensor(ng_type, sp, pd_arr);
if (ng_type == ngraph::element::f32) {
auto pd_arr = tensor_pd->mutable_data<float>(place);
ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
} else if (ng_type == ngraph::element::i32) {
const int* arr = tensor_pd->data<int>();
ti = backend_->create_tensor(ngraph::element::i32, sp,
const_cast<int*>(arr));
} else if (ng_type == ngraph::element::i64) {
auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
} else {
PADDLE_THROW("Data type not handling for var %s", vi);
}
} else { } else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi); PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
} }
bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST ||
ng_op_state_ == OpState::FULL_TEST)
? true
: false;
bool is_persistable = bool is_persistable =
(persistables_.find(vi) != persistables_.end()) ? true : false; (p_persistables->find(vi) != p_persistables->end()) ? true : false;
if (is_test && is_persistable) { if (is_test && is_persistable) {
ti->set_stale(false); ti->set_stale(false);
} }
t_in.push_back(ti); (*p_t_in).emplace_back(ti);
}
} }
for (size_t i = 0; i < var_out_.size(); ++i) { std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out = {};
auto vo = var_out_[i]; for (size_t i = 0; i < p_var_out->size(); ++i) {
auto vo = p_var_out->at(i);
auto* var = scope.FindVar(vo); auto* var = scope.FindVar(vo);
std::shared_ptr<ngraph::runtime::Tensor> to;
if (var && var->IsType<framework::LoDTensor>()) { if (var && var->IsType<framework::LoDTensor>()) {
auto sp = m_results[i]->get_shape();
var->GetMutable<framework::LoDTensor>()->Resize(Shape2Ddim(sp));
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
auto dd = tensor_pd->dims(); auto ng_type = m_results[i]->get_element_type();
ngraph::Shape sp = Ddim2Shape(dd); void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]);
auto ng_type = var_type_map_.at(vo); std::shared_ptr<ngraph::runtime::Tensor> to =
if (ng_type == ngraph::element::f32) { backend_->create_tensor(ng_type, sp, pd_arr);
auto pd_arr = tensor_pd->mutable_data<float>(place); t_out.emplace_back(to);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else if (ng_type == ngraph::element::i64) {
auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else if (ng_type == ngraph::element::i32) {
auto pd_arr = tensor_pd->mutable_data<int>(place);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
to = backend_->create_tensor(ng_type, sp, pd_arr);
} else {
PADDLE_THROW("Data type not handled in for var %s", vo);
}
t_out.push_back(to);
} else { } else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vo); PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
} }
} }
auto handle = backend_->compile(ngraph_function_); auto handle = backend_->compile(ng_func);
handle->call_with_validate(t_out, t_in); handle->call_with_validate(t_out, *p_t_in);
} // NgraphEngine::Run } // NgraphEngine::Run
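Run() finishes by compiling the cached function on the CPU backend and executing it against the wrapped input/output tensors. Continuing the toy f(a, b) = a + b sketch from BuildNgFunction above, the same compile/call pair looks like this (same nGraph API assumption; tensor contents would be filled via the tensors' write() methods):

auto backend = ngraph::runtime::Backend::create("CPU");
auto t_a = backend->create_tensor(ngraph::element::f32, ngraph::Shape{2, 2});
auto t_b = backend->create_tensor(ngraph::element::f32, ngraph::Shape{2, 2});
auto t_r = backend->create_tensor(ngraph::element::f32, ngraph::Shape{2, 2});
auto handle = backend->compile(f);  // f from the earlier sketch
handle->call_with_validate({t_r}, {t_a, t_b});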
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#include <memory>
#include <set>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
...@@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */ ...@@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */
UNKNOWN /* Output all for debug purpose */ UNKNOWN /* Output all for debug purpose */
}; };
// cached engine state reused across runs
struct EngineCache {
std::shared_ptr<ngraph::Function> ngraph_function;
std::set<std::string> persistables;
std::vector<std::string> var_in;
std::vector<std::string> var_out;
std::vector<size_t> var_in_updates;
bool is_test = true;
};
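EngineCache bundles everything Run() needs to replay a compiled sub-graph: the function itself plus the input/output bookkeeping captured at build time. A hedged sketch of the lookup-or-build flow around it, mirroring ngraph_engine.cc with details elided (engine_cache and func_cache_key_ are the members declared in NgraphEngine below):

auto it = engine_cache.find(func_cache_key_);
if (it == engine_cache.end()) {  // miss: build once and memoize
  BuildNgFunction(interval);
  EngineCache entry;
  entry.ngraph_function = ngraph_function_;
  entry.persistables = persistables_;
  entry.var_in = var_in_;
  entry.var_out = var_out_;
  entry.var_in_updates = var_in_updates_;
  entry.is_test = is_test_;
  engine_cache[func_cache_key_] = entry;
}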
// perform graph build through bridge and execute computation // perform graph build through bridge and execute computation
class NgraphEngine { class NgraphEngine {
public: public:
explicit NgraphEngine(const framework::Scope& scope, explicit NgraphEngine(const framework::Scope& scope,
const platform::Place& place, const platform::Place& place,
const std::string& serialized_graph, const framework::ExecutionContext& ctx);
const std::vector<int>& interval);
void Run(const framework::Scope& scope, const platform::Place& place) const; void Run(const framework::Scope& scope, const platform::Place& place) const;
static void EnableNgraph(const framework::ProgramDesc& program); static const framework::BlockDesc* p_bdesc;
static std::vector<std::string> feed_vars, fetch_vars;
static void FuseNgraphOps(
const framework::BlockDesc& prog,
std::vector<std::unique_ptr<framework::OperatorBase>>* ops);
private: private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>> static std::unordered_map<std::string, EngineCache> engine_cache;
func_cache_; static std::unordered_map<
std::string, std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
t_in_cache_;
static framework::Variable* pre_var_ptr;
const framework::Scope& scope_; const framework::Scope& scope_;
const platform::Place& place_; const platform::Place& place_;
std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_; std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_; std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_; std::set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_; std::unordered_set<std::string> post_op_inputs_;
OpState ng_op_state_ = OpState::UNKNOWN; OpState op_state_ = OpState::UNKNOWN;
bool is_test_{true};
std::string func_cache_key_; std::string func_cache_key_;
// ngraph backend eg. CPU // ngraph backend eg. CPU
...@@ -66,6 +90,8 @@ class NgraphEngine { ...@@ -66,6 +90,8 @@ class NgraphEngine {
std::vector<std::string> var_in_; std::vector<std::string> var_in_;
// var_name of outputs from fetch in order // var_name of outputs from fetch in order
std::vector<std::string> var_out_; std::vector<std::string> var_out_;
// non-persistable var_in
std::vector<size_t> var_in_updates_;
// map input vars to nodes // map input vars to nodes
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
...@@ -74,20 +100,23 @@ class NgraphEngine { ...@@ -74,20 +100,23 @@ class NgraphEngine {
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_; var_node_map_;
// prepare info for nraph engine // prepare info needed by the ngraph engine
void Prepare(const framework::BlockDesc& block, void Prepare(const std::vector<int>& interval);
// get ngraph engine input and output list
void BuildNgIO(const std::vector<framework::OpDesc*>& op_descs,
const std::vector<int>& interval); const std::vector<int>& interval);
// get ngraph input and define ngraph input parameters // get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op); void GetNgInputShape();
// Call ngraph bridge to map ops // Call ngraph bridge to map ops
void BuildNgNodes(); void BuildNgNodes();
// get the ngraph input and output var list // run paddle RuntimeInferShape to get the tensor shape
void BuildNgIO(); void RunInferShape();
// build ngraph function call // build ngraph function call
void BuildNgFunction(); void BuildNgFunction(const std::vector<int>& interval);
// Check cache for ngraph function or otherwise build the function // Check cache for ngraph function or otherwise build the function
void GetNgFunction(); void GetNgFunction(std::string engine_key, const std::vector<int>& interval);
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
...@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Xs", "A list of inputs.").AsDispensable(); AddInput("Xs", "A list of inputs.").AsDispensable();
AddOutput("Ys", "A list of outputs").AsDispensable(); AddOutput("Ys", "A list of outputs").AsDispensable();
AddAttr<std::string>("graph", "the graph."); AddAttr<std::string>("graph", "the graph.");
AddAttr<std::string>("engine_key", "the engine hash key.");
AddAttr<std::vector<int>>("interval", "op interval supported by ngraph"); AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
AddComment("ngraph engine operator."); AddComment("ngraph engine operator.");
} }
......
...@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> { ...@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope(); auto& scope = ctx.scope();
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
std::string serialized_graph = ctx.Attr<std::string>("graph");
auto interval = ctx.Attr<std::vector<int>>("interval");
NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); NgraphEngine ngraph_engine(scope, place, ctx);
ngraph_engine.Run(scope, place); ngraph_engine.Run(scope, place);
} }
}; };
......
...@@ -48,7 +48,8 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -48,7 +48,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
T *out_data = param_out->mutable_data<T>(ctx.GetPlace()); T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
auto sgd = auto sgd =
jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr); jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr); sgd(lr, param_data, grad_data, &rows_idx, out_data, &attr);
} else if (grad_var->IsType<framework::SelectedRows>()) { } else if (grad_var->IsType<framework::SelectedRows>()) {
// TODO(qijun): In Sparse SGD operator, in-place update is enforced. // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
...@@ -82,7 +83,8 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -82,7 +83,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
auto sgd = auto sgd =
jit::Get<jit::kSgd, jit::SgdTuples<T>, platform::CPUPlace>(attr); jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, rows_data, out_data, &attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_THROW("Unsupported Variable Type of Grad");
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace reader { namespace reader {
...@@ -87,12 +88,15 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -87,12 +88,15 @@ void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// NOTE(liangdun): using async copy instead of TensorCopySync // NOTE(liangdun): using async copy instead of TensorCopySync
// TensorCopySync would block other stream // TensorCopySync would block other streams, because TensorCopySync
// issues the copy command on the default stream, so two commands
// from different streams cannot run concurrently.
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device); platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0)); PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
TensorVec &gpu = gpu_buffer_[i]; TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size()); gpu.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) { for (size_t i = 0; i < cpu.size(); ++i) {
gpu[i].Resize(cpu[i].dims()); gpu[i].Resize(cpu[i].dims());
gpu[i].set_layout(cpu[i].layout()); gpu[i].set_layout(cpu[i].layout());
...@@ -112,6 +116,7 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -112,6 +116,7 @@ void BufferedReader::ReadAsync(size_t i) {
} else { } else {
// if cpu place is not pinned, async copy is slower than sync copy, // if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead. // so we use sync copy instead.
// TODO(zcd): The default stream should not be used here.
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr, memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size, boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
0); 0);
......
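The note above is the crux of this hunk: TensorCopySync issues its copy on the default (legacy) stream, which synchronizes with every other stream, so copies from different readers serialize. The replacement copies on the reader's own stream after waiting on the producer's event. A reduced CUDA sketch of that pattern (error handling and buffer management elided; names are illustrative):

#include <cuda_runtime.h>

// Host-to-device copy on a dedicated stream so it can overlap with other
// streams' work. Overlap additionally requires `src` to be pinned memory;
// from pageable memory the driver stages the copy, which is why the code
// above falls back to a synchronous copy for non-pinned CPU places.
void AsyncH2DCopy(void* dst, const void* src, size_t bytes,
                  cudaStream_t stream, cudaEvent_t producer_done) {
  cudaStreamWaitEvent(stream, producer_done, 0);  // order after the producer
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream);
}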
...@@ -282,7 +282,9 @@ class RecurrentOp : public RecurrentBase { ...@@ -282,7 +282,9 @@ class RecurrentOp : public RecurrentBase {
// Every inputs are linked now, execute! // Every inputs are linked now, execute!
executor.Run(*program, &cur_scope, block->ID(), executor.Run(*program, &cur_scope, block->ID(),
false /*create_local_scope*/); false /*create_local_scope*/, true /*create_vars*/,
std::vector<std::string>() /*skip_ref_cnt_vars*/,
true /*force_disable_gc*/);
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool &pool =
...@@ -398,7 +400,9 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -398,7 +400,9 @@ class RecurrentGradOp : public RecurrentBase {
VLOG(5) << "Recurrent memory linking finished "; VLOG(5) << "Recurrent memory linking finished ";
// Run step block with cur_scope // Run step block with cur_scope
executor.Run(*program, &cur_scope, block->ID(), executor.Run(*program, &cur_scope, block->ID(),
false /*create_local_scope*/); false /*create_local_scope*/, true /*create_vars*/,
std::vector<std::string>() /*skip_ref_cnt_vars*/,
true /*force_disable_gc*/);
VLOG(5) << "executor.Run finished "; VLOG(5) << "executor.Run finished ";
......
...@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { ...@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
if (ctx->IsRuntime()) {
return;
}
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("X"), ctx->HasInput("X"),
"Input(X) of SequecceEnumerate operator should not be null."); "Input(X) of SequecceEnumerate operator should not be null.");
...@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
}); });
AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.") AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
.SetDefault(0); .SetDefault(0);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Sequence Enumerate Operator. Sequence Enumerate Operator.
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <deque> #include <deque>
#include <forward_list> #include <forward_list>
...@@ -30,6 +29,7 @@ limitations under the License. */ ...@@ -30,6 +29,7 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -222,19 +222,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -222,19 +222,24 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
} }
case CUPTI_ACTIVITY_KIND_DRIVER: { case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record); auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0) if (api->start != 0 && api->end != 0) {
// -1 device id represents CUDA api call // -1 device id represents ActiveKind api call
tracer->AddCPURecords( tracer->AddActiveKindRecords(
DriverKind(api->cbid), api->start, api->end, -1, DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId)); GetThreadIdFromSystemThreadId(api->threadId),
api->correlationId);
}
break; break;
} }
case CUPTI_ACTIVITY_KIND_RUNTIME: { case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record); auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0) if (api->start != 0 && api->end != 0) {
tracer->AddCPURecords( // -1 device id represents ActiveKind api call
tracer->AddActiveKindRecords(
RuntimeKind(api->cbid), api->start, api->end, -1, RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId)); GetThreadIdFromSystemThreadId(api->threadId),
api->correlationId);
}
break; break;
} }
default: { break; } default: { break; }
...@@ -313,6 +318,43 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -313,6 +318,43 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &alloc_in,
const std::string &free_in, int64_t thread_id) {
if (0 == start_ns || 0 == end_ns) {
VLOG(3) << alloc_in << ", " << free_in << " cannot be traced.";
return;
}
thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
nullptr;
if (local_mem_info_record == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
mem_info_record_.emplace_front();
local_mem_info_record = &mem_info_record_.front();
}
local_mem_info_record->emplace_front(MemInfoRecord{
start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
}
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
}
thread_local std::forward_list<ActiveKindRecord>
*local_active_kind_records = nullptr;
if (local_active_kind_records == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
active_kind_records_.emplace_front();
local_active_kind_records = &active_kind_records_.front();
}
// lock is not needed; only one thread calls this function.
local_active_kind_records->push_front(ActiveKindRecord{
anno, start_ns, end_ns, device_id, thread_id, correlation_id});
}
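AddMemInfoRecord and AddActiveKindRecords share a trick worth calling out: a thread_local pointer into a shared list of per-thread sub-lists. The mutex is taken exactly once per thread, to splice a fresh sub-list in; every later append touches only that thread's own sub-list and needs no lock. A self-contained sketch, valid under the same single-instance assumption as DeviceTracerImpl:

#include <forward_list>
#include <mutex>
#include <utility>

// One mutex acquisition per thread (registration); appends are lock-free.
// Correct only for a single shared instance, since the thread_local pointer
// is per-(thread, function), not per-object.
template <typename Record>
class PerThreadRecords {
 public:
  void Add(Record r) {
    thread_local std::forward_list<Record>* local = nullptr;
    if (local == nullptr) {
      std::lock_guard<std::mutex> guard(mu_);
      lists_.emplace_front();
      local = &lists_.front();
    }
    local->emplace_front(std::move(r));
  }

 private:
  std::mutex mu_;
  std::forward_list<std::forward_list<Record>> lists_;
};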
void AddKernelRecords(std::string name, uint64_t start, uint64_t end, void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t device_id, int64_t stream_id, int64_t device_id, int64_t stream_id,
uint32_t correlation_id) { uint32_t correlation_id) {
...@@ -355,6 +397,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -355,6 +397,7 @@ class DeviceTracerImpl : public DeviceTracer {
} }
const std::vector<int> cbids { const std::vector<int> cbids {
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
...@@ -385,6 +428,8 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -385,6 +428,8 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_.clear(); correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear(); for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear(); for (auto &tmp : cpu_records_) tmp.clear();
for (auto &tmp : mem_info_record_) tmp.clear();
for (auto &tmp : active_kind_records_) tmp.clear();
} }
void GenEventKernelCudaElapsedTime() { void GenEventKernelCudaElapsedTime() {
...@@ -415,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -415,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
proto::Profile profile_pb; proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
if (correlations_.empty()) if (correlations_.empty()) {
for (auto &tmp : correlations_pairs) for (auto &tmp : correlations_pairs) {
for (auto &pair : tmp) correlations_[pair.first] = pair.second; for (auto &pair : tmp) correlations_[pair.first] = pair.second;
}
}
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
...@@ -437,7 +485,8 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -437,7 +485,8 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_)
for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) { for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU); event->set_type(proto::Event::CPU);
...@@ -447,6 +496,25 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -447,6 +496,25 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.thread_id); event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
}
for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
} else {
event->set_name(r.name);
}
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
}
miss = find = 0; miss = find = 0;
for (const MemRecord &r : mem_records_) { for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -467,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -467,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
for (auto &tmp : mem_info_record_) {
for (const auto &r : tmp) {
auto *event = profile_pb.add_mem_events();
event->set_device_id(0);
if (platform::is_cpu_place(r.place)) {
event->set_place(proto::MemEvent::CPUPlace);
} else if (platform::is_gpu_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPlace);
event->set_device_id(
boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
} else if (platform::is_cuda_pinned_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else {
PADDLE_THROW("The current place is not supported.");
}
event->set_alloc_in(r.alloc_in);
event->set_free_in(r.free_in);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_bytes(r.bytes);
event->set_thread_id(r.thread_id);
}
}
std::ofstream profile_f; std::ofstream profile_f;
profile_f.open(profile_path, profile_f.open(profile_path,
std::ios::out | std::ios::trunc | std::ios::binary); std::ios::out | std::ios::trunc | std::ios::binary);
...@@ -510,6 +603,8 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -510,6 +603,8 @@ class DeviceTracerImpl : public DeviceTracer {
std::forward_list<KernelRecord> kernel_records_; std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_; std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>> std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs; correlations_pairs;
std::unordered_map<uint32_t, Event *> correlations_; std::unordered_map<uint32_t, Event *> correlations_;
...@@ -531,7 +626,7 @@ Event *CurAnnotation() { ...@@ -531,7 +626,7 @@ Event *CurAnnotation() {
return annotation_stack.back(); return annotation_stack.back();
} }
std::string CurAnnotationName() { std::string CurAnnotationName() {
if (annotation_stack.empty()) return ""; if (annotation_stack.empty()) return "Unknown";
return annotation_stack.back()->name(); return annotation_stack.back()->name();
} }
...@@ -613,6 +708,7 @@ void initCuptiCbidStr() { ...@@ -613,6 +708,7 @@ void initCuptiCbidStr() {
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
...@@ -47,6 +48,7 @@ class DeviceTracer { ...@@ -47,6 +48,7 @@ class DeviceTracer {
int64_t stream_id; int64_t stream_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
struct CPURecord { struct CPURecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -54,6 +56,7 @@ class DeviceTracer { ...@@ -54,6 +56,7 @@ class DeviceTracer {
int64_t device_id; int64_t device_id;
int64_t thread_id; int64_t thread_id;
}; };
struct MemRecord { struct MemRecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -64,6 +67,25 @@ class DeviceTracer { ...@@ -64,6 +67,25 @@ class DeviceTracer {
uint64_t bytes; uint64_t bytes;
}; };
struct MemInfoRecord {
uint64_t start_ns;
uint64_t end_ns;
size_t bytes;
Place place;
int64_t thread_id;
std::string alloc_in;
std::string free_in;
};
struct ActiveKindRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
int64_t device_id;
int64_t thread_id;
uint32_t correlation_id;
};
virtual ~DeviceTracer() {} virtual ~DeviceTracer() {}
// Needs to be called once before use. // Needs to be called once before use.
virtual void Enable() = 0; virtual void Enable() = 0;
...@@ -85,6 +107,16 @@ class DeviceTracer { ...@@ -85,6 +107,16 @@ class DeviceTracer {
virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0; int64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id,
uint32_t correlation_id) = 0;
virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
size_t bytes, const Place& place,
const std::string& alloc_in,
const std::string& free_in,
int64_t thread_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
......
...@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif #endif
#include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -64,5 +66,36 @@ class Event { ...@@ -64,5 +66,36 @@ class Event {
#endif #endif
#endif #endif
}; };
class MemEvent {
public:
MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
Place place, int64_t thread_id, const std::string& annotation)
: type_(type),
start_ns_(start_ns),
end_ns_(end_ns),
bytes_(bytes),
place_(place),
thread_id_(thread_id),
annotation_(annotation) {}
const EventType& type() const { return type_; }
uint64_t start_ns() const { return start_ns_; }
uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; }
Place place() const { return place_; }
int64_t thread_id() const { return thread_id_; }
const std::string& annotation() const { return annotation_; }
private:
EventType type_;
uint64_t start_ns_ = 0;
uint64_t end_ns_ = 0;
size_t bytes_;
Place place_;
int64_t thread_id_;
std::string annotation_;
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
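A minimal sketch of the new MemEvent record in isolation: a kPushRange event marks the allocation side, and the accessors expose everything the memory report later aggregates (values illustrative).

using paddle::platform::EventType;
using paddle::platform::MemEvent;

MemEvent ev(EventType::kPushRange, /*start_ns=*/100, /*end_ns=*/900,
            /*bytes=*/1 << 20, paddle::platform::CPUPlace(),
            /*thread_id=*/0, /*annotation=*/"batch_norm");
double lifetime_ms = (ev.end_ns() - ev.start_ns()) / 1e6;  // 0.0008 ms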
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -21,6 +20,8 @@ limitations under the License. */ ...@@ -21,6 +20,8 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <random> #include <random>
#include <string> #include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); ...@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct EventList;
static int64_t profiler_lister_id = 0; static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false; static bool should_send_profile_state = false;
std::mutex profiler_mu; std::mutex profiler_mu;
...@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0; ...@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
// The global mutex // The global mutex
static std::mutex g_all_event_lists_mutex; static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads // The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists; static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
// The thread-local event list can only be accessed by its owning thread // The thread-local event list can only be accessed by its owning thread
static thread_local std::shared_ptr<EventList> g_event_list; static thread_local std::shared_ptr<EventList<Event>> g_event_list;
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
Event* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks; static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
}; static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
static std::mutex g_all_mem_event_lists_mutex;
static thread_local int32_t g_mem_thread_id;
static uint32_t g_mem_next_thread_id = 0;
inline uint64_t GetTimeInNsec() { inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady, using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
...@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id) ...@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
const EventType& Event::type() const { return type_; } const EventType &Event::type() const { return type_; }
double Event::CpuElapsedMs(const Event& e) const { double Event::CpuElapsedMs(const Event &e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0); return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
} }
double Event::CudaElapsedMs(const Event& e) const { double Event::CudaElapsedMs(const Event &e) const {
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
return gpu_ns_ / 1000000.0; return gpu_ns_ / 1000000.0;
#else #else
...@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const { ...@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif #endif
} }
inline EventList& GetEventList() { inline EventList<MemEvent> &GetMemEventList() {
if (!g_mem_event_list) {
g_mem_event_list = std::make_shared<EventList<MemEvent>>();
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
g_mem_thread_id = g_mem_next_thread_id++;
g_all_mem_event_lists.emplace_front(g_mem_event_list);
}
return *g_mem_event_list;
}
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
place, g_mem_thread_id, annotation);
}
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
g_mem_thread_id, annotation);
}
inline EventList<Event> &GetEventList() {
if (!g_event_list) { if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList>(); g_event_list = std::make_shared<EventList<Event>>();
g_thread_id = g_next_thread_id++; g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list); g_all_event_lists.emplace_front(g_event_list);
RecoreCurThreadId(g_thread_id); RecoreCurThreadId(g_thread_id);
...@@ -131,26 +124,26 @@ inline EventList& GetEventList() { ...@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
return *g_event_list; return *g_event_list;
} }
void Mark(const std::string& name) { void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
Event* PushEvent(const std::string& name) { Event *PushEvent(const std::string &name) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id); return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
} }
void PopEvent(const std::string& name) { void PopEvent(const std::string &name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id); GetEventList().Record(EventType::kPopRange, name, g_thread_id);
} }
RecordEvent::RecordEvent(const std::string& name) RecordEvent::RecordEvent(const std::string &name)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
is_enabled_ = true; is_enabled_ = true;
name_ = name; name_ = name;
Event* e = PushEvent(name_); Event *e = PushEvent(name_);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(e); SetCurAnnotation(e);
} }
...@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name) ...@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
BlockDepth(), g_thread_id); BlockDepth(), g_thread_id);
...@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() { ...@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
PopEvent(name_); PopEvent(name_);
} }
RecordRPCEvent::RecordRPCEvent(const std::string& name) { MemEvenRecorder MemEvenRecorder::recorder;
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
size_t size) {
if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
PADDLE_ENFORCE(events.count(ptr) == 0,
"The same pointer is pushed twice without an intervening pop");
events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
// The ptr may not be present in address_memevent_
if (iter != events.end()) {
events.erase(iter);
}
}
void MemEvenRecorder::Flush() {
std::lock_guard<std::mutex> guard(mtx_);
address_memevent_.clear();
}
MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
size_t bytes)
: place_(place),
bytes_(bytes),
start_ns_(PosixInNsec()),
alloc_in_(CurAnnotationName()) {
PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
}
MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
DeviceTracer *tracer = GetDeviceTracer();
end_ns_ = PosixInNsec();
auto annotation_free = CurAnnotationName();
if (tracer) {
tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
annotation_free, g_mem_thread_id);
}
PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}
RecordRPCEvent::RecordRPCEvent(const std::string &name) {
if (FLAGS_enable_rpc_profiler) { if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name)); event_.reset(new platform::RecordEvent(name));
} }
...@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id) ...@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
RecordBlock::~RecordBlock() { RecordBlock::~RecordBlock() {
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
// We try to put all blocks at the same nested depth in the // We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id. // same timeline lane. and distinguish the using thread_id.
...@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) { ...@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
void ResetProfiler() { void ResetProfiler() {
SynchronizeAllDevice(); SynchronizeAllDevice();
GetDeviceTracer()->Reset(); GetDeviceTracer()->Reset();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) { ++it) {
(*it)->Clear(); (*it)->Clear();
} }
for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end(); ++it) {
(*it)->Clear();
}
} }
std::vector<std::vector<Event>> GetAllEvents() { std::vector<std::vector<Event>> GetAllEvents() {
...@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() { ...@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result; return result;
} }
std::vector<std::vector<MemEvent>> GetMemEvents() {
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
std::vector<std::vector<MemEvent>> result;
for (auto &it : g_all_mem_event_lists) {
result.emplace_back((*it).Reduce());
}
return result;
}
// The information of each event given in the profiling report // The information of each event given in the profiling report
struct EventItem { struct EventItem {
std::string name; std::string name;
...@@ -263,8 +319,8 @@ struct EventItem { ...@@ -263,8 +319,8 @@ struct EventItem {
}; };
// Print results // Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
const std::string& sorted_domain, const size_t name_width, const std::string &sorted_domain, const size_t name_width,
const size_t data_width, bool merge_thread) { const size_t data_width, bool merge_thread) {
// Output header information // Output header information
std::cout << "\n------------------------->" std::cout << "\n------------------------->"
...@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<< std::setw(data_width) << "Ratio." << std::endl; << std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) { for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) { for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j]; const EventItem &event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time; << std::setw(data_width) << event_item.total_time;
...@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
} }
// Parse the event list and output the profiling report // Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events, void ParseEvents(const std::vector<std::vector<Event>> &events,
bool merge_thread, bool merge_thread,
EventSortingKey sorted_by = EventSortingKey::kDefault) { EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
if (merge_thread && events.size() < 2) return; if (merge_thread && events.size() < 2) return;
std::string sorted_domain; std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func; std::function<bool(const EventItem &, const EventItem &)> sorted_func;
switch (sorted_by) { switch (sorted_by) {
case EventSortingKey::kCalls: case EventSortingKey::kCalls:
sorted_domain = "number of calls"; sorted_domain = "number of calls";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.calls > b.calls; return a.calls > b.calls;
}; };
break; break;
case EventSortingKey::kTotal: case EventSortingKey::kTotal:
sorted_domain = "total time"; sorted_domain = "total time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.total_time > b.total_time; return a.total_time > b.total_time;
}; };
break; break;
case EventSortingKey::kMin: case EventSortingKey::kMin:
sorted_domain = "minimum time"; sorted_domain = "minimum time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.min_time > b.min_time; return a.min_time > b.min_time;
}; };
break; break;
case EventSortingKey::kMax: case EventSortingKey::kMax:
sorted_domain = "maximum time"; sorted_domain = "maximum time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.max_time > b.max_time; return a.max_time > b.max_time;
}; };
break; break;
case EventSortingKey::kAve: case EventSortingKey::kAve:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.ave_time > b.ave_time; return a.ave_time > b.ave_time;
}; };
break; break;
case EventSortingKey::kGPUTime: case EventSortingKey::kGPUTime:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.gpu_time > b.gpu_time; return a.gpu_time > b.gpu_time;
}; };
break; break;
case EventSortingKey::kCPUTime: case EventSortingKey::kCPUTime:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.cpu_time > b.cpu_time; return a.cpu_time > b.cpu_time;
}; };
break; break;
...@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
sorted_domain = "event first end time"; sorted_domain = "event first end time";
} }
const std::vector<std::vector<Event>>* analyze_events; const std::vector<std::vector<Event>> *analyze_events;
std::vector<std::vector<Event>> merged_events_list; std::vector<std::vector<Event>> merged_events_list;
if (merge_thread) { if (merge_thread) {
std::vector<Event> merged_events; std::vector<Event> merged_events;
...@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
} }
} }
// average time // average time
for (auto& item : event_items) { for (auto &item : event_items) {
item.ave_time = item.total_time / item.calls; item.ave_time = item.total_time / item.calls;
item.ratio = item.total_time / total; item.ratio = item.total_time / total;
} }
...@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
merge_thread); merge_thread);
} }
struct MemoryProfierReport {
size_t alloc_times{0};
size_t alloc_size{0};
size_t free_times{0};
size_t free_size{0};
};
// Print results
void PrintMemProfiler(
const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
&annotation_report,
const size_t name_width, const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Memory Profiling Report "
<< "<-------------------------\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Alloc Calls" << std::setw(data_width) << "Size(MB)"
<< std::setw(data_width) << "Free Calls" << std::setw(data_width)
<< "Size(MB)" << std::endl;
for (auto &tmp : annotation_report) {
for (auto &e : tmp.second) {
auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
std::cout << std::setw(name_width) << event_name;
std::cout << std::setw(data_width) << e.second.alloc_times;
std::cout << std::setw(data_width)
<< e.second.alloc_size / (1024.0 * 1024.0);
std::cout << std::setw(data_width) << e.second.free_times;
std::cout << std::setw(data_width)
<< e.second.free_size / (1024.0 * 1024.0) << std::endl;
}
}
std::cout << std::endl;
}
// parse memory events
void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
if (g_state == ProfilerState::kDisabled) return;
// Aggregate per (place, annotation): alloc/free counts and bytes
std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
annotation_report;
for (auto &tmp : events) {
for (auto &e : tmp) {
if (e.type() == EventType::kPushRange) {
annotation_report[e.place()][e.annotation()].alloc_times += 1;
annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
} else if (e.type() == EventType::kPopRange) {
annotation_report[e.place()][e.annotation()].free_times += 1;
annotation_report[e.place()][e.annotation()].free_size += e.bytes();
}
}
}
PrintMemProfiler(annotation_report, 55, 18);
}
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string &profile_path) {
SynchronizeAllDevice(); SynchronizeAllDevice();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_"); Mark("_stop_profiler_");
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer->IsEnabled()) { if (tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
...@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key); ParseEvents(all_events, false, sorted_key);
if (VLOG_IS_ON(5)) {
std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
ParseMemEvents(all_mem_events);
}
ResetProfiler(); ResetProfiler();
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
should_send_profile_state = true; should_send_profile_state = true;
......
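Putting the pieces of this file together, a hedged end-to-end sketch of the new memory-profiling path: EnableProfiler arms g_state, Push/PopMemRecord bracket an allocation's lifetime, and DisableProfiler emits the memory report when VLOG level 5 is active (e.g. GLOG_v=5 in the environment — a glog convention, not something this diff sets).

using namespace paddle::platform;

EnableProfiler(ProfilerState::kCPU);
void* p = malloc(4096);
MemEvenRecorder::Instance().PushMemRecord(p, CPUPlace(), 4096);
// ... run the workload that touches p ...
MemEvenRecorder::Instance().PopMemRecord(p, CPUPlace());
free(p);
DisableProfiler(EventSortingKey::kTotal, "/tmp/profile");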
...@@ -15,10 +15,17 @@ limitations under the License. */ ...@@ -15,10 +15,17 @@ limitations under the License. */
#pragma once #pragma once
#include <forward_list> #include <forward_list>
#include <list> #include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
...@@ -34,8 +41,41 @@ enum ProfilerState { ...@@ -34,8 +41,41 @@ enum ProfilerState {
void Mark(const std::string& name); void Mark(const std::string& name);
Event* PushEvent(const std::string& name); void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
struct MemEvenRecorder {
public:
void PushMemRecord(const void* ptr, const Place& place, size_t size);
void PopMemRecord(const void* ptr, const Place& place);
void Flush();
static MemEvenRecorder& Instance() { return recorder; }
private:
struct RecordMemEvent {
RecordMemEvent(const Place& place, size_t bytes);
~RecordMemEvent();
Place place_;
size_t bytes_;
uint64_t start_ns_;
uint64_t end_ns_ = 0;  // set in the destructor; zero while the block is alive
std::string alloc_in_;
std::string free_in_;
};
static MemEvenRecorder recorder;
std::map<Place,
std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
address_memevent_;
std::mutex mtx_;
MemEvenRecorder() {}
DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
};
Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name); void PopEvent(const std::string& name);
struct RecordEvent { struct RecordEvent {
...@@ -87,6 +127,41 @@ enum EventSortingKey { ...@@ -87,6 +127,41 @@ enum EventSortingKey {
kGPUTime kGPUTime
}; };
template <typename T>
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(T);
constexpr static size_t kEventAlign = alignof(T);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
T* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<T> Reduce() {
std::vector<T> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<T>> event_blocks;
};
// Enable the profiling function. // Enable the profiling function.
void EnableProfiler(ProfilerState state); void EnableProfiler(ProfilerState state);
......
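EventList above is the old struct made generic over the record type. A minimal sketch of its contract: Record appends into fixed-capacity, pre-reserved blocks (so pointers returned earlier stay valid within a block), and Reduce drains all blocks in insertion order.

paddle::platform::EventList<int> list;
for (int i = 0; i < 3; ++i) list.Record(i);
std::vector<int> all = list.Reduce();  // {0, 1, 2}; the list is now empty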
...@@ -34,8 +34,25 @@ message Event { ...@@ -34,8 +34,25 @@ message Event {
optional string detail_info = 9; optional string detail_info = 9;
} }
message MemEvent {
enum Place {
CUDAPlace = 0;
CPUPlace = 1;
CUDAPinnedPlace = 2;
}
optional uint64 start_ns = 1;
optional uint64 end_ns = 2;
optional uint64 bytes = 3;
optional Place place = 4;
optional uint64 thread_id = 5;
optional uint32 device_id = 6;
optional string alloc_in = 7;
optional string free_in = 8;
}
message Profile { message Profile {
repeated Event events = 1; repeated Event events = 1;
optional uint64 start_ns = 2; optional uint64 start_ns = 2;
optional uint64 end_ns = 3; optional uint64 end_ns = 3;
repeated MemEvent mem_events = 4;
} }
\ No newline at end of file
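A hedged sketch of filling the new message through the generated C++ API; the namespace paddle::platform::proto is an assumption based on the profiler.pb.h include earlier in this diff.

paddle::platform::proto::Profile profile;
auto* mem = profile.add_mem_events();
mem->set_start_ns(100);
mem->set_end_ns(900);
mem->set_bytes(4096);
mem->set_place(paddle::platform::proto::MemEvent::CPUPlace);
mem->set_thread_id(0);
mem->set_alloc_in("conv2d");
mem->set_free_in("conv2d_grad");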
...@@ -13,10 +13,18 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/imperative.h"
#include <pybind11/chrono.h>
#include <pybind11/complex.h>
#include <pybind11/functional.h>
#include <pybind11/stl.h>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -31,20 +39,20 @@ void BindTracer(pybind11::module* m) { ...@@ -31,20 +39,20 @@ void BindTracer(pybind11::module* m) {
[](imperative::Tracer& self, imperative::OpBase* op, [](imperative::Tracer& self, imperative::OpBase* op,
const imperative::VarBasePtrMap& inputs, const imperative::VarBasePtrMap& inputs,
const imperative::VarBasePtrMap& outputs, const imperative::VarBasePtrMap& outputs,
framework::BlockDesc* block, framework::AttributeMap attrs_map,
const platform::CPUPlace expected_place, const platform::CPUPlace expected_place,
const bool stop_gradient = false) { const bool stop_gradient = false) {
return self.Trace(op, inputs, outputs, block, expected_place, return self.Trace(op, inputs, outputs, attrs_map, expected_place,
stop_gradient); stop_gradient);
}) })
.def("trace", .def("trace",
[](imperative::Tracer& self, imperative::OpBase* op, [](imperative::Tracer& self, imperative::OpBase* op,
const imperative::VarBasePtrMap& inputs, const imperative::VarBasePtrMap& inputs,
const imperative::VarBasePtrMap& outputs, const imperative::VarBasePtrMap& outputs,
framework::BlockDesc* block, framework::AttributeMap attrs_map,
const platform::CUDAPlace expected_place, const platform::CUDAPlace expected_place,
const bool stop_gradient = false) { const bool stop_gradient = false) {
return self.Trace(op, inputs, outputs, block, expected_place, return self.Trace(op, inputs, outputs, attrs_map, expected_place,
stop_gradient); stop_gradient);
}) })
.def("py_trace", &imperative::Tracer::PyTrace, .def("py_trace", &imperative::Tracer::PyTrace,
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <Python.h> #include <Python.h>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
...@@ -36,6 +37,8 @@ class Layer : public imperative::Layer { ...@@ -36,6 +37,8 @@ class Layer : public imperative::Layer {
class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase {
public: public:
using imperative::OpBase::OpBase; // Inherit constructors using imperative::OpBase::OpBase; // Inherit constructors
PyOpBase(const std::string& name) : OpBase(name) {}
}; };
class PyVarBase : public imperative::VarBase { class PyVarBase : public imperative::VarBase {
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -54,12 +55,14 @@ void BindGraph(py::module *m) { ...@@ -54,12 +55,14 @@ void BindGraph(py::module *m) {
"The graph is a Directed Acyclic Single Static Assignment Graph, see " "The graph is a Directed Acyclic Single Static Assignment Graph, see "
"`paddle::ir::Graph` for details.") "`paddle::ir::Graph` for details.")
.def(py::init<const ProgramDesc &>()) .def(py::init<const ProgramDesc &>())
.def("clone", &Graph::Clone)
.def("has", &Graph::Has) .def("has", &Graph::Has)
.def("get_int", &Graph::Get<int>) .def("get_int", &Graph::Get<int>)
.def("get_float", &Graph::Get<float>) .def("get_float", &Graph::Get<float>)
.def("get_double", &Graph::Get<double>) .def("get_double", &Graph::Get<double>)
.def("get_string", &Graph::Get<std::string>) .def("get_string", &Graph::Get<std::string>)
.def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>) .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>,
return_value_policy::reference)
.def("set", [](Graph &self, const std::string &attr_name, .def("set", [](Graph &self, const std::string &attr_name,
int attr) { return self.Set(attr_name, new int(attr)); }) int attr) { return self.Set(attr_name, new int(attr)); })
.def("set", .def("set",
...@@ -103,7 +106,8 @@ void BindGraph(py::module *m) { ...@@ -103,7 +106,8 @@ void BindGraph(py::module *m) {
.def("retrieve_node", &Graph::RetrieveNode, .def("retrieve_node", &Graph::RetrieveNode,
return_value_policy::reference) return_value_policy::reference)
.def("resolve_hazard", &Graph::ResolveHazard) .def("resolve_hazard", &Graph::ResolveHazard)
.def("origin_program_desc", &Graph::OriginProgram); .def("origin_program_desc", &Graph::OriginProgram,
return_value_policy::reference);
} }
void BindNode(py::module *m) { void BindNode(py::module *m) {
......
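On the return_value_policy::reference additions above: for a bound method returning an lvalue reference, pybind11's default policy is copy, so Python would receive an independent snapshot of the graph-owned attribute instead of a view of it. A toy illustration of the difference (all names hypothetical):

#include <pybind11/pybind11.h>
namespace py = pybind11;

struct NodeSet { int size = 0; };

struct Graphish {
  NodeSet marked;
  NodeSet& Get() { return marked; }
};

PYBIND11_MODULE(toy, m) {
  py::class_<NodeSet>(m, "NodeSet");
  py::class_<Graphish>(m, "Graphish")
      .def(py::init<>())
      .def("get_copy", &Graphish::Get)  // default policy copies NodeSet
      .def("get_ref", &Graphish::Get,   // hands out the C++-owned object
           py::return_value_policy::reference);
}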
...@@ -23,97 +23,7 @@ limitations under the License. */ ...@@ -23,97 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
// Cast boost::variant for PyBind. #include "paddle/fluid/pybind/pybind_boost_headers.h"
// Copy from
// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199
namespace pybind11 {
namespace detail {
#if !defined(PYBIND11_HIDDEN)
#ifdef _WIN32
#define PYBIND11_HIDDEN __declspec(dllexport)
#else
#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
#endif
#endif
// Can be replaced by a generic lambda in C++14
struct PYBIND11_HIDDEN paddle_variant_caster_visitor
: public boost::static_visitor<handle> {
return_value_policy policy;
handle parent;
paddle_variant_caster_visitor(return_value_policy policy, handle parent)
: policy(policy), parent(parent) {}
template <class T>
handle operator()(T const &src) const {
return make_caster<T>::cast(src, policy, parent);
}
};
template <class Variant>
struct paddle_variant_caster;
template <template <class...> class V, class... Ts>
struct paddle_variant_caster<V<Ts...>> {
using Type = V<Ts...>;
template <typename T>
typename std::enable_if<
!std::is_same<T, boost::detail::variant::void_>::value, bool>::type
try_load(handle src, bool convert) {
auto caster = make_caster<T>();
if (!load_success_ && caster.load(src, convert)) {
load_success_ = true;
if (std::is_same<T, std::vector<float>>::value) {
auto caster_ints = make_caster<std::vector<int64_t>>();
if (caster_ints.load(src, convert)) {
VLOG(4) << "This value are floats and int64_ts satisfy "
"simultaneously, will set it's type to "
"std::vector<int64_t>";
value = cast_op<std::vector<int64_t>>(caster_ints);
return true;
}
}
value = cast_op<T>(caster);
return true;
}
return false;
}
template <typename T>
typename std::enable_if<std::is_same<T, boost::detail::variant::void_>::value,
bool>::type
try_load(handle src, bool convert) {
return false;
}
bool load(handle src, bool convert) {
auto unused = {false, try_load<Ts>(src, convert)...};
(void)(unused);
return load_success_;
}
static handle cast(Type const &src, return_value_policy policy,
handle parent) {
paddle_variant_caster_visitor visitor(policy, parent);
return boost::apply_visitor(visitor, src);
}
PYBIND11_TYPE_CASTER(Type, _("Variant"));
bool load_success_{false};
};
// Add specialization for concrete variant type
template <class... Args>
struct type_caster<boost::variant<Args...>>
: paddle_variant_caster<boost::variant<Args...>> {};
} // namespace detail
} // namespace pybind11
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
......
...@@ -95,6 +95,14 @@ bool IsCompiledWithMKLDNN() { ...@@ -95,6 +95,14 @@ bool IsCompiledWithMKLDNN() {
#endif #endif
} }
bool IsCompiledWithNGRAPH() {
#ifndef PADDLE_WITH_NGRAPH
return false;
#else
return true;
#endif
}
bool IsCompiledWithBrpc() { bool IsCompiledWithBrpc() {
#ifndef PADDLE_WITH_DISTRIBUTE #ifndef PADDLE_WITH_DISTRIBUTE
return false; return false;
...@@ -155,8 +163,14 @@ PYBIND11_MODULE(core, m) { ...@@ -155,8 +163,14 @@ PYBIND11_MODULE(core, m) {
[]() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC") py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
// .def(py::init<>()) .def(
.def(py::init<bool>(), py::arg("stop_gradient") = false) py::init<const std::string &, paddle::framework::proto::VarType::Type,
const std::vector<int64_t>, const paddle::platform::CPUPlace,
bool, bool>())
.def(
py::init<const std::string &, paddle::framework::proto::VarType::Type,
const std::vector<int64_t>,
const paddle::platform::CUDAPlace, bool, bool>())
.def("_run_backward", .def("_run_backward",
[](imperative::VarBase &self) { self.RunBackward(); }) [](imperative::VarBase &self) { self.RunBackward(); })
.def("_grad_name", &imperative::VarBase::GradName) .def("_grad_name", &imperative::VarBase::GradName)
...@@ -183,51 +197,21 @@ PYBIND11_MODULE(core, m) { ...@@ -183,51 +197,21 @@ PYBIND11_MODULE(core, m) {
py::return_value_policy::take_ownership) py::return_value_policy::take_ownership)
.def("value", [](const imperative::VarBase &self) { return self.var_; }, .def("value", [](const imperative::VarBase &self) { return self.var_; },
py::return_value_policy::reference) py::return_value_policy::reference)
.def_property("name", .def_property("name", &imperative::VarBase::Name,
[](const imperative::VarBase &self) { return self.name_; }, &imperative::VarBase::SetName)
[](imperative::VarBase &self, const std::string &name) { .def_property_readonly("shape", &imperative::VarBase::Shape)
self.name_ = name; .def_property_readonly("dtype", &imperative::VarBase::DType)
}) .def_property("persistable", &imperative::VarBase::IsPersistable,
.def_property("block", &imperative::VarBase::SetPersistable)
[](const imperative::VarBase &self) { return self.block_; }, .def_property("stop_gradient", &imperative::VarBase::IsStopGradient,
[](imperative::VarBase &self, framework::BlockDesc *block) { &imperative::VarBase::SetStopGradient);
self.block_ = block;
},
py::return_value_policy::reference)
.def_property(
"persistable",
[](const imperative::VarBase &self) { return self.persistable_; },
[](imperative::VarBase &self, const bool persistable) {
self.persistable_ = persistable;
})
.def_property(
"desc",
[](const imperative::VarBase &self) { return self.var_desc_; },
[](imperative::VarBase &self, framework::VarDesc *var_desc) {
self.var_desc_ = var_desc;
},
py::return_value_policy::reference)
.def_property(
"stop_gradient",
[](const imperative::VarBase &self) { return self.IsStopGradient(); },
[](imperative::VarBase &self, bool stop_gradient) {
self.SetStopGradient(stop_gradient);
});
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC") py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
.def(py::init<>()) .def(py::init<const std::string &>())
.def("register_backward_hooks", .def("register_backward_hooks",
[](imperative::OpBase &self, const py::object &callable) { [](imperative::OpBase &self, const py::object &callable) {
self.RegisterBackwardHooks(callable); self.RegisterBackwardHooks(callable);
}) })
.def_property(
"desc", [](const imperative::OpBase &self) { return self.op_desc_; },
[](imperative::OpBase &self, framework::OpDesc *op_desc) {
if (op_desc) {
self.op_desc_ = op_desc;
}
},
py::return_value_policy::reference)
.def_property("_trace_id", .def_property("_trace_id",
[](const imperative::OpBase &self) { [](const imperative::OpBase &self) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
...@@ -266,7 +250,17 @@ PYBIND11_MODULE(core, m) { ...@@ -266,7 +250,17 @@ PYBIND11_MODULE(core, m) {
"apply", "apply",
[](int func_id, const std::vector<imperative::VarBase *> &inputs) [](int func_id, const std::vector<imperative::VarBase *> &inputs)
-> std::vector<imperative::VarBase *> { -> std::vector<imperative::VarBase *> {
return imperative::PyLayer::Apply(func_id, inputs); auto ret_vars = imperative::PyLayer::Apply(func_id, inputs);
std::vector<imperative::VarBase *> outputs;
outputs.reserve(ret_vars.size());
for (size_t i = 0U; i != ret_vars.size(); ++i) {
framework::Variable *v = ret_vars[i];
// TODO(minqiyang): use unique_name generator to set a name
outputs.emplace_back(
new imperative::VarBase("", v, nullptr, true));
}
return outputs;
}, },
py::return_value_policy::take_ownership) py::return_value_policy::take_ownership)
.def_static("register_func", .def_static("register_func",
...@@ -894,9 +888,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -894,9 +888,11 @@ All parameter, weight, gradient are variables in Paddle.
.def(py::init<const platform::Place &>()) .def(py::init<const platform::Place &>())
.def("close", &Executor::Close) .def("close", &Executor::Close)
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars) { int block_id, bool create_local_scope, bool create_vars,
const std::vector<std::string> &fetch_vars) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars); self.Run(prog, scope, block_id, create_local_scope, create_vars,
fetch_vars);
}); });
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
...@@ -904,6 +900,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -904,6 +900,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_devices", m.def("init_devices",
[](bool init_p2p) { framework::InitDevices(init_p2p); }); [](bool init_p2p) { framework::InitDevices(init_p2p); });
m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
...@@ -1272,7 +1269,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1272,7 +1269,7 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC"); cannot be updated after being finalized.)DOC");
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const std::string &, const std::vector<std::string> &, const std::string &,
Scope *, std::vector<Scope *> &, const ExecutionStrategy &, Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
const BuildStrategy &, ir::Graph *>()) const BuildStrategy &, ir::Graph *>())
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/platform/variant.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
// Cast boost::variant for PyBind.
// Copy from
// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199
namespace pybind11 {
namespace detail {
#if !defined(PYBIND11_HIDDEN)
#ifdef _WIN32
#define PYBIND11_HIDDEN __declspec(dllexport)
#else
#define PYBIND11_HIDDEN __attribute__((visibility("hidden")))
#endif
#endif
// Can be replaced by a generic lambda in C++14
struct PYBIND11_HIDDEN paddle_variant_caster_visitor
: public boost::static_visitor<handle> {
return_value_policy policy;
handle parent;
paddle_variant_caster_visitor(return_value_policy policy, handle parent)
: policy(policy), parent(parent) {}
template <class T>
handle operator()(T const &src) const {
return make_caster<T>::cast(src, policy, parent);
}
};
template <class Variant>
struct paddle_variant_caster;
template <template <class...> class V, class... Ts>
struct paddle_variant_caster<V<Ts...>> {
using Type = V<Ts...>;
template <typename T>
typename std::enable_if<
!std::is_same<T, boost::detail::variant::void_>::value, bool>::type
try_load(handle src, bool convert) {
auto caster = make_caster<T>();
if (!load_success_ && caster.load(src, convert)) {
load_success_ = true;
if (std::is_same<T, std::vector<float>>::value) {
auto caster_ints = make_caster<std::vector<int64_t>>();
if (caster_ints.load(src, convert)) {
VLOG(4) << "This value are floats and int64_ts satisfy "
"simultaneously, will set it's type to "
"std::vector<int64_t>";
value = cast_op<std::vector<int64_t>>(caster_ints);
return true;
}
}
value = cast_op<T>(caster);
return true;
}
return false;
}
template <typename T>
typename std::enable_if<std::is_same<T, boost::detail::variant::void_>::value,
bool>::type
try_load(handle src, bool convert) {
return false;
}
bool load(handle src, bool convert) {
auto unused = {false, try_load<Ts>(src, convert)...};
(void)(unused);
return load_success_;
}
static handle cast(Type const &src, return_value_policy policy,
handle parent) {
paddle_variant_caster_visitor visitor(policy, parent);
return boost::apply_visitor(visitor, src);
}
PYBIND11_TYPE_CASTER(Type, _("Variant"));
bool load_success_{false};
};
// Add specialization for concrete variant type
template <class... Args>
struct type_caster<boost::variant<Args...>>
: paddle_variant_caster<boost::variant<Args...>> {};
} // namespace detail
} // namespace pybind11
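With this caster in scope, any bound signature involving boost::variant converts transparently in both directions; a hedged sketch (the module registration line is shown as a comment because the surrounding PYBIND11_MODULE body is assumed):

#include <boost/variant.hpp>
#include <string>

boost::variant<int, std::string> GetAttr(bool as_name) {
  if (as_name) return std::string("fc_0.w_0");
  return 42;
}

// Inside a PYBIND11_MODULE body with module handle `m`:
//   m.def("get_attr", &GetAttr);
// Python then sees get_attr(True) == "fc_0.w_0" and get_attr(False) == 42;
// in the loading direction, a Python int list that also parses as floats
// is deliberately loaded as std::vector<int64_t>, per the VLOG(4) branch.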
...@@ -31,7 +31,7 @@ class RecordIOWriter { ...@@ -31,7 +31,7 @@ class RecordIOWriter {
RecordIOWriter(const std::string& filename, recordio::Compressor compressor, RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
size_t max_num_record) size_t max_num_record)
: closed_(false), : closed_(false),
stream_(filename), stream_(filename, std::ios::binary),
writer_(&stream_, compressor, max_num_record) {} writer_(&stream_, compressor, max_num_record) {}
void AppendTensor(const framework::LoDTensor& tensor) { void AppendTensor(const framework::LoDTensor& tensor) {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
#include <string> #include <string>
#include <utility>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -27,7 +28,8 @@ Scanner::Scanner(std::unique_ptr<std::istream> &&stream) ...@@ -27,7 +28,8 @@ Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
} }
Scanner::Scanner(const std::string &filename) Scanner::Scanner(const std::string &filename)
: stream_(new std::ifstream(filename)), parser_(*stream_) { : stream_(new std::ifstream(filename, std::ios::in | std::ios::binary)),
parser_(*stream_) {
PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename); PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename);
Reset(); Reset();
} }
......
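Both recordio hunks above make the same fix. An illustration of why std::ios::binary matters: in text mode on Windows, '\n' is written as "\r\n" and "\r\n" collapses back on read, silently corrupting length-prefixed binary records; binary mode keeps the byte count exact on every platform.

#include <fstream>

const char payload[] = {0x01, '\n', 0x02};
std::ofstream out("chunk.recordio", std::ios::binary);
out.write(payload, 3);  // exactly 3 bytes reach the file, even on Windows
out.close();
std::ifstream in("chunk.recordio", std::ios::in | std::ios::binary);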
...@@ -455,7 +455,11 @@ function assert_api_spec_approvals() { ...@@ -455,7 +455,11 @@ function assert_api_spec_approvals() {
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308` python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
if [ "${APPROVALS}" == "TRUE" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
fi
else else
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
...@@ -463,7 +467,7 @@ function assert_api_spec_approvals() { ...@@ -463,7 +467,7 @@ function assert_api_spec_approvals() {
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}" echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
else else
echo "You must have panyx0718 approval for the api change! ${API_FILE}" echo "You must have panyx0718 approval for the api change! ${API_FILE}"
fi fi
......
...@@ -125,14 +125,14 @@ def __bootstrap__(): ...@@ -125,14 +125,14 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
sysstr = platform.system() sysstr = platform.system()
read_env_flags = [ read_env_flags = [
'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_ngraph', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
'fast_eager_deletion_mode', 'allocator_strategy', 'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
'reader_queue_speed_test_mode', 'print_sub_graph_dir', 'allocator_strategy', 'reader_queue_speed_test_mode',
'pe_profile_fname', 'warpctc_dir', 'inner_op_parallelism', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'enable_parallel_graph', 'multiple_of_cupti_buffer_size', 'inner_op_parallelism', 'enable_parallel_graph',
'enable_subgraph_optimize' 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
...@@ -143,6 +143,9 @@ def __bootstrap__(): ...@@ -143,6 +143,9 @@ def __bootstrap__():
if core.is_compiled_with_mkldnn(): if core.is_compiled_with_mkldnn():
read_env_flags.append('use_mkldnn') read_env_flags.append('use_mkldnn')
if core.is_compiled_with_ngraph():
read_env_flags.append('use_ngraph')
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_server_profile_path') read_env_flags.append('rpc_server_profile_path')
......
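The reshuffle above registers 'use_ngraph' only when core.is_compiled_with_ngraph() reports true: __bootstrap__ forwards these names to gflags, and asking gflags to parse a flag the C++ binary never defined fails at import time. A hedged sketch of the C++ side this mirrors (the exact definition site of the flag is an assumption):

#include "gflags/gflags.h"

// Assumption: the flag is only compiled into NGRAPH-enabled builds,
// so Python must probe the build before exposing it as an env flag.
#ifdef PADDLE_WITH_NGRAPH
DEFINE_bool(use_ngraph, false, "Run supported operators through NGRAPH.");
#endif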
...@@ -232,21 +232,23 @@ class CompiledProgram(object): ...@@ -232,21 +232,23 @@ class CompiledProgram(object):
self._build_strategy.trainers_endpoints = tps self._build_strategy.trainers_endpoints = tps
self._persistable_vars = [] self._persistable_vars = []
for block_id in range(self._program_desc.num_blocks()): for node in self._graph.nodes():
bdesc = self._program_desc.block(block_id) if node.is_var() and node.var() is not None and node.var().persistable() and \
self._persistable_vars.extend([ node.var().type() != core.VarDesc.VarType.RAW:
cpt.to_text(v.name()) for v in bdesc.all_vars() self._persistable_vars.append(cpt.to_text(node.name()))
if v.persistable() and v.type() != core.VarDesc.VarType.RAW
])
places = list(map(_place_obj, self._places)) places = list(map(_place_obj, self._places))
# ParallelExecutor would broadcast all the parameters during initializing.
return core.ParallelExecutor(places, # The parameters of each process should be in the same ordered for the data-parallelism
set(self._persistable_vars), # distributed training to keep the broadcast correct.
self._persistable_vars = list(set(self._persistable_vars))
self._persistable_vars.sort()
return core.ParallelExecutor(
places, self._persistable_vars,
cpt.to_text(self._loss_name) cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), scope, if self._loss_name else six.u(''), self._scope, self._local_scopes,
self._local_scopes, self._exec_strategy, self._exec_strategy, self._build_strategy, self._graph)
self._build_strategy, self._graph)
def _compile_inference(self): def _compile_inference(self):
return core.create_paddle_predictor(self._infer_config) return core.create_paddle_predictor(self._infer_config)
......
...@@ -13,58 +13,92 @@ ...@@ -13,58 +13,92 @@
# limitations under the license. # limitations under the license.
from __future__ import print_function from __future__ import print_function
import os
import six
import unittest import unittest
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import six
from paddle.fluid.framework import IrGraph from paddle.fluid.framework import IrGraph
from paddle.fluid import core from paddle.fluid import core
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["CPU_NUM"] = "1"
def residual_block(num):
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
bias_attr=False):
tmp = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=bias_attr)
return fluid.layers.batch_norm(input=tmp, act=act)
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') def conv_block():
img = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = data conv_pool_1 = fluid.nets.simple_img_conv_pool(
for _ in six.moves.xrange(num): input=img,
conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) filter_size=5,
short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) num_filters=20,
hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') pool_size=2,
fc = fluid.layers.fc(input=hidden, size=10) pool_stride=2,
loss = fluid.layers.cross_entropy(input=fc, label=label) act="relu")
loss = fluid.layers.mean(loss) conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
return loss conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
return [img, label], avg_loss
class TestGraph(unittest.TestCase): class TestGraph(unittest.TestCase):
def test_graph_functions(self): def graph_apis(self, use_cuda=False, for_ci=True):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
loss = residual_block(2) feeds, loss = conv_block()
opt = fluid.optimizer.Adam(learning_rate=0.001) opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss) opt.minimize(loss)
graph = IrGraph(core.Graph(main.desc), for_test=False) graph = IrGraph(core.Graph(main.desc), for_test=False)
backup_graph = graph.clone()
self.assertEqual(len(graph.all_nodes()), len(backup_graph.all_nodes()))
build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
origin_binary = fluid.CompiledProgram(graph.graph).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
backup_binary = fluid.CompiledProgram(
backup_graph.graph).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
iters = 5
batch_size = 8
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
feeder = fluid.DataFeeder(feed_list=feeds, place=place)
def train(binary):
for _ in range(iters):
data = next(train_reader())
loss_v = exe.run(binary,
feed=feeder.feed(data),
fetch_list=[loss.name])
print('{}: {}'.format('loss', loss_v))
train(origin_binary)
train(backup_binary)
marked_nodes = set() marked_nodes = set()
for op in graph.all_op_nodes(): for op in graph.all_op_nodes():
if op.name().find('conv2d') > -1: if op.name().find('conv2d') > -1:
marked_nodes.add(op) marked_nodes.add(op)
if not for_ci:
graph.draw('.', 'residual', marked_nodes) graph.draw('.', 'residual', marked_nodes)
backup_marked_nodes = set()
for op in backup_graph.all_op_nodes():
if op.name().find('conv2d') > -1:
backup_marked_nodes.add(op)
backup_graph.draw('.', 'backup', backup_marked_nodes)
self.assertFalse(graph.has_circle()) self.assertFalse(graph.has_circle())
self.assertEqual(graph.graph_num(), 1) self.assertEqual(graph.graph_num(), 1)
nodes = graph.topology_sort() nodes = graph.topology_sort()
...@@ -75,6 +109,13 @@ class TestGraph(unittest.TestCase): ...@@ -75,6 +109,13 @@ class TestGraph(unittest.TestCase):
graph.safe_remove_nodes(marked_nodes) graph.safe_remove_nodes(marked_nodes)
self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes)) self.assertEqual(len(graph.all_nodes()), nodes_num - len(marked_nodes))
def test_graph_apis_cpu(self):
self.graph_apis(use_cuda=False, for_ci=True)
def test_graph_apis_cuda(self):
if fluid.core.is_compiled_with_cuda():
self.graph_apis(use_cuda=True, for_ci=True)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# see the license for the specific language governing permissions and # see the license for the specific language governing permissions and
# limitations under the license. # limitations under the license.
import os
import unittest import unittest
import random import random
import numpy as np import numpy as np
...@@ -25,6 +26,9 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass ...@@ -25,6 +26,9 @@ from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
from paddle.fluid import core from paddle.fluid import core
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["CPU_NUM"] = "1"
def linear_fc(num): def linear_fc(num):
data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
...@@ -249,7 +253,11 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -249,7 +253,11 @@ class TestQuantizationFreezePass(unittest.TestCase):
marked_nodes.add(op) marked_nodes.add(op)
test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes) test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
quantized_main_program = main_graph.to_program() build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
quantized_test_program = test_graph.to_program() quantized_test_program = test_graph.to_program()
iters = 5 iters = 5
batch_size = 8 batch_size = 8
...@@ -264,7 +272,7 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -264,7 +272,7 @@ class TestQuantizationFreezePass(unittest.TestCase):
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
for _ in range(iters): for _ in range(iters):
data = next(train_reader()) data = next(train_reader())
loss_v = exe.run(program=quantized_main_program, loss_v = exe.run(binary,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[loss]) fetch_list=[loss])
if not for_ci: if not for_ci:
......
...@@ -594,7 +594,7 @@ class Executor(object): ...@@ -594,7 +594,7 @@ class Executor(object):
fetch_var_name=fetch_var_name) fetch_var_name=fetch_var_name)
self._feed_data(program, feed, feed_var_name, scope) self._feed_data(program, feed, feed_var_name, scope)
exe.run(program.desc, scope, 0, True, True) exe.run(program.desc, scope, 0, True, True, fetch_var_name)
outs = self._fetch_data(fetch_list, fetch_var_name, scope) outs = self._fetch_data(fetch_list, fetch_var_name, scope)
if return_numpy: if return_numpy:
outs = as_numpy(outs) outs = as_numpy(outs)
......
...@@ -389,10 +389,27 @@ class Variable(object): ...@@ -389,10 +389,27 @@ class Variable(object):
is_data=False, is_data=False,
**kwargs): **kwargs):
self.block = block self.block = block
self.error_clip = error_clip
if name is None: if name is None:
name = unique_name.generate('_generated_var') name = unique_name.generate('_generated_var')
if dtype is not None:
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
if _in_imperative_mode():
# record vars in tracer rather than blocks
self._ivar = kwargs.get("ivar", None)
if not self._ivar:
self._ivar = core.VarBase(
name, dtype if dtype else core.VarDesc.VarType.FP32,
list(shape) if shape else [],
_current_expected_place(), True
if persistable else False, stop_gradient)
if persistable:
_imperative_tracer().trace_var(name, self)
else:
self.error_clip = error_clip
is_new_var = False is_new_var = False
name = cpt.to_text(name) name = cpt.to_text(name)
self.desc = self.block.desc.find_var(cpt.to_bytes(name)) self.desc = self.block.desc.find_var(cpt.to_bytes(name))
...@@ -404,10 +421,11 @@ class Variable(object): ...@@ -404,10 +421,11 @@ class Variable(object):
if is_new_var: if is_new_var:
self.desc.set_type(type) self.desc.set_type(type)
elif self.desc.type() != type: elif self.desc.type() != type:
raise ValueError("Variable {0} has been created before. The " raise ValueError(
"Variable {0} has been created before. The "
"previous type is {1}; the new type is {2}. They" "previous type is {1}; the new type is {2}. They"
" are not matched".format(self.name, " are not matched".format(self.name, self.desc.type(),
self.desc.type(), type)) type))
if shape is not None: if shape is not None:
if is_new_var: if is_new_var:
...@@ -421,25 +439,24 @@ class Variable(object): ...@@ -421,25 +439,24 @@ class Variable(object):
"shape is {1}; the new shape is {2}. They are not " "shape is {1}; the new shape is {2}. They are not "
"matched.".format(self.name, old_shape, shape)) "matched.".format(self.name, old_shape, shape))
if dtype is not None: if dtype is not None:
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
if is_new_var: if is_new_var:
self.desc.set_dtype(dtype) self.desc.set_dtype(dtype)
else: else:
old_dtype = self.dtype old_dtype = self.dtype
if dtype != old_dtype: if dtype != old_dtype:
raise ValueError("Variable {0} has been created before. " raise ValueError(
"Variable {0} has been created before. "
"The previous data type is {1}; the new " "The previous data type is {1}; the new "
"data type is {2}. They are not " "data type is {2}. They are not "
"matched.".format(self.name, old_dtype, "matched.".format(self.name, old_dtype, dtype))
dtype))
if lod_level is not None: if lod_level is not None:
if is_new_var: if is_new_var:
self.desc.set_lod_level(lod_level) self.desc.set_lod_level(lod_level)
else: else:
if lod_level != self.lod_level: if lod_level != self.lod_level:
raise ValueError("Variable {0} has been created before. " raise ValueError(
"Variable {0} has been created before. "
"The previous lod_level is {1}; the new " "The previous lod_level is {1}; the new "
"lod_level is {2}. They are not " "lod_level is {2}. They are not "
"matched".format(self.name, self.lod_level, "matched".format(self.name, self.lod_level,
...@@ -463,18 +480,6 @@ class Variable(object): ...@@ -463,18 +480,6 @@ class Variable(object):
# get_capacity is implemented # get_capacity is implemented
pass pass
if _in_imperative_mode():
# record vars in tracer rather than blocks
self._ivar = kwargs.get("ivar", None)
if not self._ivar:
self._ivar = core.VarBase(stop_gradient)
self._ivar.desc = self.desc
self._ivar.block = block.desc
self._ivar.name = name
self._ivar.persistable = persistable
if persistable:
self.block.vars[name] = self
else:
self.block.vars[name] = self self.block.vars[name] = self
self.op = None self.op = None
self.stop_gradient = stop_gradient self.stop_gradient = stop_gradient
...@@ -547,39 +552,62 @@ class Variable(object): ...@@ -547,39 +552,62 @@ class Variable(object):
def _stop_gradient(self, s): def _stop_gradient(self, s):
if _in_imperative_mode(): if _in_imperative_mode():
self._ivar.stop_gradient = s self._ivar.stop_gradient = s
else:
self.stop_gradient = s self.stop_gradient = s
@property @property
def persistable(self): def persistable(self):
if _in_imperative_mode():
return self._ivar.persistable
else:
return self.desc.persistable() return self.desc.persistable()
@persistable.setter @persistable.setter
def persistable(self, p): def persistable(self, p):
if _in_imperative_mode():
return self._ivar.persistable
else:
self.desc.set_persistable(p) self.desc.set_persistable(p)
@property @property
def name(self): def name(self):
if _in_imperative_mode():
return self._ivar.name
else:
return cpt.to_text(self.desc.name()) return cpt.to_text(self.desc.name())
@name.setter @name.setter
def name(self, new_name): def name(self, new_name):
if _in_imperative_mode():
self._ivar.name = new_name
else:
self.desc.set_name(new_name) self.desc.set_name(new_name)
@property @property
def shape(self): def shape(self):
# convert to tuple, make it as same as numpy API. # convert to tuple, make it as same as numpy API.
if _in_imperative_mode():
return self._ivar.shape
else:
return tuple(self.desc.shape()) return tuple(self.desc.shape())
@property @property
def dtype(self): def dtype(self):
if _in_imperative_mode():
return self._ivar.dtype
else:
return self.desc.dtype() return self.desc.dtype()
@property @property
def lod_level(self): def lod_level(self):
# TODO(minqiyang): Support lod_level in imperative mode
return self.desc.lod_level() return self.desc.lod_level()
@property @property
def type(self): def type(self):
if _in_imperative_mode():
return self._ivar.dtype
else:
return self.desc.type() return self.desc.type()
def _set_error_clip(self, error_clip): def _set_error_clip(self, error_clip):
...@@ -709,6 +737,32 @@ class Operator(object): ...@@ -709,6 +737,32 @@ class Operator(object):
inputs=None, inputs=None,
outputs=None, outputs=None,
attrs=None): attrs=None):
if _in_imperative_mode():
if type is None:
raise ValueError(
"`type` to initialize an Operator cannot be None.")
self.iop = core.OpBase(type)
# TODO(minqiyang): remove these lines after we take apart all
# backward grads and forward variables
self.inputs = defaultdict(list)
if inputs is not None:
for k, v in six.iteritems(inputs):
if isinstance(v, Variable):
self.inputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.inputs[k].extend([var._ivar for var in v])
self.outputs = defaultdict(list)
if outputs is not None:
for k, v in six.iteritems(outputs):
if isinstance(v, Variable):
self.outputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.outputs[k].extend([var._ivar for var in v])
self.attrs = attrs if attrs else {}
else:
self.block = block self.block = block
self.desc = desc self.desc = desc
# note: not add self.attrs here: # note: not add self.attrs here:
...@@ -721,7 +775,8 @@ class Operator(object): ...@@ -721,7 +775,8 @@ class Operator(object):
op_maker = core.op_proto_and_checker_maker op_maker = core.op_proto_and_checker_maker
if op_maker.kOpRoleAttrName() not in op_attrs: if op_maker.kOpRoleAttrName() not in op_attrs:
op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role op_attrs[op_maker.kOpRoleAttrName(
)] = self.block.program.op_role
role_var_name = op_maker.kOpRoleVarAttrName() role_var_name = op_maker.kOpRoleVarAttrName()
if len(self.block.program. if len(self.block.program.
...@@ -784,9 +839,9 @@ class Operator(object): ...@@ -784,9 +839,9 @@ class Operator(object):
if (m.name not in outputs) and m.dispensable: if (m.name not in outputs) and m.dispensable:
continue continue
if not ((m.name in outputs) or m.dispensable): if not ((m.name in outputs) or m.dispensable):
raise ValueError( raise ValueError(("Incorrect setting for output(s) of "
("Incorrect setting for output(s) of " "operator \"%s\", should set: [%s].")
"operator \"%s\", should set: [%s].") % (type, m.name)) % (type, m.name))
for out_proto in proto.outputs: for out_proto in proto.outputs:
if out_proto.name not in outputs: if out_proto.name not in outputs:
continue continue
...@@ -795,8 +850,8 @@ class Operator(object): ...@@ -795,8 +850,8 @@ class Operator(object):
out_args = [out_args] out_args = [out_args]
if not out_proto.duplicable and len(out_args) > 1: if not out_proto.duplicable and len(out_args) > 1:
raise ValueError( raise ValueError(
"Output %s expects only one output, but %d are given." % "Output %s expects only one output, but %d are given."
(out_proto.name, len(out_args))) % (out_proto.name, len(out_args)))
out_arg_names = [] out_arg_names = []
for arg in out_args: for arg in out_args:
out_arg_names.append(cpt.to_text(arg.name)) out_arg_names.append(cpt.to_text(arg.name))
...@@ -810,7 +865,8 @@ class Operator(object): ...@@ -810,7 +865,8 @@ class Operator(object):
raise TypeError("'attrs' should be a dict.") raise TypeError("'attrs' should be a dict.")
for attr in proto.attrs: for attr in proto.attrs:
attr_name = attr.name attr_name = attr.name
if (attr_name not in op_attrs) or (op_attrs[attr_name] is None): if (attr_name not in op_attrs) or (
op_attrs[attr_name] is None):
continue continue
attr_val = op_attrs[attr_name] attr_val = op_attrs[attr_name]
self._update_desc_attr(attr_name, attr_val) self._update_desc_attr(attr_name, attr_val)
...@@ -820,26 +876,6 @@ class Operator(object): ...@@ -820,26 +876,6 @@ class Operator(object):
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc) self.desc.infer_shape(self.block.desc)
if _in_imperative_mode():
self.iop = core.OpBase()
self.iop.desc = self.desc
self.inputs = defaultdict(list)
if inputs is not None:
for k, v in six.iteritems(inputs):
if isinstance(v, Variable):
self.inputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.inputs[k].extend([var._ivar for var in v])
self.outputs = defaultdict(list)
if outputs is not None:
for k, v in six.iteritems(outputs):
if isinstance(v, Variable):
self.outputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.outputs[k].extend([var._ivar for var in v])
def _has_kernel(self, op_type): def _has_kernel(self, op_type):
return op_type not in self.OP_WITHOUT_KERNEL_SET return op_type not in self.OP_WITHOUT_KERNEL_SET
...@@ -1403,16 +1439,15 @@ class Block(object): ...@@ -1403,16 +1439,15 @@ class Block(object):
Returns: Returns:
Operator: the append Operator. Operator: the append Operator.
""" """
op_desc = self.desc.append_op() if _in_imperative_mode():
op = Operator( op = Operator(
block=self, block=self,
desc=op_desc, desc=None,
type=kwargs.get("type", None), type=kwargs.get("type", None),
inputs=kwargs.get("inputs", None), inputs=kwargs.get("inputs", None),
outputs=kwargs.get("outputs", None), outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None)) attrs=kwargs.get("attrs", None))
if _in_imperative_mode():
# record ops in tracer rather than blocks # record ops in tracer rather than blocks
# #
# TODO(minqiyang): add op stop_gradient support in static mode too. # TODO(minqiyang): add op stop_gradient support in static mode too.
...@@ -1420,6 +1455,15 @@ class Block(object): ...@@ -1420,6 +1455,15 @@ class Block(object):
_imperative_tracer().trace_op(op, _imperative_tracer().trace_op(op,
kwargs.get("stop_gradient", False)) kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc.append_op()
op = Operator(
block=self,
desc=op_desc,
type=kwargs.get("type", None),
inputs=kwargs.get("inputs", None),
outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None))
self.ops.append(op) self.ops.append(op)
return op return op
...@@ -1468,19 +1512,27 @@ class Block(object): ...@@ -1468,19 +1512,27 @@ class Block(object):
return self.ops[start:end] return self.ops[start:end]
def _prepend_op(self, *args, **kwargs): def _prepend_op(self, *args, **kwargs):
op_desc = self.desc._prepend_op() if _in_imperative_mode():
op = Operator( op = Operator(
self, self,
op_desc, None,
type=kwargs.get("type", None), type=kwargs.get("type", None),
inputs=kwargs.get("inputs", None), inputs=kwargs.get("inputs", None),
outputs=kwargs.get("outputs", None), outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None)) attrs=kwargs.get("attrs", None))
if _in_imperative_mode():
_imperative_tracer().trace_op(op, _imperative_tracer().trace_op(op,
kwargs.get("stop_gradient", False)) kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc._prepend_op()
op = Operator(
self,
op_desc,
type=kwargs.get("type", None),
inputs=kwargs.get("inputs", None),
outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None))
self.ops.insert(0, op) self.ops.insert(0, op)
return op return op
def _sync_with_cpp(self): def _sync_with_cpp(self):
...@@ -2035,6 +2087,19 @@ class IrGraph(object): ...@@ -2035,6 +2087,19 @@ class IrGraph(object):
self.graph = graph self.graph = graph
self._for_test = for_test self._for_test = for_test
def clone(self):
"""
Create a new, duplicated IrGraph.
Notes:
The method only clones the graph structure, not its attributes.
Returns:
IrGraph: the duplicated graph.
"""
g = self.graph.clone()
return IrGraph(g, self._for_test)
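A minimal usage sketch of clone(), assuming `main` is a fluid.Program that already holds some ops (as in the tests above); it mirrors the assertion the test suite makes, and mutations to either graph afterwards do not affect the other:

import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import IrGraph

graph = IrGraph(core.Graph(main.desc), for_test=False)
backup = graph.clone()
# structure is duplicated node for node
assert len(graph.all_nodes()) == len(backup.all_nodes())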
def is_test(self): def is_test(self):
""" """
If the graph is used for testing, the function returns true. Otherwise, returns false. If the graph is used for testing, the function returns true. Otherwise, returns false.
...@@ -2265,10 +2330,10 @@ class IrGraph(object): ...@@ -2265,10 +2330,10 @@ class IrGraph(object):
Notes: the `graph` cannot contain a cycle. Notes: the `graph` cannot contain a cycle.
Returns: Returns:
set(IrNode): nodes in topology order. list(IrNode): nodes in topology order.
""" """
ordered_nodes = core.topology_sort(self.graph) ordered_nodes = core.topology_sort(self.graph)
return {IrNode(n) for n in ordered_nodes} return [IrNode(n) for n in ordered_nodes]
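Because topology_sort now returns a list instead of a set, callers can rely on iteration order; a short sketch, given an IrGraph as above:

ordered = graph.topology_sort()
# every node appears after the nodes it depends on
names_in_order = [n.name() for n in ordered]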
def build_adjacency_list(self): def build_adjacency_list(self):
""" """
...@@ -2336,7 +2401,7 @@ class IrGraph(object): ...@@ -2336,7 +2401,7 @@ class IrGraph(object):
""" """
Convert the graph into a Program. Convert the graph into a Program.
Notes: When the graph includes backward operator nodes, the WARN: When the graph includes backward operator nodes, the
conversion may fail. Usually, this function is conversion may fail. Usually, this function is
only used to convert a test graph. only used to convert a test graph.
......
...@@ -258,7 +258,7 @@ class PyLayer(core.PyLayer): ...@@ -258,7 +258,7 @@ class PyLayer(core.PyLayer):
cls.backward_id = core.PyLayer.num_funcs() + 1 cls.backward_id = core.PyLayer.num_funcs() + 1
PyLayer.register_func(cls.backward_id, cls._do_backward) PyLayer.register_func(cls.backward_id, cls._do_backward)
iop = core.OpBase() iop = core.OpBase(cls.__class__.__name__ + str(cls.forward_id))
iop.forward_id = cls.forward_id iop.forward_id = cls.forward_id
iop.backward_id = cls.backward_id iop.backward_id = cls.backward_id
block.ops.append(iop) block.ops.append(iop)
......
...@@ -22,7 +22,8 @@ from . import layers ...@@ -22,7 +22,8 @@ from . import layers
from ..framework import Variable, OpProtoHolder from ..framework import Variable, OpProtoHolder
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding']
__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit']
class Conv2D(layers.Layer): class Conv2D(layers.Layer):
...@@ -112,7 +113,7 @@ class Conv2D(layers.Layer): ...@@ -112,7 +113,7 @@ class Conv2D(layers.Layer):
'strides': self._stride, 'strides': self._stride,
'paddings': self._padding, 'paddings': self._padding,
'dilations': self._dilation, 'dilations': self._dilation,
'groups': self._groups, 'groups': self._groups if self._groups else 1,
'use_cudnn': self._use_cudnn, 'use_cudnn': self._use_cudnn,
'use_mkldnn': False, 'use_mkldnn': False,
}) })
...@@ -468,3 +469,137 @@ class Embedding(layers.Layer): ...@@ -468,3 +469,137 @@ class Embedding(layers.Layer):
}) })
return out return out
class GRUUnit(layers.Layer):
"""
**GRU unit layer**
if origin_mode is True, then the equation of a gru step is from paper
`Learning Phrase Representations using RNN Encoder-Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
if origin_mode is False, then the equation of a gru step is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
The inputs of the gru unit include :math:`z_t` and :math:`h_{t-1}`. In terms
of the equation above, the :math:`z_t` is split into 3 parts -
:math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
implement a full GRU unit operator for an input, a fully
connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is
an intermediate candidate hidden output, which is denoted by :math:`m_t`.
This layer has three outputs: :math:`h_t`, :math:`dot(r_t, h_{t-1})`,
and the concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
Args:
input (Variable): The fc transformed input value of current step.
name_scope (str): See base class.
hidden (Variable): The hidden value of gru unit from previous step.
size (integer): The input dimension value.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight matrix. Note:
- The shape of the weight matrix is :math:`(D \\times 3D)`, where
:math:`D` is the hidden size.
- The weight matrix is divided into two parts: the first part holds
the weights of the update gate and reset gate, with shape
:math:`(D \\times 2D)`; the second part holds the weights of the
candidate hidden state, with shape :math:`(D \\times D)`.
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with shape :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, gru_unit will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized to zero. Default: None.
activation (string): The activation type for cell (actNode).
Default: 'tanh'
gate_activation (string): The activation type for gates (actGate).
Default: 'sigmoid'
Returns:
tuple: The hidden value, reset-hidden value and gate values.
"""
def __init__(self,
name_scope,
size,
param_attr=None,
bias_attr=None,
activation='tanh',
gate_activation='sigmoid',
origin_mode=False,
dtype='float32'):
super(GRUUnit, self).__init__(name_scope)
activation_dict = dict(
identity=0,
sigmoid=1,
tanh=2,
relu=3, )
self._activation = activation_dict[activation]
self._gate_activation = activation_dict[gate_activation]
# NOTE: origin_mode is accepted but not yet wired into the op attrs
self._dtype = dtype
size = size // 3
# create weight
self._weight = self.create_parameter(
attr=param_attr, shape=[size, 3 * size], dtype=dtype)
# create bias
bias_size = [1, 3 * size]
self._bias = self.create_parameter(
attr=bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
def forward(self, input, hidden):
inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': self._weight}
if self._bias is not None:
inputs['Bias'] = self._bias
gate = self._helper.create_variable_for_type_inference(self._dtype)
reset_hidden_pre = self._helper.create_variable_for_type_inference(
self._dtype)
updated_hidden = self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op(
type='gru_unit',
inputs=inputs,
outputs={
'Gate': gate,
'ResetHiddenPrev': reset_hidden_pre,
'Hidden': updated_hidden,
},
attrs={
# use the activations chosen in __init__ rather than
# hard-coded tanh/sigmoid codes
'activation': self._activation,
'gate_activation': self._gate_activation,
})
return updated_hidden, reset_hidden_pre, gate
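A hedged usage sketch of the new GRUUnit layer under the imperative guard, mirroring test_gru_unit further below; the hidden size D and the batch of 3 steps are illustrative:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative import nn, base

D = 5
step_input = np.random.rand(3, 3 * D).astype('float32')  # fc-transformed input
prev_hidden = np.random.rand(3, D).astype('float32')
with fluid.imperative.guard():
    gru = nn.GRUUnit('gru', size=D * 3)
    hidden, reset_hidden_pre, gate = gru(
        base.to_variable(step_input), base.to_variable(prev_hidden))
    # hidden has the same shape as prev_hidden
    print(hidden._numpy().shape)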
...@@ -36,14 +36,21 @@ class Tracer(core.Tracer): ...@@ -36,14 +36,21 @@ class Tracer(core.Tracer):
super(Tracer, self).__init__(block) super(Tracer, self).__init__(block)
self._ops = defaultdict() self._ops = defaultdict()
self._vars = defaultdict()
self._trace_id = 0 self._trace_id = 0
def trace_var(self, name, var):
self._vars[name] = var
def all_parameters(self):
return list((item for name, item in six.iteritems(self._vars)
if isinstance(item, framework.Parameter)))
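trace_var registers each persistable variable with the tracer, so all_parameters can later recover the trainable set without walking blocks; a hedged sketch of the contract:

# hypothetical: inside an imperative guard, after layers have created
# their parameters, the tracer can enumerate them directly
params = framework._imperative_tracer().all_parameters()
assert all(isinstance(p, framework.Parameter) for p in params)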
def trace_op(self, op, stop_gradient=False): def trace_op(self, op, stop_gradient=False):
# record op's trace id # record op's trace id
op.iop._trace_id = self._trace_id op.iop._trace_id = self._trace_id
# trace op and save it backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.attrs,
backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.block.desc,
framework._current_expected_place(), framework._current_expected_place(),
stop_gradient) stop_gradient)
......
...@@ -10704,8 +10704,9 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): ...@@ -10704,8 +10704,9 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
similarity_matrix = matmul( similarity_matrix = matmul(
anchor, positive, transpose_x=False, transpose_y=True) anchor, positive, transpose_x=False, transpose_y=True)
softmax_value = softmax(similarity_matrix) softmax_ce = softmax_with_cross_entropy(
cross_entropy = -1 * reduce_sum(labels * log(softmax_value), 0) logits=similarity_matrix, label=labels, soft_label=True)
cross_entropy = reduce_sum(labels * softmax_ce, 0)
celoss = reduce_mean(cross_entropy) celoss = reduce_mean(cross_entropy)
return l2loss + celoss return l2loss + celoss
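The rewrite leans on the fused softmax_with_cross_entropy op; a minimal numpy sketch of what that op returns for soft labels (the names here are illustrative):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))  # shift for numerical stability
    return e / e.sum(axis=-1, keepdims=True)

logits = np.random.rand(4, 4).astype('float32')
labels = softmax(np.random.rand(4, 4)).astype('float32')  # soft labels, rows sum to 1
# per-row soft-label cross entropy, computed in one fused, stable step
ce = -np.sum(labels * np.log(softmax(logits)), axis=-1, keepdims=True)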
...@@ -23,6 +23,7 @@ __activations_noattr__ = [ ...@@ -23,6 +23,7 @@ __activations_noattr__ = [
'logsigmoid', 'logsigmoid',
'exp', 'exp',
'tanh', 'tanh',
'atan',
'tanh_shrink', 'tanh_shrink',
'softshrink', 'softshrink',
'sqrt', 'sqrt',
...@@ -30,6 +31,8 @@ __activations_noattr__ = [ ...@@ -30,6 +31,8 @@ __activations_noattr__ = [
'ceil', 'ceil',
'floor', 'floor',
'cos', 'cos',
'acos',
'asin',
'sin', 'sin',
'round', 'round',
'reciprocal', 'reciprocal',
......
...@@ -377,17 +377,16 @@ class Optimizer(object): ...@@ -377,17 +377,16 @@ class Optimizer(object):
and list of (param, grad) Variables pair for optimization. and list of (param, grad) Variables pair for optimization.
""" """
self._dtype = loss.dtype self._dtype = loss.dtype
program = loss.block.program
optimize_ops = [] optimize_ops = []
if framework._in_imperative_mode(): if framework._in_imperative_mode():
if parameter_list is not None: if parameter_list is not None:
parameters = parameter_list parameters = parameter_list
else: else:
parameters = program.global_block().all_parameters() parameters = framework._imperative_tracer().all_parameters()
params_grads = [] params_grads = []
for param in parameters: for param in parameters:
if param.stop_gradient or not param.trainable: if not param.trainable:
continue continue
# create gradient variable # create gradient variable
grad_var = Variable( grad_var = Variable(
...@@ -396,9 +395,11 @@ class Optimizer(object): ...@@ -396,9 +395,11 @@ class Optimizer(object):
stop_gradient=True, stop_gradient=True,
ivar=param._ivar._grad_ivar()) ivar=param._ivar._grad_ivar())
params_grads.append((param, grad_var)) params_grads.append((param, grad_var))
with program_guard(program, startup_program): with program_guard(framework.default_main_program(),
framework.default_startup_program()):
optimize_ops = self._create_optimization_pass(params_grads) optimize_ops = self._create_optimization_pass(params_grads)
else: else:
program = loss.block.program
with program_guard(program, startup_program): with program_guard(program, startup_program):
params_grads = self.backward(loss, startup_program, params_grads = self.backward(loss, startup_program,
parameter_list, no_grad_set) parameter_list, no_grad_set)
......
...@@ -22,6 +22,7 @@ import six ...@@ -22,6 +22,7 @@ import six
import time import time
import itertools import itertools
import collections import collections
from collections import defaultdict
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -257,8 +258,65 @@ class OpTest(unittest.TestCase): ...@@ -257,8 +258,65 @@ class OpTest(unittest.TestCase):
outs, _ = self._calc_output(place) outs, _ = self._calc_output(place)
return outs return outs
def _calc_output(self, place, parallel=False, no_check_set=None): def _create_var_from_numpy(self, value):
if isinstance(value, tuple):
data = value[0]
lod = value[1]
v = fluid.imperative.base.to_variable(value=data)
v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod)
return v
else:
return fluid.imperative.base.to_variable(value)
def _calc_imperative_output(self, place, parallel=False, no_check_set=None):
with fluid.imperative.base.guard(place=place):
block = fluid.default_main_program().global_block()
# prepare input variable
inputs = defaultdict(list)
for name, np_value in six.iteritems(self.inputs):
if not isinstance(np_value, list):
np_value = [np_value]
for i in range(len(np_value)):
inputs[name].append(
self._create_var_from_numpy(np_value[i]))
# prepare output variable
outputs = defaultdict(list)
for name, np_value in six.iteritems(self.outputs):
if not isinstance(np_value, list):
np_value = [np_value]
for i in range(len(np_value)):
value = np_value[i]
if isinstance(value, tuple):
v = block.create_var(
name="%s_out%d" % (name, i),
dtype=value[0].dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=False)
v._ivar.value().get_tensor(
).set_recursive_sequence_lengths(value[1])
else:
v = block.create_var(
name="%s_out%d" % (name, i),
dtype=value.dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=False)
outputs[name].append(v)
block.append_op(
type=self.op_type,
inputs=inputs,
outputs=outputs,
attrs=self.attrs)
return outputs
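A hedged sketch of how a concrete op test opts into the new imperative check; TestMyRelu is hypothetical, and relu is chosen only because its numpy reference is one line:

import numpy as np

class TestMyRelu(OpTest):  # hypothetical example
    def setUp(self):
        self.op_type = "relu"
        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
        self.inputs = {'X': x}
        self.outputs = {'Out': np.maximum(x, 0)}

    def test_check_output(self):
        # exercises the op through both the static graph and the imperative trace
        self.check_output(check_imperative=True)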
def _calc_output(self, place, parallel=False, no_check_set=None):
program = Program() program = Program()
block = program.global_block() block = program.global_block()
self._append_ops(block) self._append_ops(block)
...@@ -305,8 +363,13 @@ class OpTest(unittest.TestCase): ...@@ -305,8 +363,13 @@ class OpTest(unittest.TestCase):
place, place,
atol, atol,
no_check_set=None, no_check_set=None,
equal_nan=False): equal_nan=False,
check_imperative=False):
if check_imperative:
imperative_outs = self._calc_imperative_output(
place, no_check_set=no_check_set)
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
for out_name, out_dup in Operator.get_op_outputs(self.op_type): for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs: if out_name not in self.outputs:
continue continue
...@@ -330,6 +393,10 @@ class OpTest(unittest.TestCase): ...@@ -330,6 +393,10 @@ class OpTest(unittest.TestCase):
type(sub_out)) type(sub_out))
for item in sub_out: for item in sub_out:
sub_out_name, expect = item[0], item[1] sub_out_name, expect = item[0], item[1]
if check_imperative:
imperative_actual = imperative_outs[sub_out_name][0]
imperative_actual_t = np.array(
imperative_actual._ivar.value().get_tensor())
idx = find_actual(sub_out_name, fetch_list) idx = find_actual(sub_out_name, fetch_list)
actual = outs[idx] actual = outs[idx]
actual_t = np.array(actual) actual_t = np.array(actual)
...@@ -340,12 +407,31 @@ class OpTest(unittest.TestCase): ...@@ -340,12 +407,31 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place)) str(place))
if check_imperative:
self.assertTrue(
np.allclose(
imperative_actual_t,
expect_t,
atol=atol,
equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " +
str(place) + " in imperative mode")
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual( self.assertListEqual(
actual.recursive_sequence_lengths(), expect[1], actual.recursive_sequence_lengths(), expect[1],
"Output (" + sub_out_name + "Output (" + sub_out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
if check_imperative:
self.assertListEqual(
imperative_actual._ivar.value().get_tensor()
.recursive_sequence_lengths(), expect[1],
"Output (" + out_name + ") has different lod at " +
str(place) + " in imperative mode")
else: else:
if check_imperative:
imperative_actual = imperative_outs[out_name][0]
imperative_actual_t = np.array(
imperative_actual._ivar.value().get_tensor())
idx = find_actual(out_name, fetch_list) idx = find_actual(out_name, fetch_list)
actual = outs[idx] actual = outs[idx]
actual_t = np.array(actual) actual_t = np.array(actual)
...@@ -357,10 +443,27 @@ class OpTest(unittest.TestCase): ...@@ -357,10 +443,27 @@ class OpTest(unittest.TestCase):
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t) + " in class " + self.__class__.__name__) str(actual_t) + " in class " + self.__class__.__name__)
if check_imperative:
self.assertTrue(
np.allclose(
imperative_actual_t,
expect_t,
atol=atol,
equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" +
str(imperative_actual_t) + " in class " +
self.__class__.__name__)
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
if check_imperative:
self.assertListEqual(
imperative_actual._ivar.value().get_tensor()
.recursive_sequence_lengths(), expect[1],
"Output (" + out_name + ") has different lod at " +
str(place) + " in imperative mode")
def _get_places(self): def _get_places(self):
if self.dtype == np.float16: if self.dtype == np.float16:
...@@ -383,10 +486,15 @@ class OpTest(unittest.TestCase): ...@@ -383,10 +486,15 @@ class OpTest(unittest.TestCase):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
return places return places
def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False): def check_output(self,
atol=1e-5,
no_check_set=None,
equal_nan=False,
check_imperative=False):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_output_with_place(place, atol, no_check_set, equal_nan) self.check_output_with_place(place, atol, no_check_set, equal_nan,
check_imperative)
def check_output_customized(self, checker): def check_output_customized(self, checker):
places = self._get_places() places = self._get_places()
......
...@@ -26,8 +26,8 @@ class TestAccuracyOp(OpTest): ...@@ -26,8 +26,8 @@ class TestAccuracyOp(OpTest):
self.init_dtype() self.init_dtype()
n = 8192 n = 8192
infer = np.random.random((n, 1)).astype(self.dtype) infer = np.random.random((n, 1)).astype(self.dtype)
indices = np.random.randint(0, 2, (n, 1)) indices = np.random.randint(0, 2, (n, 1)).astype('int64')
label = np.random.randint(0, 2, (n, 1)) label = np.random.randint(0, 2, (n, 1)).astype('int64')
self.inputs = {'Out': infer, 'Indices': indices, "Label": label} self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
num_correct = 0 num_correct = 0
for rowid in range(n): for rowid in range(n):
......
...@@ -100,6 +100,23 @@ class TestTanh(TestActivation): ...@@ -100,6 +100,23 @@ class TestTanh(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAtan(TestActivation):
def setUp(self):
self.op_type = "atan"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.arctan(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
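As a sanity check on what check_grad verifies numerically, the analytic derivative of atan, 1/(1+x^2), agrees with a central finite difference:

import numpy as np

x = np.random.uniform(0.1, 1, [11, 17])
eps = 1e-6
numeric = (np.arctan(x + eps) - np.arctan(x - eps)) / (2 * eps)
analytic = 1.0 / (1.0 + x * x)
assert np.allclose(numeric, analytic, rtol=1e-5)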
class TestTanhShrink(TestActivation): class TestTanhShrink(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "tanh_shrink" self.op_type = "tanh_shrink"
...@@ -248,6 +265,23 @@ class TestCos(TestActivation): ...@@ -248,6 +265,23 @@ class TestCos(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAcos(TestActivation):
def setUp(self):
self.op_type = "acos"
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.arccos(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestSin(TestActivation): class TestSin(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "sin" self.op_type = "sin"
...@@ -265,6 +299,23 @@ class TestSin(TestActivation): ...@@ -265,6 +299,23 @@ class TestSin(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAsin(TestActivation):
def setUp(self):
self.op_type = "asin"
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.arcsin(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestRound(TestActivation): class TestRound(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "round" self.op_type = "round"
...@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs) ...@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs)
create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class(TestCeil, grad_check=False)
create_test_act_fp16_class(TestFloor, grad_check=False) create_test_act_fp16_class(TestFloor, grad_check=False)
create_test_act_fp16_class(TestCos, grad_atol=0.85) create_test_act_fp16_class(TestCos, grad_atol=0.85)
create_test_act_fp16_class(TestAcos, grad_atol=0.85)
create_test_act_fp16_class(TestSin) create_test_act_fp16_class(TestSin)
create_test_act_fp16_class(TestAsin)
create_test_act_fp16_class(TestAtan)
create_test_act_fp16_class(TestRound, grad_check=False) create_test_act_fp16_class(TestRound, grad_check=False)
create_test_act_fp16_class(TestRelu) create_test_act_fp16_class(TestRelu)
create_test_act_fp16_class(TestGelu) create_test_act_fp16_class(TestGelu)
......
...@@ -16,8 +16,7 @@ import os ...@@ -16,8 +16,7 @@ import os
import unittest import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
os.environ['RECORDIO_FILENAME'] = '/tmp/eager_deletion_transformer.wmt16.recordio' os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio'
from test_parallel_executor_transformer import TestTransformer from test_parallel_executor_transformer import TestTransformer
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
os.environ['CPU_NUM'] = '2'
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['FLAGS_fast_eager_deletion_mode'] = '1'
import unittest
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.executor import Executor
import paddle.fluid.core as core
from paddle.fluid.backward import append_backward
import paddle.fluid.compiler as compiler
import numpy
import multiprocessing
class TestEagerDeletionWhileOpBase(unittest.TestCase):
def test_main(self):
places = [core.CPUPlace(), ]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for p in places:
for with_data_parallel in [False, True]:
with fluid.program_guard(fluid.Program(), fluid.Program()):
with fluid.scope_guard(fluid.Scope()):
self.run_main(p, with_data_parallel)
def run_main(self, place, with_data_parallel):
self.place = place
self.with_data_parallel = with_data_parallel
if not core.is_compiled_with_cuda() and isinstance(self.place,
core.CUDAPlace):
return
if isinstance(self.place, core.CUDAPlace):
device_cnt = core.get_cuda_device_count(
) if self.with_data_parallel else 1
else:
device_cnt = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count(
))) if self.with_data_parallel else 1
d0 = layers.data(
"d0", shape=[10], append_batch_size=False, dtype='float32')
d1 = layers.data(
"d1", shape=[10], append_batch_size=False, dtype='float32')
d2 = layers.data(
"d2", shape=[10], append_batch_size=False, dtype='float32')
i = layers.zeros(shape=[1], dtype='int64')
i.stop_gradient = True
init = layers.zeros(shape=[10], dtype='float32')
mem_array = layers.array_write(x=init, i=i)
data_array = layers.array_write(x=d0, i=i)
i = layers.increment(i)
layers.array_write(d1, i, array=data_array)
i = layers.increment(i)
layers.array_write(d2, i, array=data_array)
i = layers.zeros(shape=[1], dtype='int64')
i.stop_gradient = True
array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
array_len.stop_gradient = True
cond = layers.less_than(x=i, y=array_len)
j = layers.fill_constant(shape=[1], dtype='int64', value=1)
j.stop_gradient = True
array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
array_len2.stop_gradient = True
cond2 = layers.less_than(x=j, y=array_len2)
while_op = layers.While(cond=cond)
while_op2 = layers.While(cond=cond2)
with while_op.block():
d = layers.array_read(array=data_array, i=i)
prev = layers.array_read(array=mem_array, i=i)
d = layers.reshape(d, shape=[10])
prev = layers.reshape(prev, shape=[10])
result = layers.sums(input=[d, prev])
i = layers.increment(x=i, in_place=True)
layers.array_write(result, i=i, array=mem_array)
layers.less_than(x=i, y=array_len, cond=cond)
with while_op2.block():
d2 = layers.array_read(array=data_array, i=j)
prev2 = layers.array_read(array=mem_array, i=j)
d2 = layers.reshape(d2, shape=[10])
prev2 = layers.reshape(prev2, shape=[10])
result2 = layers.sums(input=[d2, prev2])
j = layers.increment(x=j, in_place=True)
layers.array_write(result2, i=j, array=mem_array)
layers.less_than(x=j, y=array_len2, cond=cond2)
sum_result = layers.array_read(array=mem_array, i=j)
sum_result.persistable = True
tmp = layers.unsqueeze(sum_result, axes=[0])
tmp = layers.expand(tmp, expand_times=[10, 1])
fc = layers.fc(tmp, size=256)
loss = layers.mean(sum_result)
optim = fluid.optimizer.Adam(learning_rate=1e-3)
optim.minimize(loss)
exe = Executor(self.place)
exe.run(fluid.default_startup_program())
prog = compiler.CompiledProgram(fluid.default_main_program())
if self.with_data_parallel:
prog = prog.with_data_parallel()
for _ in range(5):
d = []
for i in range(3):
tmp = numpy.random.random(size=[10]).astype('float32')
if not self.with_data_parallel:
d.append(tmp)
else:
d.append(numpy.array([tmp] * device_cnt))
outs = exe.run(program=prog,
feed={'d0': d[0],
'd1': d[1],
'd2': d[2]},
fetch_list=[sum_result])
self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
if __name__ == '__main__':
unittest.main()
...@@ -31,6 +31,80 @@ def dequantize_max_abs(x, scale, max_range): ...@@ -31,6 +31,80 @@ def dequantize_max_abs(x, scale, max_range):
return y return y
def channel_wise_quantize_max_abs(x, quant_bit=8):
scales = []
for i in range(x.shape[0]):
scales.append(np.max(np.abs(x[i])).astype("float32"))
y = x.copy()
max_range = math.pow(2, quant_bit - 1) - 1
for i, scale in enumerate(scales):
y[i] = np.round(y[i] / scale * max_range)
return y, scales
def channel_wise_dequantize_max_abs(x,
scales,
quant_bits,
activation_scale=None):
y = x.copy()
for i in range(x.shape[0]):
y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i]
if activation_scale is not None:
y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
return y
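A round-trip sketch built on the two helpers above: quantizing and then dequantizing reproduces the input to within one quantization step per channel (channel here is axis 0, as in the helpers):

import math
import numpy as np

x = np.random.randn(4, 3, 8, 8).astype('float32')
yq, scales = channel_wise_quantize_max_abs(x, quant_bit=8)
ydq = channel_wise_dequantize_max_abs(yq, scales, quant_bits=[8])
# rounding error is at most half a step of scale / (2^7 - 1) per channel
step = np.array(scales).reshape(-1, 1, 1, 1) / (math.pow(2, 7) - 1)
assert np.all(np.abs(ydq - x) <= step)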
class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
def set_args(self):
self.quant_bits = [8, 8]
self.data_type = "float32"
self.activation_scale = 0.7861
def setUp(self):
self.set_args()
self.op_type = "fake_channel_wise_dequantize_max_abs"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
self.activation_scale)
self.inputs = {
'X': yq,
'Scales': [("scales0", np.array(scales).astype(self.data_type)),
("scales1", np.array(
[self.activation_scale]).astype(self.data_type))]
}
self.attrs = {'quant_bits': self.quant_bits}
self.outputs = {'Out': ydq}
def test_check_output(self):
self.check_output()
class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest):
def set_args(self):
self.quant_bits = [8]
self.data_type = "float32"
def setUp(self):
self.set_args()
self.op_type = "fake_channel_wise_dequantize_max_abs"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits)
self.inputs = {
'X': yq,
'Scales': [("scales0", np.array(scales).astype(self.data_type))]
}
self.attrs = {'quant_bits': self.quant_bits}
self.outputs = {'Out': ydq}
def test_check_output(self):
self.check_output()
class TestFakeDequantizeMaxAbsOp(OpTest): class TestFakeDequantizeMaxAbsOp(OpTest):
def set_args(self): def set_args(self):
self.num_bits = 8 self.num_bits = 8
......
...@@ -35,6 +35,30 @@ class TestFakeQuantizeOp(OpTest): ...@@ -35,6 +35,30 @@ class TestFakeQuantizeOp(OpTest):
self.check_output() self.check_output()
class TestFakeChannelWiseQuantizeOp(OpTest):
def setUp(self):
self.op_type = "fake_channel_wise_quantize_abs_max"
self.attrs = {'bit_length': 8}
self.inputs = {
'X': np.random.random((4, 3, 64, 64)).astype("float32"),
}
scales = []
for i in range(self.inputs['X'].shape[0]):
scales.append(np.max(np.abs(self.inputs['X'][i])).astype("float32"))
outputs = self.inputs['X'].copy()
for i, scale in enumerate(scales):
outputs[i] = np.round(outputs[i] / scale * (
(1 << (self.attrs['bit_length'] - 1)) - 1))
self.outputs = {
'Out': outputs,
'OutScales': np.array(scales).astype("float32"),
}
def test_check_output(self):
self.check_output()
class TestFakeQuantizeRangeAbsMaxOp(OpTest): class TestFakeQuantizeRangeAbsMaxOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "fake_quantize_range_abs_max" self.op_type = "fake_quantize_range_abs_max"
......
...@@ -156,7 +156,7 @@ class TestGRUOp(OpTest): ...@@ -156,7 +156,7 @@ class TestGRUOp(OpTest):
} }
def test_check_output(self): def test_check_output(self):
self.check_output(atol=1e-8) self.check_output(atol=1e-8, check_imperative=True)
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
......
...@@ -152,7 +152,7 @@ class SimpleRNNCell(fluid.imperative.Layer): ...@@ -152,7 +152,7 @@ class SimpleRNNCell(fluid.imperative.Layer):
type='reduce_sum', type='reduce_sum',
inputs={'X': softmax_out}, inputs={'X': softmax_out},
outputs={'Out': reduce_out}, outputs={'Out': reduce_out},
attrs={'dim': None, attrs={'dim': [],
'keep_dim': False, 'keep_dim': False,
'reduce_all': True}) 'reduce_all': True})
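The dim attr switches from None to an empty list, presumably because the attribute is typed as a list of ints and None cannot be serialized for the imperative trace; with reduce_all=True the value is ignored anyway, as this numpy analogue suggests:

import numpy as np
x = np.random.rand(2, 3)
# reduce_all is equivalent to summing over every axis, whatever dim says
assert np.isclose(x.sum(), np.sum(x, axis=None))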
......
...@@ -277,7 +277,7 @@ class TestImperativeResnet(unittest.TestCase): ...@@ -277,7 +277,7 @@ class TestImperativeResnet(unittest.TestCase):
dy_grad_value = {} dy_grad_value = {}
for param in resnet.parameters(): for param in resnet.parameters():
if not param.stop_gradient: if param.trainable:
np_array = np.array(param._ivar._grad_ivar().value() np_array = np.array(param._ivar._grad_ivar().value()
.get_tensor()) .get_tensor())
dy_grad_value[param.name + core.grad_var_suffix( dy_grad_value[param.name + core.grad_var_suffix(
...@@ -322,7 +322,7 @@ class TestImperativeResnet(unittest.TestCase): ...@@ -322,7 +322,7 @@ class TestImperativeResnet(unittest.TestCase):
for param in resnet.parameters(): for param in resnet.parameters():
static_param_name_list.append(param.name) static_param_name_list.append(param.name)
for param in resnet.parameters(): for param in resnet.parameters():
if not param.stop_gradient: if param.trainable:
static_grad_name_list.append(param.name + static_grad_name_list.append(param.name +
core.grad_var_suffix()) core.grad_var_suffix())
......
...@@ -15,13 +15,143 @@ ...@@ -15,13 +15,143 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import paddle.fluid.layers as layers import contextlib
import numpy as np
import decorators
import paddle
import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places from paddle.fluid.layers.device import get_places
import paddle.fluid.nets as nets import paddle.fluid.nets as nets
from paddle.fluid.framework import Program, program_guard, default_main_program from paddle.fluid.framework import Program, program_guard, default_main_program
from paddle.fluid.param_attr import ParamAttr from paddle.fluid.param_attr import ParamAttr
import decorators from paddle.fluid import core
from paddle.fluid.initializer import Constant from paddle.fluid.initializer import Constant
import paddle.fluid.layers as layers
from test_imperative_base import new_program_scope
from paddle.fluid.imperative import nn
from paddle.fluid.imperative import base
class LayerTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.seed = 111
@classmethod
def tearDownClass(cls):
pass
def _get_place(self):
if core.is_compiled_with_cuda():
return core.CUDAPlace(0)
return core.CPUPlace()
@contextlib.contextmanager
def static_graph(self):
with new_program_scope():
fluid.default_startup_program().random_seed = self.seed
fluid.default_main_program().random_seed = self.seed
yield
def get_static_graph_result(self, feed, fetch_list):
exe = fluid.Executor(self._get_place())
exe.run(fluid.default_startup_program())
return exe.run(fluid.default_main_program(),
feed=feed,
fetch_list=fetch_list)
@contextlib.contextmanager
def dynamic_graph(self):
with fluid.imperative.guard(self._get_place()):
fluid.default_startup_program().random_seed = self.seed
fluid.default_main_program().random_seed = self.seed
yield
class TestLayer(LayerTest):
def test_relu(self):
with self.static_graph():
t = layers.data(name='t', shape=[3, 3], dtype='float32')
ret = layers.relu(t)
static_ret = self.get_static_graph_result(
feed={'t': np.ones(
[3, 3], dtype='float32')}, fetch_list=[ret])[0]
with self.dynamic_graph():
t = np.ones([3, 3], dtype='float32')
dy_ret = layers.relu(base.to_variable(t))
self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
def test_conv2d(self):
with self.static_graph():
images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
ret = layers.conv2d(input=images, num_filters=3, filter_size=[2, 2])
static_ret = self.get_static_graph_result(
feed={'pixel': np.ones(
[2, 3, 5, 5], dtype='float32')},
fetch_list=[ret])[0]
with self.static_graph():
images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
conv2d = nn.Conv2D(
'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
ret = conv2d(images)
static_ret2 = self.get_static_graph_result(
feed={'pixel': np.ones(
[2, 3, 5, 5], dtype='float32')},
fetch_list=[ret])[0]
with self.dynamic_graph():
images = np.ones([2, 3, 5, 5], dtype='float32')
conv2d = nn.Conv2D(
'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
dy_ret = conv2d(base.to_variable(images))
self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
self.assertTrue(np.allclose(static_ret, static_ret2))
def test_gru_unit(self):
lod = [[2, 4, 3]]
D = 5
T = sum(lod[0])
N = len(lod[0])
input = np.random.rand(T, 3 * D).astype('float32')
hidden_input = np.random.rand(T, D).astype('float32')
with self.static_graph():
x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
input=x, hidden=hidden, size=D * 3)
static_ret = self.get_static_graph_result(
feed={'x': input,
'hidden': hidden_input},
fetch_list=[updated_hidden, reset_hidden_pre, gate])
with self.static_graph():
x = layers.data(name='x', shape=[-1, D * 3], dtype='float32')
hidden = layers.data(name='hidden', shape=[-1, D], dtype='float32')
updated_hidden, reset_hidden_pre, gate = layers.gru_unit(
input=x, hidden=hidden, size=D * 3)
gru = nn.GRUUnit('gru', size=D * 3)
updated_hidden, reset_hidden_pre, gate = gru(x, hidden)
static_ret2 = self.get_static_graph_result(
feed={'x': input,
'hidden': hidden_input},
fetch_list=[updated_hidden, reset_hidden_pre, gate])
with self.dynamic_graph():
gru = nn.GRUUnit('gru', size=D * 3)
dy_ret = gru(
base.to_variable(input), base.to_variable(hidden_input))
for i in range(len(static_ret)):
self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy()))
class TestBook(unittest.TestCase): class TestBook(unittest.TestCase):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55"
os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio'
from test_parallel_executor_transformer import TestTransformer
if __name__ == '__main__':
unittest.main()
...@@ -31,7 +31,7 @@ class TestRandomCropOp(OpTest): ...@@ -31,7 +31,7 @@ class TestRandomCropOp(OpTest):
np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32) np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32)
] ]
self.op_type = "random_crop" self.op_type = "random_crop"
self.inputs = {'X': to_crop, 'Seed': np.array([10])} self.inputs = {'X': to_crop, 'Seed': np.array([10]).astype('int64')}
self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])} self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])}
self.attrs = {'shape': [2, 3]} self.attrs = {'shape': [2, 3]}
......
@@ -30,6 +30,6 @@ if error:
        '''If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI:
        1. cd ${paddle_path}, compile paddle;
        2. pip install build/python/dist/(build whl package);
-        3. run "python tools/print_signatures.py paddle.fluid, paddle.reader > paddle/fluid/API.spec"'''
+        3. run "python tools/print_signatures.py paddle.fluid,paddle.reader > paddle/fluid/API.spec"'''
    )
    sys.exit(1)
@@ -51,6 +51,8 @@ def visit_member(parent_name, member):
        all = (args, doc)
        member_dict[cur_name] = all
    except TypeError:  # special for PyBind method
        if cur_name in check_modules_list:
            return
        member_dict[cur_name] = " ".join([
            line.strip() for line in pydoc.render_doc(member).split('\n')
            if "->" in line
@@ -78,6 +80,7 @@ def visit_all_module(mod):
        visit_member(mod.__name__, instance)

check_modules_list = ["paddle.reader.ComposeNotAligned.__init__"]
modules = sys.argv[1].split(",")
for m in modules:
    visit_all_module(importlib.import_module(m))
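The new check_modules_list short-circuits visit_member for members whose signature cannot be introspected and whose pydoc rendering should not be pinned in API.spec (here, the PyBind-backed paddle.reader.ComposeNotAligned.__init__). Everything else that raises TypeError falls through to the fallback, which scrapes signature-looking lines out of the rendered documentation. A self-contained sketch of that fallback applied to a builtin method (the helper name is hypothetical):

    import pydoc

    def pybind_signature(member):
        # Keep only doc lines that look like "name(args) -> result".
        return " ".join(
            line.strip() for line in pydoc.render_doc(member).split('\n')
            if "->" in line)

    print(pybind_signature(str.count))  # S.count(sub[, start[, end]]) -> int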
......
@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
            event['args'] = args
        self._events.append(event)
    def emit_counter(self, category, name, pid, timestamp, counter, value):
        """Emits a record for a single counter.

        Args:
            category: The event category as string
            name: The event name as string
            pid: Identifier of the process generating this event as integer
            timestamp: The timestamp of this event as long integer
            counter: Name of the counter as string
            value: Value of the counter as integer
        """
        event = self._create_event('C', category, name, pid, 0, timestamp)
        event['args'] = {counter: value}
        self._events.append(event)
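emit_counter reuses _create_event with phase 'C', the Chrome Trace Event format's counter phase; chrome://tracing renders every (name, pid) pair as a stacked counter track, and each call contributes one sample to it. The emitted record is shaped roughly like this (field names follow the trace-event format; the values are made up):

    sample = {
        "ph": "C",                  # counter phase
        "cat": "Memory",            # category
        "name": "Memory",           # counter track name
        "pid": 7,                   # process/track id allocated by Timeline
        "tid": 0,                   # emit_counter hard-codes thread id 0
        "ts": 1554080123456,        # timestamp passed through from the caller
        "args": {"place": 2048},    # counter name -> sampled value
    }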
    def format_to_string(self, pretty=False):
        """Formats the chrome trace to a string.
@@ -117,6 +133,7 @@ class Timeline(object):
        self._profile_dict = profile_dict
        self._pid = 0
        self._devices = dict()
        self._mem_devices = dict()
        self._chrome_trace = _ChromeTraceFormatter()

    def _allocate_pid(self):
@@ -131,7 +148,7 @@ class Timeline(object):
            if (k, event.device_id, "CPU") not in self._devices:
                pid = self._allocate_pid()
                self._devices[(k, event.device_id, "CPU")] = pid
-                # -1 device id represents CUDA api call
+                # -1 device id represents a CUDA runtime API call (e.g. cudaLaunch, cudaMemcpy)
                if event.device_id == -1:
                    self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
                else:
@@ -143,6 +160,45 @@ class Timeline(object):
                    self._devices[(k, event.device_id, "GPUKernel")] = pid
                    self._chrome_trace.emit_pid("%s:gpu:%d" %
                                                (k, event.device_id), pid)
            for mevent in profile_pb.mem_events:
                if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
                    if (k, mevent.device_id, "GPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "GPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:gpu:%d" % (k, mevent.device_id),
                            pid)
                elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
                    if (k, mevent.device_id, "CPU") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id, "CPU")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:cpu:%d" % (k, mevent.device_id),
                            pid)
                elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
                    if (k, mevent.device_id,
                            "CUDAPinnedPlace") not in self._mem_devices:
                        pid = self._allocate_pid()
                        self._mem_devices[(k, mevent.device_id,
                                           "CUDAPinnedPlace")] = pid
                        self._chrome_trace.emit_pid(
                            "memory usage on %s:cudapinnedplace:%d" %
                            (k, mevent.device_id), pid)
            if (k, 0, "CPU") not in self._mem_devices:
                pid = self._allocate_pid()
                self._mem_devices[(k, 0, "CPU")] = pid
                self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
                                            (k, 0), pid)
            if (k, 0, "GPU") not in self._mem_devices:
                pid = self._allocate_pid()
                self._mem_devices[(k, 0, "GPU")] = pid
                self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
                                            (k, 0), pid)
            if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
                pid = self._allocate_pid()
                self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
                self._chrome_trace.emit_pid(
                    "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
    def _allocate_events(self):
        for k, profile_pb in six.iteritems(self._profile_dict):
@@ -163,9 +219,57 @@ class Timeline(object):
                    event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
                    event.sub_device_id, 'Op', event.name, args)
    def _allocate_memory_event(self):
        place_to_str = {
            profiler_pb2.MemEvent.CPUPlace: "CPU",
            profiler_pb2.MemEvent.CUDAPlace: "GPU",
            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
        }
        for k, profile_pb in six.iteritems(self._profile_dict):
            mem_list = []
            end_profiler = 0
            for mevent in profile_pb.mem_events:
                crt_info = dict()
                crt_info['time'] = mevent.start_ns
                crt_info['size'] = mevent.bytes
                if mevent.place in place_to_str:
                    place = place_to_str[mevent.place]
                else:
                    # No pid was allocated for unknown places, so skip the
                    # event instead of failing the lookup below.
                    continue
                crt_info['place'] = place
                pid = self._mem_devices[(k, mevent.device_id, place)]
                crt_info['pid'] = pid
                crt_info['thread_id'] = mevent.thread_id
                crt_info['device_id'] = mevent.device_id
                mem_list.append(crt_info)
                # Emit a paired negative record so the counter drops back
                # when the allocation is released.
                crt_info = dict()
                crt_info['place'] = place
                crt_info['pid'] = pid
                crt_info['thread_id'] = mevent.thread_id
                crt_info['device_id'] = mevent.device_id
                crt_info['time'] = mevent.end_ns
                crt_info['size'] = -mevent.bytes
                mem_list.append(crt_info)
                end_profiler = max(end_profiler, crt_info['time'])
            mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
            i = 0
            total_size = 0
            while i < len(mem_list):
                total_size += mem_list[i]['size']
                # Merge records that share a timestamp into one sample.
                while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
                        i + 1]['time']:
                    total_size += mem_list[i + 1]['size']
                    i += 1
                self._chrome_trace.emit_counter(
                    "Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
                    0, total_size)
                i += 1
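The counter logic is a classic sweep: every allocation contributes +bytes at its start_ns and -bytes at its end_ns, so after sorting by time a running prefix sum gives memory-in-use at each instant, and records sharing a timestamp are coalesced into a single counter sample. A standalone sketch of the same technique with made-up numbers:

    # (start, end, size) allocations - fabricated example data
    allocs = [(0, 10, 4), (2, 8, 2), (3, 5, 1)]

    deltas = []
    for start, end, size in allocs:
        deltas.append((start, size))
        deltas.append((end, -size))
    deltas.sort()

    total, samples, i = 0, [], 0
    while i < len(deltas):
        t = deltas[i][0]
        while i < len(deltas) and deltas[i][0] == t:  # coalesce equal times
            total += deltas[i][1]
            i += 1
        samples.append((t, total))

    print(samples)  # [(0, 4), (2, 6), (3, 7), (5, 6), (8, 4), (10, 0)]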
    def generate_chrome_trace(self):
        self._allocate_pids()
        self._allocate_events()
        self._allocate_memory_event()
        return self._chrome_trace.format_to_string()
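With the extra _allocate_memory_event pass wired into generate_chrome_trace, a trace built from profiler output now carries both the op timelines and the new per-place memory counters. A hedged usage sketch (the file name is made up; tools/timeline.py also ships a CLI around this class whose exact flags may differ by version):

    profile_dict = {}  # fill with name -> parsed profiler_pb2.Profile, as elsewhere in this file
    tl = Timeline(profile_dict)
    with open('/tmp/paddle.timeline', 'w') as f:
        f.write(tl.generate_chrome_trace())
    # Open chrome://tracing in Chrome and "Load" the written file to inspect the tracks.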
......