提交 8ed02339 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into accelerate_ddpg

test=develop
...@@ -60,7 +60,7 @@ class Float16Transpiler: ...@@ -60,7 +60,7 @@ class Float16Transpiler:
raise TypeError("place should be as CPUPlace/CUDAPlace type") raise TypeError("place should be as CPUPlace/CUDAPlace type")
if scope is None: if scope is None:
scope = global_scope() scope = global_scope()
if not isinstance(scope, core.Scope): if not isinstance(scope, core._Scope):
raise TypeError("scope should be as Scope type or None") raise TypeError("scope should be as Scope type or None")
self.scope = scope self.scope = scope
......
...@@ -351,6 +351,23 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b ...@@ -351,6 +351,23 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.CompressPass.apply ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.ImitationGraph.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.ImitationGraph.all_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.__init__ ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None))
paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.MagnitudePruner.__init__ ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.MagnitudePruner.prune ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.RatioPruner.__init__ ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.RatioPruner.prune ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
...@@ -447,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke ...@@ -447,11 +464,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke
paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
......
...@@ -48,10 +48,10 @@ if(WITH_GPU) ...@@ -48,10 +48,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util) add_dependencies(tensor tensor_util)
else() else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
endif(WIN32) endif(WIN32)
else() else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
endif() endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
...@@ -84,6 +84,7 @@ cc_library(threadpool SRCS threadpool.cc DEPS enforce) ...@@ -84,6 +84,7 @@ cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash) cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
......
...@@ -165,7 +165,7 @@ template <typename T> ...@@ -165,7 +165,7 @@ template <typename T>
class GreaterThanChecker { class GreaterThanChecker {
public: public:
explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const { void operator()(const T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
} }
...@@ -177,7 +177,7 @@ template <typename T> ...@@ -177,7 +177,7 @@ template <typename T>
class EqualGreaterThanChecker { class EqualGreaterThanChecker {
public: public:
explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const { void operator()(const T& value) const {
PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
} }
...@@ -193,7 +193,7 @@ class DefaultValueSetter { ...@@ -193,7 +193,7 @@ class DefaultValueSetter {
public: public:
explicit DefaultValueSetter(T default_value) explicit DefaultValueSetter(T default_value)
: default_value_(default_value) {} : default_value_(default_value) {}
void operator()(T& value) const { value = default_value_; } // NOLINT void operator()(T* value) const { *value = default_value_; }
private: private:
T default_value_; T default_value_;
...@@ -203,7 +203,7 @@ template <typename T> ...@@ -203,7 +203,7 @@ template <typename T>
class EnumInContainer { class EnumInContainer {
public: public:
explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {} explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
void operator()(T& val) const { void operator()(const T& val) const {
PADDLE_ENFORCE(container_.find(val) != container_.end(), PADDLE_ENFORCE(container_.find(val) != container_.end(),
"Value %s is not in enum container %s", val, "Value %s is not in enum container %s", val,
ContainerDebugString()); ContainerDebugString());
...@@ -232,7 +232,8 @@ class EnumInContainer { ...@@ -232,7 +232,8 @@ class EnumInContainer {
// an attribute can have more than one limits // an attribute can have more than one limits
template <typename T> template <typename T>
class TypedAttrChecker { class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker; typedef std::function<void(T*)> DefaultValueChecker;
typedef std::function<void(const T&)> ValueChecker;
public: public:
explicit TypedAttrChecker(const std::string& attr_name) explicit TypedAttrChecker(const std::string& attr_name)
...@@ -268,17 +269,17 @@ class TypedAttrChecker { ...@@ -268,17 +269,17 @@ class TypedAttrChecker {
return *this; return *this;
} }
void operator()(AttributeMap& attr_map) const { // NOLINT void operator()(AttributeMap* attr_map) const {
if (!attr_map.count(attr_name_)) { if (!attr_map->count(attr_name_)) {
// user do not set this attr // user do not set this attr
PADDLE_ENFORCE(!default_value_setter_.empty(), PADDLE_ENFORCE(!default_value_setter_.empty(),
"Attribute '%s' is required!", attr_name_); "Attribute '%s' is required!", attr_name_);
// default_value_setter_ has no more than one element // default_value_setter_ has no more than one element
T val; T val;
(default_value_setter_[0])(val); (default_value_setter_[0])(&val);
attr_map[attr_name_] = val; (*attr_map)[attr_name_] = val;
} }
Attribute& attr = attr_map.at(attr_name_); Attribute& attr = attr_map->at(attr_name_);
ExtractAttribute<T> extract_attr(attr_name_); ExtractAttribute<T> extract_attr(attr_name_);
T* attr_value = extract_attr(attr); T* attr_value = extract_attr(attr);
for (const auto& checker : value_checkers_) { for (const auto& checker : value_checkers_) {
...@@ -289,12 +290,12 @@ class TypedAttrChecker { ...@@ -289,12 +290,12 @@ class TypedAttrChecker {
private: private:
std::string attr_name_; std::string attr_name_;
std::vector<ValueChecker> value_checkers_; std::vector<ValueChecker> value_checkers_;
std::vector<ValueChecker> default_value_setter_; std::vector<DefaultValueChecker> default_value_setter_;
}; };
// check whether op's all attributes fit their own limits // check whether op's all attributes fit their own limits
class OpAttrChecker { class OpAttrChecker {
typedef std::function<void(AttributeMap&)> AttrChecker; typedef std::function<void(AttributeMap*)> AttrChecker;
public: public:
template <typename T> template <typename T>
...@@ -304,7 +305,7 @@ class OpAttrChecker { ...@@ -304,7 +305,7 @@ class OpAttrChecker {
return *(checker.target<TypedAttrChecker<T>>()); return *(checker.target<TypedAttrChecker<T>>());
} }
void Check(AttributeMap& attr_map) const { // NOLINT void Check(AttributeMap* attr_map) const {
for (const auto& checker : attr_checkers_) { for (const auto& checker : attr_checkers_) {
checker(attr_map); checker(attr_map);
} }
......
...@@ -355,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -355,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
BuildStrategy::GradientScaleStrategy::kCustomized) { BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle? // TODO(paddle-dev): Why is there no input for this op_handle?
auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]); auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType();
CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0],
out_dtype);
} }
// This assumes the backward generating code will ensure IsScaleLossOp // This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss. // is true only for the op that scale the final scalar loss.
...@@ -658,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( ...@@ -658,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name, ir::Graph *result, const std::string &loss_grad_name,
ir::Node *out_var_node) const { ir::Node *out_var_node, proto::VarType::Type dtype) const {
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle // Insert ScaleCost OpHandle
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
auto *op_handle = new ScaleLossGradOpHandle( auto *op_handle = new ScaleLossGradOpHandle(
result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx); local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype);
result->Get<GraphOps>(kGraphOps).emplace_back(op_handle); result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale // FIXME: Currently ScaleLossGradOp only use device_count as scale
......
...@@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateScaleLossGradOp(ir::Graph *result, void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name, const std::string &loss_grad_name,
ir::Node *out_var_node) const; ir::Node *out_var_node,
proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const; int dst_dev_id) const;
......
...@@ -22,39 +22,66 @@ namespace details { ...@@ -22,39 +22,66 @@ namespace details {
ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
Scope *scope, Scope *scope,
platform::Place place, platform::Place place,
platform::DeviceContext *dev_ctx) platform::DeviceContext *dev_ctx,
proto::VarType::Type dtype)
: OpHandleBase(node), : OpHandleBase(node),
coeff_(static_cast<float>(1.0 / num_dev)), coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope), scope_(scope),
place_(place) { place_(place),
out_dtype_(dtype) {
this->SetDeviceContext(place_, dev_ctx); this->SetDeviceContext(place_, dev_ctx);
} }
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() { struct ScaleLossGradFunctor {
// Doesn't wait any event float coeff_;
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_; Tensor *out_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); platform::Place place_;
OpHandleBase *op_handle_;
proto::VarType::Type out_dtype_;
platform::DeviceContext *ctx_;
float *tmp = local_scope.FindVar(var_name) ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
->GetMutable<LoDTensor>() OpHandleBase *op_handle, proto::VarType::Type dtype,
->mutable_data<float>(make_ddim({1}), place_); platform::DeviceContext *ctx)
: coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
template <typename OutT>
void apply() const {
auto *out_data = out_->mutable_data<OutT>(place_);
if (platform::is_cpu_place(place_)) { if (platform::is_cpu_place(place_)) {
*tmp = coeff_; *out_data = static_cast<OutT>(coeff_);
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] { OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>( auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
this->dev_ctxes_.at(place_)) memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
->stream(); platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, stream);
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(10) << place_ << "RUN Scale loss grad op"; VLOG(10) << place_ << "RUN Scale loss grad op";
});
#endif #endif
} }
}
};
void ScaleLossGradOpHandle::RunImpl() {
// Doesn't wait any event
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));
#ifdef PADDLE_WITH_CUDA
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
#else
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
#endif
} }
std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
......
...@@ -26,8 +26,8 @@ namespace details { ...@@ -26,8 +26,8 @@ namespace details {
struct ScaleLossGradOpHandle : public OpHandleBase { struct ScaleLossGradOpHandle : public OpHandleBase {
ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
platform::Place place, platform::Place place, platform::DeviceContext *context,
platform::DeviceContext *context); proto::VarType::Type dtype);
~ScaleLossGradOpHandle() final; ~ScaleLossGradOpHandle() final;
...@@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase { ...@@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
float coeff_; float coeff_;
Scope *scope_; Scope *scope_;
platform::Place place_; platform::Place place_;
proto::VarType::Type out_dtype_;
}; };
} // namespace details } // namespace details
......
...@@ -31,10 +31,12 @@ std::map<std::string, ...@@ -31,10 +31,12 @@ std::map<std::string,
std::shared_ptr<std::unordered_map< std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = { NgraphBridge::NG_NODE_MAP = {
{"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
{"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul", paddle::operators::ngraphs::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>}, {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}}; {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
{"top_k", paddle::operators::ngraphs::BuildTopKNode}};
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) { void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type(); auto& op_type = op->Type();
......
...@@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() { ...@@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() {
// not by users. // not by users.
return; return;
} }
checker->Check(attrs_); checker->Check(&attrs_);
} }
void OpDesc::InferShape(const BlockDesc &block) const { void OpDesc::InferShape(const BlockDesc &block) const {
......
...@@ -82,6 +82,10 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, ...@@ -82,6 +82,10 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.") AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.")
.SetDefault(""); .SetDefault("");
AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
"Callstack for Op Creatation.")
.SetDefault({});
Validate(); Validate();
} }
......
...@@ -47,6 +47,7 @@ class OpProtoAndCheckerMaker { ...@@ -47,6 +47,7 @@ class OpProtoAndCheckerMaker {
static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpRoleVarAttrName() { return "op_role_var"; }
static const char *OpNamescopeAttrName() { return "op_namescope"; } static const char *OpNamescopeAttrName() { return "op_namescope"; }
static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
......
...@@ -24,7 +24,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp( ...@@ -24,7 +24,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
const VariableNameMap& outputs, AttributeMap attrs) { const VariableNameMap& outputs, AttributeMap attrs) {
auto& info = OpInfoMap::Instance().Get(type); auto& info = OpInfoMap::Instance().Get(type);
if (info.Checker() != nullptr) { if (info.Checker() != nullptr) {
info.Checker()->Check(attrs); info.Checker()->Check(&attrs);
} }
auto op = info.Creator()(type, inputs, outputs, attrs); auto op = info.Creator()(type, inputs, outputs, attrs);
return std::unique_ptr<OperatorBase>(op); return std::unique_ptr<OperatorBase>(op);
......
...@@ -16,10 +16,15 @@ limitations under the License. */ ...@@ -16,10 +16,15 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/transfer_scope_cache.h"
...@@ -157,7 +162,10 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, ...@@ -157,7 +162,10 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames,
} }
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
try {
if (VLOG_IS_ON(4)) {
VLOG(4) << place << " " << DebugStringEx(&scope); VLOG(4) << place << " " << DebugStringEx(&scope);
}
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place); PADDLE_THROW("Cannot run operator on place %s", place);
...@@ -167,17 +175,46 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -167,17 +175,46 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#endif #endif
} }
// The profile has a process-wide mutex, results in serious performance issue // The profile has a process-wide mutex, results in serious performance
// issue
// in concurrency scenerio. Here use an `if` to fix this issue. // in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern. // Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) { if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place)); platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place); RunImpl(scope, place);
} else { } else {
RunImpl(scope, place); RunImpl(scope, place);
} }
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope); VLOG(3) << place << " " << DebugStringEx(&scope);
}
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (callstack.empty()) {
throw exception;
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
sout << "Python Callstacks: \n";
for (auto& line : callstack) {
sout << line;
}
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
} catch (...) {
std::rethrow_exception(std::current_exception());
}
} }
bool OperatorBase::HasInputs(const std::string& name) const { bool OperatorBase::HasInputs(const std::string& name) const {
...@@ -1057,8 +1094,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( ...@@ -1057,8 +1094,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
t = &(var->Get<SelectedRows>().value()); t = &(var->Get<SelectedRows>().value());
} }
if (t != nullptr) { if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized",
ipt_name, DebugString()); ipt_name);
int tmp = static_cast<int>(t->type()); int tmp = static_cast<int>(t->type());
PADDLE_ENFORCE( PADDLE_ENFORCE(
tmp == data_type || data_type == -1, tmp == data_type || data_type == -1,
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/threadpool.h"
namespace paddle {
namespace framework {
ScopePool &ScopePool::Instance() { // NOLINT
static ScopePool pool;
return pool;
}
void ScopePool::DeleteScope(Scope *scope) { delete scope; }
void ScopePool::Insert(std::unique_ptr<Scope> &&s) {
std::lock_guard<std::mutex> guard(mtx_);
scopes_.insert(s.release());
}
void ScopePool::Remove(Scope *s) {
size_t has_scope;
{
std::lock_guard<std::mutex> guard(mtx_);
has_scope = scopes_.erase(s);
}
PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope");
DeleteScope(s);
}
ScopePool::~ScopePool() { Clear(); }
void ScopePool::Clear() {
std::lock_guard<std::mutex> guard(mtx_);
for (auto *s : scopes_) {
DeleteScope(s);
}
scopes_.clear();
}
} // namespace framework
} // namespace paddle
...@@ -13,30 +13,34 @@ ...@@ -13,30 +13,34 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/temporary_allocator.h" #include <mutex> // NOLINT
#include <unordered_set>
#include "paddle/fluid/framework/scope.h"
namespace paddle { namespace paddle {
namespace platform { namespace framework {
template <typename T> class ScopePool {
paddle::framework::Tensor GetTensor( public:
memory::allocation::AllocationPtr temp_allocation_ptr, static ScopePool &Instance(); // NOLINT
const framework::DDim &dim) {
auto &deleter = temp_allocation_ptr.get_deleter(); void Insert(std::unique_ptr<Scope> &&s);
auto *allocation_ptr = temp_allocation_ptr.release();
auto shared_allocation = void Remove(Scope *s);
std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
void Clear();
PADDLE_ENFORCE(dynamic_cast<TemporaryAllocation *>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation."); ~ScopePool();
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T)); private:
ScopePool() = default;
paddle::framework::Tensor temp_tensor(std::type_index(typeid(T)));
temp_tensor.Resize(dim); static void DeleteScope(Scope *scope);
temp_tensor.ResetHolder(std::move(shared_allocation));
return temp_tensor; std::unordered_set<Scope *> scopes_;
} std::mutex mtx_;
};
} // namespace platform
} // namespace framework
} // namespace paddle } // namespace paddle
...@@ -28,8 +28,7 @@ void Tensor::check_memory_size() const { ...@@ -28,8 +28,7 @@ void Tensor::check_memory_size() const {
"or maybe the required data-type mismatches the data already stored."); "or maybe the required data-type mismatches the data already stored.");
} }
Tensor::Tensor(std::type_index type) Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
: type_(framework::ToDataType(type)), offset_(0) {}
size_t Tensor::memory_size() const { size_t Tensor::memory_size() const {
return holder_ == nullptr ? 0UL : holder_->size() - offset_; return holder_ == nullptr ? 0UL : holder_->size() - offset_;
......
...@@ -69,7 +69,7 @@ class Tensor { ...@@ -69,7 +69,7 @@ class Tensor {
public: public:
Tensor() : type_(proto::VarType::FP32), offset_(0) {} Tensor() : type_(proto::VarType::FP32), offset_(0) {}
explicit Tensor(std::type_index type); explicit Tensor(const proto::VarType::Type&);
/*! Return a pointer to mutable memory block. */ /*! Return a pointer to mutable memory block. */
template <typename T> template <typename T>
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/temporary_allocator.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) { ...@@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
src_ptr, size); src_ptr, size);
} }
template <typename T>
paddle::framework::Tensor GetTensor(
memory::allocation::AllocationPtr temp_allocation_ptr,
const framework::DDim& dim) {
auto& deleter = temp_allocation_ptr.get_deleter();
auto* allocation_ptr = temp_allocation_ptr.release();
auto shared_allocation =
std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor(
framework::ToDataType(std::type_index(typeid(T))));
temp_tensor.Resize(dim);
temp_tensor.ResetHolder(std::move(shared_allocation));
return temp_tensor;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -231,11 +231,14 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
inputs[i].data.length()); inputs[i].data.length());
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_); auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr), memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(), platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), inputs[i].data.length(), dev_ctx->stream());
0); // stream 0 for sync copy
#else #else
PADDLE_THROW("Not compile with CUDA, should not reach here."); PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif #endif
......
...@@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -208,11 +208,14 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
inputs[i].data.length()); inputs[i].data.length());
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_); auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr), memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(), platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), inputs[i].data.length(), dev_ctx->stream());
0); // stream 0 for sync copy
#else #else
PADDLE_THROW("Not compile with CUDA, should not reach here."); PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif #endif
......
...@@ -75,6 +75,11 @@ set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") ...@@ -75,6 +75,11 @@ set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)
# MM DNN
set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn")
download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc)
# text_classification # text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
...@@ -103,6 +108,10 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ...@@ -103,6 +108,10 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
# seq_pool1
inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1
"${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz")
# mobilenet with depthwise_conv op # mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
struct DataRecord {
std::vector<std::vector<int64_t>> query_data_all, title_data_all;
std::vector<size_t> lod1, lod2;
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= query_data_all.size()) {
data.query_data_all.assign(query_data_all.begin() + batch_iter,
query_data_all.begin() + batch_end);
data.title_data_all.assign(title_data_all.begin() + batch_iter,
title_data_all.begin() + batch_end);
// Prepare LoDs
data.lod1.push_back(0);
data.lod2.push_back(0);
CHECK(!data.query_data_all.empty());
CHECK(!data.title_data_all.empty());
CHECK_EQ(data.query_data_all.size(), data.title_data_all.size());
for (size_t j = 0; j < data.query_data_all.size(); j++) {
// calculate lod
data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size());
data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size());
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, '\t', &data);
// load query data
std::vector<int64_t> query_data;
split_to_int64(data[0], ' ', &query_data);
// load title data
std::vector<int64_t> title_data;
split_to_int64(data[1], ' ', &title_data);
query_data_all.push_back(std::move(query_data));
title_data_all.push_back(std::move(title_data));
}
num_samples = num_lines;
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_query_tensor, lod_title_tensor;
lod_query_tensor.name = "left";
lod_title_tensor.name = "right";
auto one_batch = data->NextBatch();
int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size
int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size
lod_query_tensor.shape.assign({size1, 1});
lod_query_tensor.lod.assign({one_batch.lod1});
lod_title_tensor.shape.assign({size2, 1});
lod_title_tensor.lod.assign({one_batch.lod2});
// assign data
TensorAssignData<int64_t>(&lod_query_tensor, one_batch.query_data_all);
TensorAssignData<int64_t>(&lod_title_tensor, one_batch.title_data_all);
// Set inputs.
input_slots->assign({lod_query_tensor, lod_title_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::INT64;
}
}
void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model;
cfg->use_gpu = false;
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots;
int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
for (int bid = 0; bid < epoch; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
}
}
// Easy for profiling independently.
TEST(Analyzer_MM_DNN, profile) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_EQ(outputs.size(), 2UL);
for (auto &output : outputs) {
size_t size = GetSize(output);
PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(output.data.data());
// output is probability, which is in (-1, 1).
for (size_t i = 0; i < size; i++) {
EXPECT_GT(result[i], -1);
EXPECT_LT(result[i], 1);
}
}
}
}
// Check the fuse status
TEST(Analyzer_MM_DNN, fuse_statis) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_MM_DNN, compare) {
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare Deterministic result
TEST(Analyzer_MM_DNN, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/params";
cfg->prog_file = FLAGS_infer_model + "/model";
cfg->use_gpu = false;
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
std::vector<std::string> feed_names = {
"slot10000_embed", "slot10001_embed", "slot10004_embed",
"slot10005_embed", "slot10008_embed", "slot10009_embed",
"slot10012_embed", "slot10013_embed", "slot10108_embed",
"slot13324_embed", "slot13325_embed", "slot13326_embed",
"slot13327_embed", "slot13328_embed", "slot13329_embed",
"slot13330_embed", "slot13331_embed", "slot15501_embed",
"slot15502_embed", "slot15503_embed", "slot15504_embed",
"slot15505_embed", "slot15506_embed", "slot15507_embed",
"slot15508_embed", "slot15516_embed", "slot15519_embed",
"slot15523_embed", "slot15531_embed", "slot15533_embed",
"slot15548_embed", "slot15564_embed", "slot15565_embed",
"slot15566_embed", "slot15570_embed", "slot15571_embed",
"slot15572_embed", "slot15573_embed", "slot15574_embed",
"slot15575_embed", "slot15576_embed", "slot15577_embed",
"slot15579_embed", "slot15581_embed", "slot15582_embed",
"slot15583_embed", "slot15584_embed", "slot5016_embed",
"slot5021_embed", "slot6002_embed", "slot6003_embed",
"slot6004_embed", "slot6005_embed", "slot6006_embed",
"slot6007_embed", "slot6008_embed", "slot6009_embed",
"slot6011_embed", "slot6014_embed", "slot6015_embed",
"slot6023_embed", "slot6024_embed", "slot6025_embed",
"slot6027_embed", "slot6029_embed", "slot6031_embed",
"slot6034_embed", "slot6035_embed", "slot6036_embed",
"slot6037_embed", "slot6039_embed", "slot6048_embed",
"slot6050_embed", "slot6058_embed", "slot6059_embed",
"slot6060_embed", "slot6066_embed", "slot6067_embed",
"slot6068_embed", "slot6069_embed", "slot6070_embed",
"slot6071_embed", "slot6072_embed", "slot6073_embed",
"slot6182_embed", "slot6183_embed", "slot6184_embed",
"slot6185_embed", "slot6186_embed", "slot6188_embed",
"slot6189_embed", "slot6190_embed", "slot6201_embed",
"slot6202_embed", "slot6203_embed", "slot6247_embed",
"slot6248_embed", "slot6250_embed", "slot6251_embed",
"slot6807_embed", "slot6808_embed", "slot6809_embed",
"slot6810_embed", "slot6811_embed", "slot6812_embed",
"slot6813_embed", "slot6814_embed", "slot6815_embed",
"slot6816_embed", "slot6817_embed", "slot6818_embed",
"slot6819_embed", "slot6820_embed", "slot6822_embed",
"slot6823_embed", "slot6826_embed", "slot7002_embed",
"slot7003_embed", "slot7004_embed", "slot7005_embed",
"slot7006_embed", "slot7008_embed", "slot7009_embed",
"slot7010_embed", "slot7011_embed", "slot7013_embed",
"slot7014_embed", "slot7015_embed", "slot7016_embed",
"slot7017_embed", "slot7019_embed", "slot7100_embed",
"slot7506_embed", "slot7507_embed", "slot7514_embed",
"slot7515_embed", "slot7516_embed"};
SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params",
&feed_names);
}
// Easy for profiling independently.
void profile(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
}
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}
TEST(Analyzer_seq_pool1, profile) { profile(); }
// Check the fuse status
TEST(Analyzer_seq_pool1, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
LOG(INFO) << "num_ops: " << num_ops;
EXPECT_EQ(num_ops, 314);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -132,7 +132,8 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor, ...@@ -132,7 +132,8 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &dirname, bool is_combined = true, const std::string &dirname, bool is_combined = true,
std::string model_filename = "model", std::string model_filename = "model",
std::string params_filename = "params") { std::string params_filename = "params",
const std::vector<std::string> *feed_names = nullptr) {
// Set fake_image_data // Set fake_image_data
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes( std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
...@@ -146,26 +147,32 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -146,26 +147,32 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
os << "}\n"; os << "}\n";
} }
LOG(INFO) << os.str(); LOG(INFO) << os.str();
if (feed_names) {
int dim1 = feed_target_shapes[0][1]; PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size());
int dim2 = feed_target_shapes[0][2]; }
int dim3 = feed_target_shapes[0][3]; std::vector<PaddleTensor> input_slots(feed_target_shapes.size());
for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
PaddleTensor input; const auto &feed_shape = feed_target_shapes[i];
std::vector<int> shape({FLAGS_batch_size, dim1, dim2, dim3}); auto &input = input_slots[i];
std::vector<int> shape({FLAGS_batch_size});
for (size_t s = 1; s < feed_shape.size(); ++s) {
shape.push_back(static_cast<int>(feed_shape[s]));
}
if (feed_names) {
input.name = (*feed_names)[i];
}
input.shape = shape; input.shape = shape;
input.dtype = PaddleDType::FLOAT32; input.dtype = PaddleDType::FLOAT32;
size_t len = std::accumulate(shape.begin(), shape.end(), 1,
// fill input data, for profile easily, do not use random data here. [](int a, int b) { return a * b; });
size_t size = FLAGS_batch_size * dim1 * dim2 * dim3; input.data.Resize(len * sizeof(float));
input.data.Resize(size * sizeof(float)); input.lod.assign({{0, static_cast<size_t>(FLAGS_batch_size)}});
float *input_data = static_cast<float *>(input.data.data()); float *input_data = static_cast<float *>(input.data.data());
for (size_t i = 0; i < size; i++) { // fill input data, for profile easily, do not use random data here.
*(input_data + i) = static_cast<float>(i) / size; for (size_t j = 0; j < len; ++j) {
*(input_data + j) = static_cast<float>(j) / len;
}
} }
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots); (*inputs).emplace_back(input_slots);
} }
......
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> { ...@@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
auto tmp_allocation_ptr = auto tmp_allocation_ptr =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T)); framework::product(col_shape) * sizeof(T));
Tensor tep_tensor = col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col.ShareDataWith(tep_tensor);
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
...@@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
auto tmp_allocation_ptr = auto tmp_allocation_ptr =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T)); framework::product(col_shape) * sizeof(T));
Tensor tep_tensor = col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col.ShareDataWith(tep_tensor);
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/dequantize_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
using platform::to_void_cast;
using Tensor = framework::Tensor;
using framework::DataLayout;
using mkldnn::stream;
using platform::GetMKLDNNFormat;
template <typename T>
class DeQuantOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("Input");
auto scale_data = ctx.Attr<float>("Scale");
auto* output = ctx.Output<Tensor>("Output");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& engine = dev_ctx.GetEngine();
const T* input_data = input->data<T>();
float* output_data = output->mutable_data<float>(ctx.GetPlace());
std::vector<float> reorder_scale = {1.0f / scale_data};
std::vector<primitive> pipeline;
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
mkldnn::memory::data_type src_dt =
paddle::framework::ToMKLDNNDataType(input->type());
mkldnn::memory::format src_fmt = input->format();
mkldnn::primitive_attr attri;
int mask = 0;
attri.set_output_scales(mask, reorder_scale);
auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
auto src_memory =
std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
std::shared_ptr<primitive::at> src_memory_p =
std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32,
memory::format::nchw);
auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<float>(output_data));
auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
new reorder::primitive_desc(src_pd, dst_pd, attri));
auto reorder_p = std::shared_ptr<reorder>(
new reorder(*reorder_pd, *src_memory_p, dst_memory));
pipeline.push_back(*reorder_p);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_format(GetMKLDNNFormat(dst_memory));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace,
ops::DeQuantOpKernel<uint8_t>, ops::DeQuantOpKernel<int8_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/dequantize_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
framework::OpKernelType DeQuantOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;
return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
ctx.GetPlace(), layout_, library_);
}
void DeQuantOpMaker::Make() {
AddInput("Input", "input data");
AddOutput("Output", "output data");
AddAttr<float>("Scale", "scale data").SetDefault({1.0f});
AddComment(R"DOC(This op will dequantize data from INT8 to FP32)DOC");
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::OpKernelType;
using framework::Tensor;
class DeQuantOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
ctx->ShareLoD("Input", /*->*/ "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class DeQuantOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
class DeQuantGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
...@@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> { ...@@ -142,12 +142,13 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
vars->mutable_data<T>(ctx.GetPlace()); vars->mutable_data<T>(ctx.GetPlace());
framework::Tensor d_temp; framework::Tensor d_temp;
framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp); framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp);
// At least use 32 threads, at most 512 threads. // At least use 32 threads, at most 512 threads.
// blockx is multiple of 32. // blockx is multiple of 32.
int blockx = std::min( int blockx = std::min(
static_cast<long>(((feature_width * num_priors + 31) >> 5) << 5), 512L); static_cast<int64_t>(((feature_width * num_priors + 31) >> 5) << 5),
512L);
int gridx = (feature_width * num_priors + blockx - 1) / blockx; int gridx = (feature_width * num_priors + blockx - 1) / blockx;
dim3 threads(blockx, 1); dim3 threads(blockx, 1);
dim3 grids(gridx, feature_height); dim3 grids(gridx, feature_height);
......
...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_div, elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_div_grad, elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
......
...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_mul, elementwise_mul, ops::ElementwiseMulKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseMulKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseMulKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseMulKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_mul_grad, elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int64_t>,
int64_t>); ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fill_zeros_like_op.h" #include "paddle/fluid/operators/fill_zeros_like_op.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
...@@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>, ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>, ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>); ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
...@@ -131,8 +131,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -131,8 +131,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int in_col = input[0].numel() / in_row; int in_col = input[0].numel() / in_row;
int out_row = in_row, out_col = 0; int out_row = in_row, out_col = 0;
std::vector<T*> inputs_data(in_num); std::vector<const T*> inputs_data;
std::vector<int> inputs_col(in_num + 1); std::vector<int> inputs_col(in_num + 1);
inputs_data.reserve(in_num);
inputs_col[0] = 0; inputs_col[0] = 0;
bool sameShape = true; bool sameShape = true;
...@@ -143,7 +144,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -143,7 +144,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
} }
out_col += t_cols; out_col += t_cols;
inputs_col[i + 1] = out_col; inputs_col[i + 1] = out_col;
inputs_data[i] = const_cast<T*>(input[i].data<T>()); inputs_data.emplace_back(input[i].data<T>());
} }
// computation // computation
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <algorithm>
#include <set> #include <set>
#include <unordered_map> #include <unordered_map>
...@@ -252,23 +253,26 @@ elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas, ...@@ -252,23 +253,26 @@ elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
template <typename T> template <typename T>
struct MergeAdd<platform::CPUDeviceContext, T> { struct MergeAdd<platform::CPUDeviceContext, T> {
framework::SelectedRows operator()(const platform::CPUDeviceContext& context, framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input) { const framework::SelectedRows& input,
const bool sorted_result = false) {
framework::SelectedRows out; framework::SelectedRows out;
(*this)(context, input, &out); (*this)(context, input, &out, sorted_result);
return out; return out;
} }
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output) { framework::SelectedRows* output,
const bool sorted_result = false) {
std::vector<const framework::SelectedRows*> inputs; std::vector<const framework::SelectedRows*> inputs;
inputs.push_back(&input); inputs.push_back(&input);
(*this)(context, inputs, output); (*this)(context, inputs, output, sorted_result);
} }
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const std::vector<const framework::SelectedRows*>& inputs, const std::vector<const framework::SelectedRows*>& inputs,
framework::SelectedRows* output) { framework::SelectedRows* output,
const bool sorted_result = false) {
if (inputs.size() == 0) { if (inputs.size() == 0) {
VLOG(3) << "no input! return"; VLOG(3) << "no input! return";
return; return;
...@@ -301,6 +305,9 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -301,6 +305,9 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
} }
std::vector<int64_t> merge_rows(merged_row_set.begin(), std::vector<int64_t> merge_rows(merged_row_set.begin(),
merged_row_set.end()); merged_row_set.end());
if (sorted_result) {
std::sort(merge_rows.begin(), merge_rows.end());
}
std::unordered_map<int64_t, size_t> rows_to_id; std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) { for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i; rows_to_id[merge_rows[i]] = i;
......
...@@ -266,7 +266,8 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows, ...@@ -266,7 +266,8 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
template <typename T> template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> { struct MergeAdd<platform::CUDADeviceContext, T> {
framework::SelectedRows operator()(const platform::CUDADeviceContext& context, framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input) { const framework::SelectedRows& input,
const bool sorted_result = false) {
framework::SelectedRows out; framework::SelectedRows out;
(*this)(context, input, &out); (*this)(context, input, &out);
return out; return out;
...@@ -274,7 +275,8 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -274,7 +275,8 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output) { framework::SelectedRows* output,
const bool sorted_result = false) {
framework::Vector<int64_t> input_rows(input.rows()); framework::Vector<int64_t> input_rows(input.rows());
if (input_rows.size() == 0) { if (input_rows.size() == 0) {
return; return;
...@@ -312,7 +314,8 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -312,7 +314,8 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const std::vector<const framework::SelectedRows*>& inputs, const std::vector<const framework::SelectedRows*>& inputs,
framework::SelectedRows* output) { framework::SelectedRows* output,
const bool sorted_result = false) {
if (inputs.size() == 0) { if (inputs.size() == 0) {
VLOG(3) << "no input! return"; VLOG(3) << "no input! return";
return; return;
......
...@@ -81,13 +81,16 @@ struct MergeAdd { ...@@ -81,13 +81,16 @@ struct MergeAdd {
// unary functor, merge by adding duplicated rows in // unary functor, merge by adding duplicated rows in
// the input SelectedRows object. // the input SelectedRows object.
framework::SelectedRows operator()(const DeviceContext& context, framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input); const framework::SelectedRows& input,
const bool sorted_result = false);
void operator()(const DeviceContext& context, void operator()(const DeviceContext& context,
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output); framework::SelectedRows* output,
const bool sorted_result = false);
void operator()(const DeviceContext& context, void operator()(const DeviceContext& context,
const std::vector<const framework::SelectedRows*>& inputs, const std::vector<const framework::SelectedRows*>& inputs,
framework::SelectedRows* output); framework::SelectedRows* output,
const bool sorted_result = false);
}; };
enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <thrust/reduce.h> #include <thrust/reduce.h>
#include "paddle/fluid/operators/metrics/accuracy_op.h" #include "paddle/fluid/operators/metrics/accuracy_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
...@@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> { ...@@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
// FIXME(typhoonzero): types of T is for inference data. // FIXME(typhoonzero): types of T is for inference data.
// label data is always int64 // label data is always int64
REGISTER_OP_CUDA_KERNEL(accuracy, REGISTER_OP_CUDA_KERNEL(
paddle::operators::AccuracyOpCUDAKernel<float>, accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>); paddle::operators::AccuracyOpCUDAKernel<double>,
paddle::operators::AccuracyOpCUDAKernel<paddle::platform::float16>);
...@@ -22,4 +22,6 @@ limitations under the License. */ ...@@ -22,4 +22,6 @@ limitations under the License. */
#pragma once #pragma once
#include "ops/binary_unnary_op.h" #include "ops/binary_unnary_op.h"
#include "ops/fill_constant_op.h"
#include "ops/mul_op.h" #include "ops/mul_op.h"
#include "ops/top_k_op.h"
...@@ -45,7 +45,6 @@ static void BuildUnaryNode( ...@@ -45,7 +45,6 @@ static void BuildUnaryNode(
auto out = std::make_shared<T>(input); auto out = std::make_shared<T>(input);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
} }
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildFillConstantNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
auto vsp = op_attrs.Get<std::vector<int64_t>>("shape");
ngraph::Shape shape;
for (auto& sp : vsp) {
shape.push_back(sp);
}
float value = op_attrs.Get<float>("value");
ngraph::element::Type ng_dtype;
auto data_type = static_cast<paddle::framework::proto::VarType::Type>(
op_attrs.Get<int>("dtype"));
if (data_type == paddle::framework::proto::VarType::FP32) {
ng_dtype = ngraph::element::f32;
} else if (data_type == paddle::framework::proto::VarType::FP64) {
ng_dtype = ngraph::element::f64;
} else if (data_type == paddle::framework::proto::VarType::INT64) {
ng_dtype = ngraph::element::i64;
} else if (data_type == paddle::framework::proto::VarType::INT32) {
ng_dtype = ngraph::element::i32;
} else if (data_type == paddle::framework::proto::VarType::BOOL) {
ng_dtype = ngraph::element::boolean;
} else {
PADDLE_THROW("unsupported data type: %s", data_type);
}
auto out = ngraph::op::Constant::create(ng_dtype, shape, {value});
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildTopKNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
int k = op_attrs.Get<int>("k");
auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto top_k = std::make_shared<ngraph::op::TopK>(
input, input->get_shape().size() - 1, ngraph::element::i64, k);
std::shared_ptr<ngraph::Node> indices =
std::make_shared<ngraph::op::GetOutputElement>(top_k, 0);
std::shared_ptr<ngraph::Node> out =
std::make_shared<ngraph::op::GetOutputElement>(top_k, 1);
auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
if (dummy_out && dummy_out->get_element_type() != out->get_element_type()) {
out = std::make_shared<ngraph::op::Convert>(out,
dummy_out->get_element_type());
}
paddle::platform::SetOutputNode(op, "Indices", indices, ngb_node_map);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
#endif
...@@ -157,8 +157,11 @@ struct AdamFunctor<T, CPUAdam> { ...@@ -157,8 +157,11 @@ struct AdamFunctor<T, CPUAdam> {
} }
}; };
template <typename T, typename Flavour>
struct SparseAdamFunctor;
template <typename T> template <typename T>
struct SparseAdamFunctor { struct SparseAdamFunctor<T, GPUAdam> {
T beta1_; T beta1_;
T beta2_; T beta2_;
T epsilon_; T epsilon_;
...@@ -236,6 +239,106 @@ struct SparseAdamFunctor { ...@@ -236,6 +239,106 @@ struct SparseAdamFunctor {
} }
}; };
template <typename T>
struct SparseAdamFunctor<T, CPUAdam> {
T beta1_;
T beta2_;
T epsilon_;
const T* beta1_pow_;
const T* beta2_pow_;
const T* moment1_;
T* moment1_out_;
const T* moment2_;
T* moment2_out_;
const T* lr_;
const T* grad_;
const T* param_;
T* param_out_;
const int64_t* rows_;
int64_t row_numel_;
int64_t row_count_;
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad,
const T* param, T* param_out, const int64_t* rows,
int64_t row_numel, int64_t row_count, bool lazy_mode)
: beta1_(beta1),
beta2_(beta2),
epsilon_(epsilon),
beta1_pow_(beta1_pow),
beta2_pow_(beta2_pow),
moment1_(mom1),
moment1_out_(mom1_out),
moment2_(mom2),
moment2_out_(mom2_out),
lr_(lr),
grad_(grad),
param_(param),
param_out_(param_out),
rows_(rows),
row_numel_(row_numel),
row_count_(row_count) {}
inline HOSTDEVICE void adam_update(size_t i, T g) const {
// The following code is the same as dense
T mom1 = moment1_[i];
T mom2 = moment2_[i];
T lr = *lr_;
T beta1_pow = *beta1_pow_;
T beta2_pow = *beta2_pow_;
T p = param_[i];
// Calculation
lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
mom1 = beta1_ * mom1 + (1 - beta1_) * g;
mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
// Write back to global memory
moment1_out_[i] = mom1;
moment2_out_[i] = mom2;
param_out_[i] = p;
}
inline void operator()(size_t numel) const {
// lr could be reuse
T lr = *lr_;
T beta1_pow = *beta1_pow_;
T beta2_pow = *beta2_pow_;
lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
size_t row_count = numel / row_numel_;
for (size_t i = 0U, j = 0U; i != row_count; ++i) {
if (i == *(rows_ + j)) {
for (size_t k = 0U; k != row_numel_; ++k) {
T g = grad_[j * row_numel_ + k];
adam_update(i * row_numel_ + k, g);
}
++j;
} else {
for (size_t k = 0U; k != row_numel_; ++k) {
T mom1 = moment1_[i * row_numel_ + k];
T mom2 = moment2_[i * row_numel_ + k];
T p = param_[i * row_numel_ + k];
mom1 = beta1_ * mom1;
mom2 = beta2_ * mom2;
p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
// Write back to global memory
moment1_out_[i * row_numel_ + k] = mom1;
moment2_out_[i * row_numel_ + k] = mom2;
param_out_[i * row_numel_ + k] = p;
}
}
}
}
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class AdamOpKernel : public framework::OpKernel<T> { class AdamOpKernel : public framework::OpKernel<T> {
public: public:
...@@ -331,7 +434,7 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -331,7 +434,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
.Var() .Var()
->GetMutable<framework::SelectedRows>(); ->GetMutable<framework::SelectedRows>();
merge_func(ctx.template device_context<DeviceContext>(), grad, merge_func(ctx.template device_context<DeviceContext>(), grad,
grad_merge_var); grad_merge_var, true);
grad_merge_ptr = grad_merge_var; grad_merge_ptr = grad_merge_var;
} }
...@@ -347,13 +450,13 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -347,13 +450,13 @@ class AdamOpKernel : public framework::OpKernel<T> {
} else { } else {
#endif #endif
rows = grad_merge.rows().data(); rows = grad_merge.rows().data();
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
} }
#endif #endif
auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor( if (platform::is_cpu_place(ctx.GetPlace())) {
SparseAdamFunctor<T, CPUAdam> functor(
beta1, beta2, epsilon, beta1_pow.template data<T>(), beta1, beta2, epsilon, beta1_pow.template data<T>(),
beta2_pow.template data<T>(), mom1.template data<T>(), beta2_pow.template data<T>(), mom1.template data<T>(),
mom1_out.template mutable_data<T>(ctx.GetPlace()), mom1_out.template mutable_data<T>(ctx.GetPlace()),
...@@ -362,8 +465,8 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -362,8 +465,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
lr.template data<T>(), grad_data, param.template data<T>(), lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel, param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size(), lazy_mode); grad_merge.rows().size(), lazy_mode);
VLOG(3) << "lazy_mode :" << lazy_mode;
if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) { if (lazy_mode) {
size_t row_count = grad_merge.rows().size(); size_t row_count = grad_merge.rows().size();
std::vector<int64_t> cpu_rows(grad_merge.rows()); std::vector<int64_t> cpu_rows(grad_merge.rows());
for (size_t row_index = 0; row_index < row_count; ++row_index) { for (size_t row_index = 0; row_index < row_count; ++row_index) {
...@@ -373,6 +476,20 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -373,6 +476,20 @@ class AdamOpKernel : public framework::OpKernel<T> {
} }
} }
} else { } else {
functor(param.numel());
}
} else if (platform::is_gpu_place(ctx.GetPlace())) {
SparseAdamFunctor<T, GPUAdam> functor(
beta1, beta2, epsilon, beta1_pow.template data<T>(),
beta2_pow.template data<T>(), mom1.template data<T>(),
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2.template data<T>(),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size(), lazy_mode);
// FIXME(minqiyang): remove BinarySearch in GPU later
platform::ForRange<DeviceContext> for_range( platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()), static_cast<const DeviceContext&>(ctx.device_context()),
param.numel()); param.numel());
......
...@@ -14,8 +14,11 @@ limitations under the License. */ ...@@ -14,8 +14,11 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>, momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>); ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::MomentumOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
...@@ -237,7 +237,8 @@ class SparseMomentumFunctor<T, UseNesterov> { ...@@ -237,7 +237,8 @@ class SparseMomentumFunctor<T, UseNesterov> {
inline HOSTDEVICE void operator()(size_t i) { inline HOSTDEVICE void operator()(size_t i) {
auto row_idx = auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_); math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
: static_cast<T>(0);
// put memory access in register // put memory access in register
const T p = p_[i]; const T p = p_[i];
const T lr = lr_[0]; const T lr = lr_[0];
...@@ -282,7 +283,8 @@ class SparseMomentumFunctor<T, NoNesterov> { ...@@ -282,7 +283,8 @@ class SparseMomentumFunctor<T, NoNesterov> {
inline HOSTDEVICE void operator()(size_t i) { inline HOSTDEVICE void operator()(size_t i) {
auto row_idx = auto row_idx =
math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_); math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
: static_cast<T>(0);
// put memory access in register // put memory access in register
const T p = p_[i]; const T p = p_[i];
const T lr = lr_[0]; const T lr = lr_[0];
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/quantize_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
using platform::to_void_cast;
using Tensor = framework::Tensor;
using framework::DataLayout;
using mkldnn::stream;
using platform::GetMKLDNNFormat;
template <typename T>
class QuantOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("Input");
auto scale_data = ctx.Attr<float>("Scale");
auto* output = ctx.Output<Tensor>("Output");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& engine = dev_ctx.GetEngine();
std::vector<primitive> pipeline;
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
const T* input_data = input->data<T>();
mkldnn::primitive_attr attri;
int mask = 0;
attri.set_output_scales(mask, {scale_data});
auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
input->format());
auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
auto src_memory =
std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
std::shared_ptr<primitive::at> src_memory_p =
std::shared_ptr<primitive::at>(new primitive::at(*src_memory));
bool is_negative = ctx.Attr<bool>("is_negative_input");
std::shared_ptr<mkldnn::memory::primitive_desc> dst_pd;
std::shared_ptr<mkldnn::memory> dst_memory;
if (is_negative) {
platform::ConvMKLDNNHandler::SetDstMemory<int8_t>(
ctx, output, dst_tz, engine, dst_pd, dst_memory);
} else {
platform::ConvMKLDNNHandler::SetDstMemory<uint8_t>(
ctx, output, dst_tz, engine, dst_pd, dst_memory);
}
auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
new reorder::primitive_desc(src_pd, *dst_pd, attri));
auto reorder_p = std::shared_ptr<reorder>(
new reorder(*reorder_pd, *src_memory_p, *dst_memory));
pipeline.push_back(*reorder_p);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// TODO(Xiaoli) Support FP32->S8 quantization.
REGISTER_OP_KERNEL(quantize, MKLDNN, ::paddle::platform::CPUPlace,
ops::QuantOpKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/quantize_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
framework::OpKernelType QuantOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;
return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
ctx.GetPlace(), layout_, library_);
}
void QuantOpMaker::Make() {
AddInput("Input", "input data");
AddOutput("Output", "output data");
AddAttr<bool>("is_negative_input",
"(bool, default false) Only used in mkldnn INT8 kernel")
.SetDefault(false);
AddAttr<float>("Scale", "scale data").SetDefault({1.0f});
AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC");
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::OpKernelType;
using framework::Tensor;
class QuantOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
ctx->ShareLoD("Input", /*->*/ "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class QuantOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
...@@ -52,7 +52,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -52,7 +52,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
"The maximum length of the sequence. If maxlen < 0, maxlen " "The maximum length of the sequence. If maxlen < 0, maxlen "
"= max(Input(X)).") "= max(Input(X)).")
.SetDefault(-1) .SetDefault(-1)
.AddCustomChecker([](int &v) { .AddCustomChecker([](const int &v) {
PADDLE_ENFORCE(v < 0 || v >= 1, PADDLE_ENFORCE(v < 0 || v >= 1,
"Attr(maxlen) must be less than 0 or larger than 1"); "Attr(maxlen) must be less than 0 or larger than 1");
}); });
......
...@@ -63,7 +63,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { ...@@ -63,7 +63,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
} }
auto *mask_data = cpu_mask->data<bool>(); auto *mask_data = cpu_mask->data<bool>();
std::vector<std::vector<CopyRange>> copy_ranges(mask_dim[0]); std::vector<std::vector<CopyRange>> copy_ranges(2);
// set out_true/out_false lod // set out_true/out_false lod
for (size_t t = 0; t < 2; t++) { for (size_t t = 0; t < 2; t++) {
......
...@@ -21,7 +21,7 @@ class TopkOp : public framework::OperatorWithKernel { ...@@ -21,7 +21,7 @@ class TopkOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of TopkOp should not be null."); "Input(X) of TopkOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
...@@ -44,12 +44,25 @@ class TopkOp : public framework::OperatorWithKernel { ...@@ -44,12 +44,25 @@ class TopkOp : public framework::OperatorWithKernel {
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices"); ctx->ShareLoD("X", "Indices");
} }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.device_context(), layout_, library_);
}
}; };
class TopkOpMaker : public framework::OpProtoAndCheckerMaker { class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "(Tensor) The input of Topk op"); AddInput("X", "(Tensor) The input of Topk op");
AddInput("K",
"(Tensor) Number of top elements to look for along "
"the last dimension (along each row for matrices).")
.AsDispensable();
AddOutput("Out", "(Tensor) The output tensor of Topk op"); AddOutput("Out", "(Tensor) The output tensor of Topk op");
AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
AddComment(R"DOC( AddComment(R"DOC(
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - (*beam)) { if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam]; topk[k] = topk[k + *beam];
} else { } else {
topk[k].set(-INFINITY, -1); topk[k].set(-static_cast<T>(INFINITY), -1);
} }
} }
if (!(*is_empty)) { if (!(*is_empty)) {
...@@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
} }
*max = topk[MaxLength - 1]; *max = topk[MaxLength - 1];
if ((*max).v == -1) *is_empty = true; if ((*max).v == -static_cast<T>(1)) *is_empty = true;
*beam = 0; *beam = 0;
} }
} }
...@@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam, ...@@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - *beam) { if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam]; topk[k] = topk[k + *beam];
} else { } else {
topk[k].set(-INFINITY, -1); topk[k].set(-static_cast<T>(INFINITY), -1);
} }
} }
if (!(*is_empty)) { if (!(*is_empty)) {
...@@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, ...@@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool firststep = true; bool firststep = true;
for (int j = 0; j < MaxLength; j++) { for (int j = 0; j < MaxLength; j++) {
topk[j].set(-INFINITY, -1); topk[j].set(-static_cast<T>(INFINITY), -1);
} }
while (top_num) { while (top_num) {
ThreadGetTopK<T, MaxLength, BlockSize>( ThreadGetTopK<T, MaxLength, BlockSize>(
...@@ -326,6 +327,17 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -326,6 +327,17 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
auto* indices = ctx.Output<Tensor>("Indices"); auto* indices = ctx.Output<Tensor>("Indices");
size_t k = static_cast<int>(ctx.Attr<int>("k")); size_t k = static_cast<int>(ctx.Attr<int>("k"));
auto* k_t = ctx.Input<Tensor>("K");
if (k_t) {
Tensor k_host;
framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host);
k = k_host.data<int>()[0];
framework::DDim output_dims = output->dims();
output_dims[output_dims.size() - 1] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace()); T* output_data = output->mutable_data<T>(ctx.GetPlace());
// FIXME(typhoonzero): data is always converted to type T? // FIXME(typhoonzero): data is always converted to type T?
...@@ -362,5 +374,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> { ...@@ -362,5 +374,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>, REGISTER_OP_CUDA_KERNEL(
paddle::operators::TopkOpCUDAKernel<double>); top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
...@@ -37,8 +37,16 @@ class TopkKernel : public framework::OpKernel<T> { ...@@ -37,8 +37,16 @@ class TopkKernel : public framework::OpKernel<T> {
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out"); auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices"); auto* indices = ctx.Output<Tensor>("Indices");
// k is determined by Attr
const size_t k = static_cast<int>(ctx.Attr<int>("k")); size_t k = static_cast<int>(ctx.Attr<int>("k"));
auto* k_t = ctx.Input<Tensor>("K");
if (k_t) {
k = k_t->data<int>()[0];
framework::DDim output_dims = output->dims();
output_dims[output_dims.size() - 1] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
T* output_data = output->mutable_data<T>(ctx.GetPlace()); T* output_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace()); int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
......
...@@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device
<< ", CUDA Capability: " << compute_capability_ << ", CUDA Capability: " << compute_capability_
<< ", Driver Version: " << driver_version_ / 1000 << ", Driver API Version: " << driver_version_ / 1000
<< "." << (driver_version_ % 100) / 10 << "." << (driver_version_ % 100) / 10
<< ", Runtime Version: " << runtime_version_ / 1000 << ", Runtime API Version: "
<< "." << (runtime_version_ % 100) / 10; << runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
size_t cudnn_dso_ver = dynload::cudnnGetVersion(); size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
......
...@@ -41,7 +41,28 @@ limitations under the License. */ ...@@ -41,7 +41,28 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
/*! \brief device temporary allocator singleton */ /*! \brief device temporary allocator singleton.
*
* Some operator needs temporary memory during computation, for example,
* conv_gemm, which needs use col to store the result of im2col. If we
* create a stack memory which is used by CUDA Kernel, before the
* Computation(...) returns, we should add ctx->Wait(), because the
* execution of CUDA is async, if there doesn't have ctx->Wait(),
* the temporary memory will be released before the CUDA Kernel uses
* it.
*
* DeviceTemporaryAllocator is a singleton, which contains a
* `TemporaryAllocator` for each <Place, Stream>. And the TemporaryAllocator
* contains a temp_allocation_queue which is used to store the temporary
* allocations. The allocation, which is allocated by TemporaryAllocator,
* is a unique_ptr, and when it is not held by any variable, it will be
* pushed into the temp_allocation_queue. There are two opportunities to free
* the allocations of temp_allocation_queue:
* - when the Stream calls cudaStreamSynchronize;
* - when the allocation size of opportunities exceeds a certain threshold
* (defined by FLAGS_limit_of_temporary_allocation).
*
* */
class DeviceTemporaryAllocator { class DeviceTemporaryAllocator {
public: public:
static DeviceTemporaryAllocator& Instance() { static DeviceTemporaryAllocator& Instance() {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -181,6 +182,21 @@ class MKLDNNHandler { ...@@ -181,6 +182,21 @@ class MKLDNNHandler {
return dims2str(operand_dims) + suffix; return dims2str(operand_dims) + suffix;
} }
template <typename M>
static void SetDstMemory(
const framework::ExecutionContext& ctx, framework::Tensor* output,
std::vector<int> dst_tz, const mkldnn::engine& engine,
std::shared_ptr<mkldnn::memory::primitive_desc>& dst_pd, // NOLINT
std::shared_ptr<mkldnn::memory>& dst_memory) { // NOLINT
M* output_data = output->mutable_data<M>(ctx.GetPlace());
auto dst_md = platform::MKLDNNMemDesc(
{dst_tz}, paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<M>::DataType),
mkldnn::memory::format::nhwc);
dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine));
dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<M>(output_data)));
}
protected: protected:
static std::string dims2str(const mkldnn::memory::dims& operand_dims) { static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
std::string dstr = ""; std::string dstr = "";
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#define NCCL_ID_VARNAME "NCCLID" #define NCCL_ID_VARNAME "NCCLID"
...@@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { ...@@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
return ncclInt; return ncclInt;
} else if (type == framework::proto::VarType::INT64) { } else if (type == framework::proto::VarType::INT64) {
return ncclInt64; return ncclInt64;
} else if (type == framework::proto::VarType::FP16) {
return ncclFloat16;
} else { } else {
PADDLE_THROW("Not supported"); PADDLE_THROW("Not supported");
} }
......
...@@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation { ...@@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation {
memory::allocation::AllocationPtr underlying_allocation_; memory::allocation::AllocationPtr underlying_allocation_;
}; };
/*! \brief the TemporaryAllocator is used to alloc the temporary allocation
* which used by CUDA's async operation.
*
* The TemporaryAllocator contains a temp_allocation_queue which
* is used to store the temporary allocations. The allocation, which is
* allocated by TemporaryAllocator, is a unique_ptr, and when it is not held
* by any variable, it will be pushed into the temp_allocation_queue.
*
* There is one opportunity to free the allocations of temp_allocation_queue:
* - when the allocation size of opportunities exceeds a certain threshold
* (defined by FLAGS_limit_of_temporary_allocation).
*
* */
class TemporaryAllocator : public memory::allocation::Allocator { class TemporaryAllocator : public memory::allocation::Allocator {
public: public:
explicit TemporaryAllocator(platform::Place place); explicit TemporaryAllocator(platform::Place place);
......
...@@ -14,8 +14,7 @@ ...@@ -14,8 +14,7 @@
#include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/platform/temporary_allocator.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
DECLARE_double(limit_of_temporary_allocation); DECLARE_double(limit_of_temporary_allocation);
namespace paddle { namespace paddle {
...@@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) { ...@@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) {
TEST(temporary_allocator, add_callback) { TEST(temporary_allocator, add_callback) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
const double limit = FLAGS_limit_of_temporary_allocation;
FLAGS_limit_of_temporary_allocation = 10; FLAGS_limit_of_temporary_allocation = 10;
platform::CUDAPlace gpu_place(0); platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place); TemporaryAllocator gpu_alloc(gpu_place);
...@@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) { ...@@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) {
}); });
{ gpu_alloc.Allocate(100); } { gpu_alloc.Allocate(100); }
PADDLE_ENFORCE(deleted); PADDLE_ENFORCE(deleted);
FLAGS_limit_of_temporary_allocation = -1; FLAGS_limit_of_temporary_allocation = limit;
#endif #endif
} }
...@@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { ...@@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
auto allocation = cpu_alloc.Allocate(memory_size); auto allocation = cpu_alloc.Allocate(memory_size);
void* address = allocation->ptr(); void* address = allocation->ptr();
int numel = memory_size / sizeof(float); int numel = memory_size / sizeof(float);
framework::Tensor tensor = framework::Tensor tensor = framework::GetTensor<float>(
GetTensor<float>(std::move(allocation), framework::make_ddim({numel})); std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>()); PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel); PADDLE_ENFORCE_EQ(tensor.numel(), numel);
} }
...@@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { ...@@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
auto allocation = gpu_alloc.Allocate(memory_size); auto allocation = gpu_alloc.Allocate(memory_size);
void* address = allocation->ptr(); void* address = allocation->ptr();
int numel = memory_size / sizeof(float); int numel = memory_size / sizeof(float);
framework::Tensor tensor = framework::Tensor tensor = framework::GetTensor<float>(
GetTensor<float>(std::move(allocation), framework::make_ddim({numel})); std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>()); PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel); PADDLE_ENFORCE_EQ(tensor.numel(), numel);
} }
...@@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { ...@@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
{ {
auto allocation = cpu_alloc.Allocate(memory_size); auto allocation = cpu_alloc.Allocate(memory_size);
address = allocation->ptr(); address = allocation->ptr();
framework::Tensor tensor = GetTensor<float>( framework::Tensor tensor = framework::GetTensor<float>(
std::move(allocation), framework::make_ddim({numel})); std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>()); PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel); PADDLE_ENFORCE_EQ(tensor.numel(), numel);
...@@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { ...@@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
{ {
auto allocation = gpu_alloc.Allocate(memory_size); auto allocation = gpu_alloc.Allocate(memory_size);
address = allocation->ptr(); address = allocation->ptr();
framework::Tensor tensor = GetTensor<float>( framework::Tensor tensor = framework::GetTensor<float>(
std::move(allocation), framework::make_ddim({numel})); std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>()); PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel); PADDLE_ENFORCE_EQ(tensor.numel(), numel);
......
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool)
if(WITH_PYTHON) if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op) list(APPEND PYBIND_DEPS py_func_op)
endif() endif()
......
...@@ -49,6 +49,9 @@ void BindConstValue(pybind11::module* m) { ...@@ -49,6 +49,9 @@ void BindConstValue(pybind11::module* m) {
op_proto_and_checker_maker.def( op_proto_and_checker_maker.def(
"kOpNameScopeAttrName", "kOpNameScopeAttrName",
framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); framework::OpProtoAndCheckerMaker::OpNamescopeAttrName);
op_proto_and_checker_maker.def(
"kOpCreationCallstackAttrName",
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
} }
} // namespace pybind } // namespace pybind
......
...@@ -32,6 +32,7 @@ limitations under the License. */ ...@@ -32,6 +32,7 @@ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope_pool.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h" #include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
...@@ -117,6 +118,9 @@ PYBIND11_MODULE(core, m) { ...@@ -117,6 +118,9 @@ PYBIND11_MODULE(core, m) {
return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
}); });
m.add_object("_cleanup",
py::capsule([]() { ScopePool::Instance().Clear(); }));
py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC") py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
.def(py::init<>()) .def(py::init<>())
.def("_run_backward", .def("_run_backward",
...@@ -454,7 +458,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -454,7 +458,7 @@ All parameter, weight, gradient are variables in Paddle.
}, },
py::return_value_policy::copy); py::return_value_policy::copy);
py::class_<Scope>(m, "Scope", R"DOC( py::class_<Scope>(m, "_Scope", R"DOC(
Scope is an association of a name to Variable. All variables belong to Scope. Scope is an association of a name to Variable. All variables belong to Scope.
Variables in a parent scope can be retrieved from local scope. Variables in a parent scope can be retrieved from local scope.
...@@ -474,17 +478,26 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -474,17 +478,26 @@ All parameter, weight, gradient are variables in Paddle.
param.set(param_array, place) param.set(param_array, place)
)DOC") )DOC")
.def("_remove_from_pool",
[](Scope &self) { ScopePool::Instance().Remove(&self); })
.def("var", .def("var",
[](Scope &self, const std::string &name) -> Variable * { [](Scope &self, const std::string &name) -> Variable * {
return self.Var(name); return self.Var(name);
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("find_var", &Scope::FindVar, py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
.def(py::init<>())
.def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("drop_kids", &Scope::DropKids); .def("drop_kids", &Scope::DropKids);
m.def("Scope",
[]() -> Scope * {
auto *s = new Scope();
ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));
return s;
},
py::return_value_policy::reference);
//! @note: Be careful! PyBind will return std::string as an unicode, not //! @note: Be careful! PyBind will return std::string as an unicode, not
//! Python str. If you want a str object, you should cast them in Python. //! Python str. If you want a str object, you should cast them in Python.
m.def("get_all_op_protos", []() -> std::vector<py::bytes> { m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle as pd
print(pd.__version__)
...@@ -79,6 +79,7 @@ function cmake_gen() { ...@@ -79,6 +79,7 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib" -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib"
pip install --user -r ${PADDLE_ROOT}/python/requirements.txt
else else
exit 1 exit 1
fi fi
...@@ -91,6 +92,7 @@ function cmake_gen() { ...@@ -91,6 +92,7 @@ function cmake_gen() {
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else else
exit 1 exit 1
fi fi
...@@ -103,6 +105,7 @@ function cmake_gen() { ...@@ -103,6 +105,7 @@ function cmake_gen() {
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else else
exit 1 exit 1
fi fi
...@@ -115,6 +118,7 @@ function cmake_gen() { ...@@ -115,6 +118,7 @@ function cmake_gen() {
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else else
exit 1 exit 1
fi fi
...@@ -441,7 +445,9 @@ EOF ...@@ -441,7 +445,9 @@ EOF
# make install should also be test when unittest # make install should also be test when unittest
make install -j 8 make install -j 8
if [ "$1" == "cp27-cp27m" ]; then if [ "$1" == "cp27-cp27m" ]; then
set -e
pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
python ${PADDLE_ROOT}/paddle/scripts/installation_validate.py
elif [ "$1" == "cp35-cp35m" ]; then elif [ "$1" == "cp35-cp35m" ]; then
pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp36-cp36m" ]; then elif [ "$1" == "cp36-cp36m" ]; then
......
...@@ -46,7 +46,7 @@ from . import transpiler ...@@ -46,7 +46,7 @@ from . import transpiler
from . import distribute_lookup_table from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope
from .transpiler import DistributeTranspiler, \ from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig memory_optimize, release_memory, DistributeTranspilerConfig
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
......
...@@ -22,6 +22,8 @@ from . import op_frequence ...@@ -22,6 +22,8 @@ from . import op_frequence
from .op_frequence import * from .op_frequence import *
from . import quantize from . import quantize
from .quantize import * from .quantize import *
from . import slim
from .slim import *
from . import utils from . import utils
from .utils import * from .utils import *
...@@ -30,4 +32,5 @@ __all__ += decoder.__all__ ...@@ -30,4 +32,5 @@ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__ __all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__ __all__ += op_frequence.__all__
__all__ += quantize.__all__ __all__ += quantize.__all__
__all__ += slim.__all__
__all__ += utils.__all__ __all__ += utils.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .core import *
from .graph import *
from .prune import *
__all__ = [
'build_compressor',
'CompressPass',
'ImitationGraph',
'SensitivePruneStrategy',
'MagnitudePruner',
'RatioPruner',
]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import config
from .config import *
from . import compress_pass
from .compress_pass import *
from . import strategy
from .strategy import *
from . import pass_builder
from .pass_builder import *
__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ....core import CPUPlace
from ..graph import get_executor
__all__ = ['Context', 'CompressPass']
class Context(object):
"""
The context in the process of compression.
Args:
exe: The executor used to execute graph.
graph: The graph to be compressed.
scope: The scope used to execute graph.
program_exe: The program_exe is used to execute the program
created for modifying the variables in scope.
"""
def __init__(self, exe, graph, scope, program_exe=None):
# The total number of epoches to be trained.
self.epoch = 0
# Current epoch
self.epoch_id = 0
# Current batch
self.batch_id = 0
self.exe = exe
self.graph = graph
self.scope = scope
self.program_exe = program_exe
class CompressPass(object):
"""
The pass used to compress model.
Args:
place: The device used in compression.
data_reader: The data_reader used to run graph.
data_feeder: The data_feeder used to run graph.
scope: The scope used to run graph.
metrics: The metrics for evaluating model.
epoch: The total epoches of trainning in compression.
program_exe: The program_exe is used to execute the program
created for modifying the variables in scope.
"""
def __init__(self,
place=None,
data_reader=None,
data_feeder=None,
scope=None,
metrics=None,
epoch=None,
program_exe=None):
self.strategies = []
self.place = CPUPlace() if place is None else place
self.data_reader = data_reader
self.data_feeder = data_feeder
self.scope = scope
self.metrics = metrics
self.epoch = epoch
self.program_exe = program_exe
def add_strategy(self, strategy):
"""
Add a strategy to current compress pass.
Args:
strategy: The strategy to be added into current compress pass.
"""
self.strategies.append(strategy)
self.epoch = max(strategy.end_epoch, self.epoch)
def apply(self, graph):
"""
Compress a model.
Args:
graph: The target graph to be compressed.
"""
self.executor = get_executor(graph, self.place)
context = Context(
self.executor, graph, self.scope, program_exe=self.program_exe)
for strategy in self.strategies:
strategy.on_compress_begin(context)
for epoch in range(self.epoch):
for strategy in self.strategies:
strategy.on_epoch_begin(context)
for data in self.data_reader():
for strategy in self.strategies:
strategy.on_batch_begin(context)
fetches = None
if self.metrics:
fetches = self.metrics.values()
feed = None
if self.data_feeder:
feed = self.data_feeder.feed(data)
results = self.executor.run(graph,
fetches=fetches,
scope=self.scope,
feed=feed)
if results:
print("results: {}".format(
zip(self.metrics.keys(), results)))
for strategy in self.strategies:
strategy.on_batch_end(context)
context.batch_id += 1
for strategy in self.strategies:
strategy.on_epoch_end(context)
context.epoch_id += 1
for strategy in self.strategies:
strategy.on_compress_end(context)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import funcsigs
import yaml
from collections import OrderedDict
from ..prune import *
from .compress_pass import *
from .strategy import *
__all__ = ['ConfigFactory']
"""This factory is used to create instances by loading and parsing configure file with yaml format.
"""
class ConfigFactory(object):
def __init__(self, config):
"""Init a factory from configure file."""
self.instances = {}
self.version = None
self._parse_config(config)
def get_compress_pass(self):
"""
Get compress pass from factory.
"""
return self.instance('compress_pass')
def instance(self, name):
"""
Get instance from factory.
"""
if name in self.instances:
return self.instances[name]
else:
return None
def _new_instance(self, name, attrs):
if name not in self.instances:
class_ = globals()[attrs['class']]
sig = funcsigs.signature(class_.__init__)
keys = [
param.name for param in sig.parameters.values()
if (param.kind == param.POSITIONAL_OR_KEYWORD)
][1:]
keys = set(attrs.keys()).intersection(set(keys))
args = {}
for key in keys:
value = attrs[key]
if isinstance(value, str) and value in self.instances:
value = self.instances[value]
args[key] = value
self.instances[name] = class_(**args)
return self.instances.get(name)
def _parse_config(self, config):
assert config
with open(config, 'r') as config_file:
key_values = self._ordered_load(config_file)
for key in key_values:
# parse version
if key == 'version' and self.version is None:
self.version = int(key_values['version'])
assert self.version == int(key_values['version'])
# parse pruners
if key == 'pruners' or key == 'strategies':
instances = key_values[key]
for name in instances:
self._new_instance(name, instances[name])
if key == 'compress_pass':
compress_pass = self._new_instance(key, key_values[key])
for name in key_values[key]['strategies']:
strategy = self.instance(name)
compress_pass.add_strategy(strategy)
if key == 'include':
for config_file in key_values[key]:
self._parse_config(config_file.strip())
def _ordered_load(self,
stream,
Loader=yaml.Loader,
object_pairs_hook=OrderedDict):
"""
See: https://stackoverflow.com/questions/5121931/in-python-how-can-you-load-yaml-mappings-as-ordereddicts
"""
class OrderedLoader(Loader):
pass
def construct_mapping(loader, node):
loader.flatten_mapping(node)
return object_pairs_hook(loader.construct_pairs(node))
OrderedLoader.add_constructor(
yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping)
return yaml.load(stream, OrderedLoader)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .compress_pass import CompressPass
from .config import ConfigFactory
__all__ = ['build_compressor']
def build_compressor(place=None,
data_reader=None,
data_feeder=None,
scope=None,
metrics=None,
epoch=None,
config=None):
if config is not None:
factory = ConfigFactory(config)
comp_pass = factory.get_compress_pass()
else:
comp_pass = CompressPass()
comp_pass.place = place
comp_pass.data_reader = data_reader
comp_pass.data_feeder = data_feeder
comp_pass.scope = scope
comp_pass.metrics = metrics
comp_pass.epoch = epoch
return comp_pass
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['Strategy']
class Strategy(object):
"""
Base class for all strategies.
"""
def __init__(self, start_epoch=0, end_epoch=10):
"""
Args:
start_epoch: The first epoch to apply the strategy.
end_epoch: The last epoch to apply the strategy.
"""
self.start_epoch = start_epoch
self.end_epoch = end_epoch
def on_compress_begin(self, context):
pass
def on_epoch_begin(self, context):
pass
def on_epoch_end(self, context):
pass
def on_batch_begin(self, context):
pass
def on_batch_end(self, context):
pass
def on_compress_end(self, context):
pass
version: 1.0
pruners:
pruner_1:
class: 'RatioPruner'
ratios:
'conv1_1.w': 0.3
'conv1_2.w': 0.4
'*': 0.9
group_dims:
'*': [1, 2, 3]
criterions:
'*': 'l1-norm'
strategies:
strategy_1:
class: 'SensitivePruneStrategy'
pruner: 'pruner_1'
start_epoch: 0
end_epoch: 10
delta_rate: 0.20
acc_loss_threshold: 0.2
sensitivities:
'conv1_1.w': 0.4
compress_pass:
class: 'CompressPass'
epoch: 100
strategies:
- strategy_1
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle
import os
import sys
from paddle.fluid.contrib.slim import CompressPass
from paddle.fluid.contrib.slim import build_compressor
from paddle.fluid.contrib.slim import ImitationGraph
class LinearModel(object):
def __init__(slef):
pass
def train(self):
train_program = fluid.Program()
startup_program = fluid.Program()
startup_program.random_seed = 10
with fluid.program_guard(train_program, startup_program):
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=predict, label=y)
avg_cost = fluid.layers.mean(cost)
eval_program = train_program.clone()
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=1)
eval_reader = paddle.batch(
paddle.dataset.uci_housing.test(), batch_size=1)
place = fluid.CPUPlace()
train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place)
exe.run(startup_program)
train_metrics = {"loss": avg_cost.name}
eval_metrics = {"loss": avg_cost.name}
graph = ImitationGraph(train_program)
config = './config.yaml'
comp_pass = build_compressor(
place,
data_reader=train_reader,
data_feeder=train_feeder,
scope=fluid.global_scope(),
metrics=train_metrics,
epoch=1,
config=config)
comp_pass.apply(graph)
if __name__ == "__main__":
model = LinearModel()
model.train()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import executor
from .executor import *
from . import graph
from .graph import *
from . import graph_pass
from .graph_pass import *
__all__ = executor.__all__
__all__ += graph.__all__
__all__ += graph_pass.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
from abc import abstractmethod
from .... import executor
from .graph import IRGraph, ImitationGraph
__all__ = ['get_executor']
class GraphExecutor(object):
__metaclass__ = abc.ABCMeta
def __init__(self, place):
self.place = place
@abstractmethod
def run(self, graph, feches=None, feed=None):
pass
class IRGraphExecutor(GraphExecutor):
def run(self, grah, fetches, feed=None):
pass
class ImitationGraphExecutor(GraphExecutor):
def __init__(self, place):
super(ImitationGraphExecutor, self).__init__(place)
self.exe = executor.Executor(place)
def run(self, graph, scope=None, fetches=None, feed=None):
assert isinstance(graph, ImitationGraph)
fetch_list = None
if fetches:
fetch_list = [
graph.program.global_block().var(name) for name in fetches
]
results = self.exe.run(graph.program,
scope=scope,
fetch_list=fetch_list,
feed=feed)
return results
def get_executor(graph, place):
if isinstance(graph, ImitationGraph):
return ImitationGraphExecutor(place)
if isinstance(graph, IRGraph):
return IRGraphExecutor(place)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ....framework import Program
__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
class Graph(object):
"""
Base class for all graph.
"""
def __init__(self):
pass
def all_parameters(self):
"""
Return all the parameters in current graph.
"""
pass
class ImitationGraph(Graph):
def __init__(self, program=None):
super(ImitationGraph, self).__init__()
self.program = Program() if program is None else program
def all_parameters(self):
return self.program.global_block().all_parameters()
class IRGraph(Graph):
pass
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['GraphPass', 'PruneParameterPass']
class GraphPass(object):
"""
Base class for all graph pass.
"""
def __init__(self):
pass
def apply(self, graph):
pass
class PruneParameterPass(GraphPass):
"""
Generate a graph for pruning parameters from target graph.
"""
def __init__(self, pruned_params, thresholds):
super(PruneParameterPass, self).__init__()
self.pruned_params = pruned_params
self.thresholds = thresholds
self.default_threshold = thresholds['*']
def apply(self, graph):
pass
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import pruner
from .pruner import *
from . import prune_strategy
from .prune_strategy import *
__all__ = pruner.__all__
__all__ += prune_strategy.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..core.strategy import Strategy
from ....framework import Program, program_guard
from .... import layers
import numpy as np
__all__ = ['SensitivePruneStrategy', 'PruneStrategy']
class SensitivePruneStrategy(Strategy):
def __init__(self,
pruner=None,
start_epoch=0,
end_epoch=10,
delta_rate=0.20,
acc_loss_threshold=0.2,
sensitivities=None):
super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch)
self.pruner = pruner
self.delta_rate = delta_rate
self.acc_loss_threshold = acc_loss_threshold
self.sensitivities = sensitivities
class PruneStrategy(Strategy):
"""
The strategy that pruning weights by threshold or ratio iteratively.
"""
def __init__(self,
pruner,
mini_batch_pruning_frequency=1,
start_epoch=0,
end_epoch=10):
super(PruneStrategy, self).__init__(start_epoch, end_epoch)
self.pruner = pruner
self.mini_batch_pruning_frequency = mini_batch_pruning_frequency
def _triger(self, context):
return (context.batch_id % self.mini_batch_pruning_frequency == 0 and
self.start_epoch <= context.epoch_id < self.end_epoch)
def on_batch_end(self, context):
if self._triger(context):
prune_program = Program()
with program_guard(prune_program):
for param in context.graph.all_parameters():
prune_program.global_block().clone_variable(param)
p = prune_program.global_block().var(param.name)
zeros_mask = self.pruner.prune(p)
pruned_param = p * zeros_mask
layers.assign(input=pruned_param, output=param)
context.program_exe.run(prune_program, scope=context.scope)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from .... import layers
__all__ = ['Pruner', 'MagnitudePruner', 'RatioPruner']
class Pruner(object):
"""
Base class of all pruners.
"""
def __init__(self):
pass
def prune(self, param):
pass
class MagnitudePruner(Pruner):
"""
Pruner used to pruning a parameter by threshold.
"""
def __init__(self, threshold):
self.threshold = threshold
def prune(self, param, threshold=None):
if threshold is None:
thres = layers.fill_constant(
shape=[1], dtype='float32', value=self.threshold)
else:
thres = threshold
zeros_mask = layers.less_than(x=param, y=thres)
return zeros_mask
class RatioPruner(Pruner):
"""
Pruner used to pruning a parameter by ratio.
"""
def __init__(self, ratios=None):
"""
Args:
ratios: dict with pair (paramer_name, pruned_ratio).
"""
self.ratios = ratios
def prune(self, param, ratio=None):
"""
Args:
ratio: `ratio=40%` means pruning (1 - 40%) weights to zero.
"""
if ratio is None:
rat = self.ratios[
param.name] if param.name in self.ratios else self.ratios['*']
else:
rat = ratio
if rat < 1.0:
k = max(int(rat * np.prod(param.shape)), 1)
param_vec = layers.reshape(x=param, shape=[1, -1])
param_topk, _ = layers.topk(param_vec, k=k)
threshold = layers.slice(
param_topk, axes=[1], starts=[-1], ends=[k])
threshold = layers.reshape(x=threshold, shape=[1])
zeros_mask = layers.less_than(x=param, y=threshold)
else:
zeros_mask = layers.ones(param.shape)
return zeros_mask
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
version: 1.0
include: ["./unitest/configs/pruners.yaml", "./unitest/configs/pruners_0.yaml"]
pruners:
pruner_1:
class: 'RatioPruner'
ratios:
'conv1_1.w': 0.3
'conv1_2.w': 0.4
'*': 0.9
group_dims:
'*': [1, 2, 3]
criterions:
'*': 'l1-norm'
strategies:
strategy_1:
class: 'SensitivePruneStrategy'
pruner: 'pruner_2'
start_epoch: 0
end_epoch: 10
delta_rate: 0.20
acc_loss_threshold: 0.2
sensitivities:
'conv1_1.w': 0.4
compress_pass:
class: 'CompressPass'
epoch: 100
strategies:
- strategy_1
version: 1.0
pruners:
pruner_2:
class: 'RatioPruner'
ratios:
'conv1_1.w': 0.5
'conv1_2.w': 0.2
'*': 0.7
group_dims:
'*': [1, 2, 3]
criterions:
'*': 'l1-norm'
version: 1.0
pruners:
pruner_3:
class: 'RatioPruner'
ratios:
'conv1_1.w': 0.5
'conv1_2.w': 0.2
'*': 0.7
group_dims:
'*': [1, 2, 3]
criterions:
'*': 'l1-norm'
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.contrib.slim import ConfigFactory
import unittest
class TestFactory(unittest.TestCase):
def test_parse(self):
factory = ConfigFactory('./unitest/configs/config.yaml')
pruner = factory.instance('pruner_1')
self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
pruner = factory.instance('pruner_2')
self.assertEquals(pruner.ratios['*'], 0.7)
strategy = factory.instance('strategy_1')
pruner = strategy.pruner
self.assertEquals(pruner.ratios['*'], 0.7)
compress_pass = factory.get_compress_pass()
self.assertEquals(compress_pass.epoch, 100)
strategy = compress_pass.strategies[0]
self.assertEquals(strategy.delta_rate, 0.2)
if __name__ == '__main__':
unittest.main()
...@@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object): ...@@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object):
self.dtype = 'int64' self.dtype = 'int64'
elif dtype == core.VarDesc.VarType.FP64: elif dtype == core.VarDesc.VarType.FP64:
self.dtype = 'float64' self.dtype = 'float64'
elif dtype == core.VarDesc.VarType.FP16:
self.dtype = 'float16'
elif dtype == core.VarDesc.VarType.INT32: elif dtype == core.VarDesc.VarType.INT32:
self.dtype = 'int32' self.dtype = 'int32'
elif dtype == core.VarDesc.VarType.UINT8: elif dtype == core.VarDesc.VarType.UINT8:
......
...@@ -191,7 +191,7 @@ def _fetch_var(name, scope=None, return_numpy=True): ...@@ -191,7 +191,7 @@ def _fetch_var(name, scope=None, return_numpy=True):
assert isinstance(name, str) assert isinstance(name, str)
if scope is None: if scope is None:
scope = global_scope() scope = global_scope()
assert isinstance(scope, core.Scope) assert isinstance(scope, core._Scope)
var = scope.find_var(name) var = scope.find_var(name)
assert var is not None, ( assert var is not None, (
......
...@@ -20,6 +20,7 @@ import os ...@@ -20,6 +20,7 @@ import os
import re import re
import six import six
import sys import sys
import traceback
import numpy as np import numpy as np
...@@ -604,6 +605,10 @@ class Operator(object): ...@@ -604,6 +605,10 @@ class Operator(object):
if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
del op_attrs[role_var_name] del op_attrs[role_var_name]
callstack_var_name = op_maker.kOpCreationCallstackAttrName()
op_attrs[callstack_var_name] = list(
reversed(traceback.format_stack()))[1:]
if len(self.desc.type()) != 0: if len(self.desc.type()) != 0:
return return
if type is None: if type is None:
......
...@@ -18,6 +18,7 @@ from . import framework ...@@ -18,6 +18,7 @@ from . import framework
import numpy as np import numpy as np
import contextlib import contextlib
from .core import VarDesc from .core import VarDesc
from . import unique_name
__all__ = [ __all__ = [
'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
...@@ -207,16 +208,39 @@ class UniformInitializer(Initializer): ...@@ -207,16 +208,39 @@ class UniformInitializer(Initializer):
# Initialization Ops should be prepended and not appended # Initialization Ops should be prepended and not appended
if self._seed == 0: if self._seed == 0:
self._seed = block.program.random_seed self._seed = block.program.random_seed
# to be compatible of fp16 initalizers
if var.dtype == VarDesc.VarType.FP16:
out_dtype = VarDesc.VarType.FP32
out_var = block.create_var(
name=unique_name.generate(".".join(['gaussian_random', 'tmp'])),
shape=var.shape,
dtype=out_dtype,
type=VarDesc.VarType.LOD_TENSOR,
persistable=False)
else:
out_dtype = var.dtype
out_var = var
op = block._prepend_op( op = block._prepend_op(
type="uniform_random", type="uniform_random",
outputs={"Out": var}, outputs={"Out": out_var},
attrs={ attrs={
"shape": var.shape, "shape": var.shape,
"dtype": int(var.dtype), "dtype": out_dtype,
"min": self._low, "min": self._low,
"max": self._high, "max": self._high,
"seed": self._seed "seed": self._seed
}) })
if var.dtype == VarDesc.VarType.FP16:
block.append_op(
type="cast",
inputs={"X": out_var},
outputs={"Out": var},
attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype})
var.op = op var.op = op
return op return op
...@@ -261,17 +285,39 @@ class NormalInitializer(Initializer): ...@@ -261,17 +285,39 @@ class NormalInitializer(Initializer):
# Initialization Ops should be prepended and not appended # Initialization Ops should be prepended and not appended
if self._seed == 0: if self._seed == 0:
self._seed = block.program.random_seed self._seed = block.program.random_seed
# to be compatible of fp16 initalizers
if var.dtype == VarDesc.VarType.FP16:
out_dtype = VarDesc.VarType.FP32
out_var = block.create_var(
name=unique_name.generate(".".join(['gaussian_random', 'tmp'])),
shape=var.shape,
dtype=out_dtype,
type=VarDesc.VarType.LOD_TENSOR,
persistable=False)
else:
out_dtype = var.dtype
out_var = var
op = block._prepend_op( op = block._prepend_op(
type="gaussian_random", type="gaussian_random",
outputs={"Out": var}, outputs={"Out": out_var},
attrs={ attrs={
"shape": var.shape, "shape": var.shape,
"dtype": int(var.dtype), "dtype": out_dtype,
"mean": self._mean, "mean": self._mean,
"std": self._std_dev, "std": self._std_dev,
"seed": self._seed, "seed": self._seed,
"use_mkldnn": False "use_mkldnn": False
}) })
if var.dtype == VarDesc.VarType.FP16:
block.append_op(
type="cast",
inputs={"X": out_var},
outputs={"Out": var},
attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype})
var.op = op var.op = op
return op return op
......
...@@ -2801,6 +2801,10 @@ def batch_norm(input, ...@@ -2801,6 +2801,10 @@ def batch_norm(input,
helper = LayerHelper('batch_norm', **locals()) helper = LayerHelper('batch_norm', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
# use fp32 for bn parameter
if dtype == core.VarDesc.VarType.FP16:
dtype = core.VarDesc.VarType.FP32
input_shape = input.shape input_shape = input.shape
if data_layout == 'NCHW': if data_layout == 'NCHW':
channel_num = input_shape[1] channel_num = input_shape[1]
...@@ -2835,7 +2839,7 @@ def batch_norm(input, ...@@ -2835,7 +2839,7 @@ def batch_norm(input,
trainable=False, trainable=False,
do_model_average=do_model_average_for_mean_and_var), do_model_average=do_model_average_for_mean_and_var),
shape=param_shape, shape=param_shape,
dtype=input.dtype) dtype=dtype)
mean.stop_gradient = True mean.stop_gradient = True
variance = helper.create_parameter( variance = helper.create_parameter(
...@@ -2845,7 +2849,7 @@ def batch_norm(input, ...@@ -2845,7 +2849,7 @@ def batch_norm(input,
trainable=False, trainable=False,
do_model_average=do_model_average_for_mean_and_var), do_model_average=do_model_average_for_mean_and_var),
shape=param_shape, shape=param_shape,
dtype=input.dtype) dtype=dtype)
variance.stop_gradient = True variance.stop_gradient = True
# create output # create output
...@@ -4526,7 +4530,7 @@ def topk(input, k, name=None): ...@@ -4526,7 +4530,7 @@ def topk(input, k, name=None):
Args: Args:
input(Variable): The input variable which can be a vector or Tensor with input(Variable): The input variable which can be a vector or Tensor with
higher rank. higher rank.
k(int): The number of top elements to look for along the last dimension k(int | Variable): The number of top elements to look for along the last dimension
of input. of input.
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
...@@ -4549,12 +4553,18 @@ def topk(input, k, name=None): ...@@ -4549,12 +4553,18 @@ def topk(input, k, name=None):
helper = LayerHelper("top_k", **locals()) helper = LayerHelper("top_k", **locals())
values = helper.create_variable_for_type_inference(dtype=input.dtype) values = helper.create_variable_for_type_inference(dtype=input.dtype)
indices = helper.create_variable_for_type_inference(dtype="int64") indices = helper.create_variable_for_type_inference(dtype="int64")
inputs = {"X": [input]}
attrs = None
if isinstance(k, Variable):
inputs['K'] = k
else:
attrs = {'k': k}
helper.append_op( helper.append_op(
type="top_k", type="top_k",
inputs={"X": [input]}, inputs=inputs,
outputs={"Out": [values], outputs={"Out": [values],
"Indices": [indices]}, "Indices": [indices]},
attrs={"k": k}) attrs=attrs)
values.stop_gradient = True values.stop_gradient = True
indices.stop_gradient = True indices.stop_gradient = True
return values, indices return values, indices
...@@ -7943,7 +7953,7 @@ def unstack(x, axis=0, num=None): ...@@ -7943,7 +7953,7 @@ def unstack(x, axis=0, num=None):
num = x.shape[axis] num = x.shape[axis]
outs = [] outs = []
for _ in num: for _ in range(num):
outs.append(helper.create_variable_for_type_inference(x.dtype)) outs.append(helper.create_variable_for_type_inference(x.dtype))
helper.append_op( helper.append_op(
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_fill_constant_op import TestFillConstantOp1, TestFillConstantOp2, TestFillConstantOpWithSelectedRows
class TestNGRAPHFillConstantOp1(TestFillConstantOp1):
def setUp(self):
super(TestNGRAPHFillConstantOp1, self).setUp()
class TestNGRAPHFillConstantOp2(TestFillConstantOp2):
def setUp(self):
super(TestNGRAPHFillConstantOp2, self).setUp()
class TestNGRAPHFillConstantOpWithSelectedRows(
TestFillConstantOpWithSelectedRows):
def setUp(self):
super(TestFillConstantOpWithSelectedRows, self).setUp()
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d, TestTopkOp2, TestTopkOp3, TestTopkOp4
class TestNGRAPHTopkOp(TestTopkOp):
def setUp(self):
super(TestNGRAPHTopkOp, self).setUp()
class TestNGRAPHTopkOp2(TestTopkOp2):
def setUp(self):
super(TestNGRAPHTopkOp2, self).setUp()
class TestNGRAPHTopkOp3(TestTopkOp3):
def setUp(self):
super(TestNGRAPHTopkOp3, self).setUp()
class TestNGRAPHTopkOp4(TestTopkOp4):
def setUp(self):
super(TestNGRAPHTopkOp4, self).setUp()
if __name__ == "__main__":
unittest.main()
...@@ -370,6 +370,8 @@ class OpTest(unittest.TestCase): ...@@ -370,6 +370,8 @@ class OpTest(unittest.TestCase):
return [place] return [place]
else: else:
return [] return []
else:
return []
places = [fluid.CPUPlace()] places = [fluid.CPUPlace()]
cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False cpu_only = self._cpu_only if hasattr(self, '_cpu_only') else False
if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\ if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
......
...@@ -22,8 +22,10 @@ from op_test import OpTest ...@@ -22,8 +22,10 @@ from op_test import OpTest
class TestAccuracyOp(OpTest): class TestAccuracyOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "accuracy" self.op_type = "accuracy"
self.dtype = np.float32
self.init_dtype()
n = 8192 n = 8192
infer = np.random.random((n, 1)).astype("float32") infer = np.random.random((n, 1)).astype(self.dtype)
indices = np.random.randint(0, 2, (n, 1)) indices = np.random.randint(0, 2, (n, 1))
label = np.random.randint(0, 2, (n, 1)) label = np.random.randint(0, 2, (n, 1))
self.inputs = {'Out': infer, 'Indices': indices, "Label": label} self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
...@@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest): ...@@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest):
num_correct += 1 num_correct += 1
break break
self.outputs = { self.outputs = {
'Accuracy': np.array([num_correct / float(n)]).astype("float32"), 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
'Correct': np.array([num_correct]).astype("int32"), 'Correct': np.array([num_correct]).astype("int32"),
'Total': np.array([n]).astype("int32") 'Total': np.array([n]).astype("int32")
} }
def init_dtype(self):
pass
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
class TestAccuracyOpFp16(TestAccuracyOp):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
self.check_output(atol=1e-3)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestDeQuantizeOp(OpTest):
def setUp(self):
self.op_type = 'dequantize'
self.scale = 2.0
self.input_size = [1, 1, 5, 5] #Naive nChw16c
self.data_type = 'int8'
self.set_scale()
self.set_data_type()
if self.data_type == 'int8':
input = (np.random.randint(0, 100, self.input_size) - 50
).astype(self.data_type)
output = (input * (1 / self.scale)).astype('float')
else:
input = (np.random.randint(0, 100,
self.input_size)).astype(self.data_type)
output = (input * (1 / self.scale)).astype('float')
self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
self.outputs = {'Output': output}
self.attrs = {'Scale': self.scale, }
def test_check_output(self):
self.check_output()
def set_scale(self):
pass
def set_data_type(OpTest):
pass
class TestDeQuantizeOp1(TestDeQuantizeOp):
def set_scale(self):
self.scale = 1.5
def set_data_type(self):
self.data_type = 'int8'
class TestDeQuantizeOp2(TestDeQuantizeOp):
def set_scale(self):
self.scale = 0.8
def set_data_type(self):
self.data_type = 'uint8'
if __name__ == '__main__':
unittest.main()
...@@ -21,14 +21,16 @@ from op_test import OpTest ...@@ -21,14 +21,16 @@ from op_test import OpTest
class ElementwiseDivOp(OpTest): class ElementwiseDivOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "elementwise_div" self.op_type = "elementwise_div"
self.dtype = np.float32
self.init_dtype()
""" Warning """ Warning
CPU gradient check error! CPU gradient check error!
'X': np.random.random((32,84)).astype("float32"), 'X': np.random.random((32,84)).astype("float32"),
'Y': np.random.random((32,84)).astype("float32") 'Y': np.random.random((32,84)).astype("float32")
""" """
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), 'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype),
'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") 'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
} }
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
...@@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest): ...@@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest):
self.check_grad( self.check_grad(
['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
def init_dtype(self):
pass
class TestElementwiseDivOp_scalar(ElementwiseDivOp): class TestElementwiseDivOp_scalar(ElementwiseDivOp):
def setUp(self): def setUp(self):
...@@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): ...@@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
} }
class TestElementwiseDivOpFp16(ElementwiseDivOp):
def init_dtype(self):
self.dtype = np.float16
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=1)
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=1, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=1, no_grad_set=set('Y'))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): ...@@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
} }
class TestElementwiseMulOpFp16(ElementwiseMulOp):
def init_dtype(self):
self.dtype = np.float16
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -22,12 +22,22 @@ from op_test import OpTest ...@@ -22,12 +22,22 @@ from op_test import OpTest
class TestFillZerosLikeOp(OpTest): class TestFillZerosLikeOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "fill_zeros_like" self.op_type = "fill_zeros_like"
self.inputs = {'X': np.random.random((219, 232)).astype("float32")} self.dtype = np.float32
self.init_dtype()
self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
self.outputs = {'Out': np.zeros_like(self.inputs["X"])} self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
def init_dtype(self):
pass
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
class TestFillZerosLikeOpFp16(TestFillZerosLikeOp):
def init_dtype(self):
self.dtype = np.float16
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -24,11 +24,13 @@ from op_test import OpTest ...@@ -24,11 +24,13 @@ from op_test import OpTest
class TestMomentumOp1(OpTest): class TestMomentumOp1(OpTest):
def setUp(self): def setUp(self):
self.op_type = "momentum" self.op_type = "momentum"
self.dtype = np.float32
self.init_dtype()
param = np.random.random((123, 321)).astype("float32") param = np.random.random((123, 321)).astype(self.dtype)
grad = np.random.random((123, 321)).astype("float32") grad = np.random.random((123, 321)).astype(self.dtype)
velocity = np.zeros((123, 321)).astype("float32") velocity = np.zeros((123, 321)).astype(self.dtype)
learning_rate = np.array([0.001]).astype("float32") learning_rate = np.array([0.001]).astype(self.dtype)
mu = 0.0001 mu = 0.0001
use_nesterov = False use_nesterov = False
...@@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest): ...@@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest):
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
def init_dtype(self):
pass
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
class TestMomentumOpFp16(TestMomentumOp1):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
self.check_output(atol=1e-3)
class TestMomentumOp2(OpTest): class TestMomentumOp2(OpTest):
'''Test Momentum with default values for attributes '''Test Momentum with default values for attributes
''' '''
......
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册