Commit 7a517dc9 authored by D dzhwinter

merge develop

......@@ -815,3 +815,8 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
topk
----
.. autofunction:: paddle.fluid.layers.topk
:noindex:
../../v2/dev/contribute_to_paddle_cn.md
\ No newline at end of file
../../v2/dev/contribute_to_paddle_en.md
\ No newline at end of file
......@@ -4,6 +4,8 @@
.. toctree::
:maxdepth: 1
contribute_to_paddle_cn.md
write_docs_cn.md
api_doc_std_cn.md
new_op_cn.md
new_op_kernel.md
......
......@@ -4,6 +4,8 @@ Development
.. toctree::
:maxdepth: 1
contribute_to_paddle_en.md
write_docs_en.md
api_doc_std_en.md
new_op_en.md
new_op_kernel.md
......
../../v2/dev/write_docs_cn.rst
\ No newline at end of file
../../v2/dev/write_docs_en.rst
\ No newline at end of file
......@@ -6,7 +6,43 @@ Data Reader Interface
DataTypes
=========
.. automodule:: paddle.v2.data_type
.. autofunction:: paddle.v2.data_type.dense_array
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
:noindex:
.. autofunction:: paddle.v2.data_type.sparse_value_slot
:noindex:
.. autoclass:: paddle.v2.data_type.InputType
:members:
:noindex:
......
This diff is collapsed.
......@@ -102,7 +102,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(channel_test SRCS channel_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
......
......@@ -51,23 +51,23 @@ void FetchOpHandle::RunImpl() {
auto *var = static_cast<VarHandle *>(input);
var->generated_op_->Wait(cpu_ctx);
}
tensors_.resize(inputs_.size());
auto *var = static_cast<VarHandle *>(inputs_[0]);
auto &var_name = var->name_;
auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
auto &var_name = var_handle->name_;
platform::CPUPlace cpu;
auto &scopes = *local_scopes_;
for (size_t i = 0; i < scopes.size(); ++i) {
auto &scope = scopes[i];
auto &t = scope->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(var_name)
->Get<framework::LoDTensor>();
if (platform::is_gpu_place(var->place_)) {
auto *var =
scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
var_name);
auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
dev_ctxes_[t.place()]->Wait();
dev_ctxes_.at(t.place())->Wait();
#endif
} else {
tensors_[i].ShareDataWith(t);
......
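A short aside on the switch from dev_ctxes_[t.place()] to dev_ctxes_.at(t.place()) above; the reasoning and the generic map in this snippet are this note's assumptions, not text from the commit:
#include <cassert>
#include <map>
// operator[] on a std::map default-constructs the mapped value for a missing
// key (for a pointer type that means a silent nullptr), while at() throws
// std::out_of_range, so a missing device context fails loudly instead.
int main() {
  std::map<int, int*> ctxs;
  int* a = ctxs[1];  // inserts {1, nullptr} and returns nullptr
  assert(a == nullptr);
  // int* b = ctxs.at(2);  // would throw std::out_of_range
  return 0;
}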
......@@ -89,101 +89,25 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
bool change_forward = false;
if (!is_forwarding) {
// FIXME(yy): Do not hard code like this
if (op->OutputArgumentNames().size() == 1 &&
op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
continue; // Drop fill 1. for backward coeff;
}
}
// append send op if program is distributed trainer main program.
// always use the first device
if (!is_forwarding && op->Type() == "send") {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs on the original place; no SSA output
// is created for the send op.
CreateOpHandleIOs(&result, *op, p, 0);
continue;
}
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i];
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get();
CreateOpHandleIOs(&result, *op, p, i);
auto var_names = op->OutputArgumentNames();
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
communication_dev_ctx);
result.ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only uses device_count as the scale
// factor, so it does not depend on any other operators.
// VarHandle *loss = GetVarHandle(loss_var_name, place);
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
change_forward = true;
}
}
}
if (change_forward) {
if (op->Type() == "send") {
// append send op if program is distributed trainer main program.
// always use the first device
CreateSendOp(&result, *op);
} else if (IsScaleLossOp(*op)) {
CreateScaleLossGradOp(&result);
is_forwarding = false;
}
if (!is_forwarding) {
auto var_names = op->OutputArgumentNames();
// Currently, we assume that once a gradient is generated, it can be
// broadcast, and each gradient is broadcast only once. Other cases are not
// yet handled; for example, we may need to adjust the gradient according to
// the input when the gradient is obtained, which is not considered at present.
for (auto &og : var_names) {
if (grad_names_.count(og) != 0 &&
og_has_been_broadcast.count(og) == 0) { // is param grad
// Insert NCCL AllReduce Op
og_has_been_broadcast.insert(og);
#ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result.ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto &vars = result.vars_[i][og];
if (vars.empty()) { // This device has no data. continue.
continue;
}
auto &prev_grad = vars[vars.size() - 1];
op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
} else {
CreateComputationalOps(&result, *op);
if (!is_forwarding) {
// Currently, we assume that once a gradient is generated, it can be
// broadcast, and each gradient is broadcast only once. Other cases are
// not yet handled; for example, we may need to adjust the gradient
// according to the input when the gradient is obtained, which is not
// considered at present.
for (auto &og : op->OutputArgumentNames()) {
if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
InsertNCCLAllReduceOp(&result, og);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
}
}
......@@ -207,7 +131,95 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
return std::unique_ptr<SSAGraph>(graph);
} // namespace details
}
void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
SSAGraph *result, const std::string &og) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto &vars = result->vars_[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size() - 1, i, og, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
#else
PADDLE_ENFORCE("Not implemented");
#endif
}
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
bool is_pg_once =
grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
if (is_pg_once) {
// Insert NCCL AllReduce Op
og_has_been_broadcast->insert(og);
}
return is_pg_once;
}
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
auto *op_handle =
new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
places_[i], communication_dev_ctx);
result->ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only uses device_count as the scale
// factor, so it does not depend on any other operators.
// VarHandle *loss = GetVarHandle(loss_var_name, place);
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
i);
}
}
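// A sketch of the scaling the FIXME above refers to, assuming (as the FIXME
// suggests) that ScaleLossGradOpHandle derives its coefficient from the device
// count, roughly coeff_ = 1.0f / local_scopes_.size(). Each of the N devices
// then starts the backward pass from a loss gradient of 1/N, so the later
// all-reduce (a sum across devices) of parameter gradients produces their mean.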
void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
const OpDesc &op) const {
for (size_t scope_idx = 0; scope_idx < places_.size(); ++scope_idx) {
auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx];
result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
CreateOpHandleIOs(result, op, p, scope_idx);
}
}
void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
const OpDesc &op) const {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result->ops_.emplace_back(new SendOpHandle(op, s, p));
// Create inputs on the original place; no SSA output
// is created for the send op.
CreateOpHandleIOs(result, op, p, 0);
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
// FIXME(yy): Do not hard code like this
return op.OutputArgumentNames().size() == 1 &&
op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_;
#endif
bool IsScaleLossOp(const OpDesc &op) const;
void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
void CreateComputationalOps(SSAGraph *result, const OpDesc &op) const;
void CreateScaleLossGradOp(SSAGraph *result) const;
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
};
} // namespace details
} // namespace framework
......
......@@ -73,8 +73,9 @@ void NCCLAllReduceOpHandle::RunImpl() {
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *s = local_scopes_[i];
auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
lod_tensors.emplace_back(lod_tensor);
}
......@@ -110,17 +111,21 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
});
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg =
*this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
auto &trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->Var()
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(ToDataType(lod_tensors[0].type()), func);
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &scope = local_scopes_[i];
auto &scope =
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope->FindVar(var_name);
auto *var = scope.FindVar(var_name);
auto *dev_ctx = dev_ctxes_[p];
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
......
......@@ -30,10 +30,11 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
void ScaleLossGradOpHandle::RunImpl() {
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
float *tmp =
scope_->FindVar(var_name)->GetMutable<LoDTensor>()->mutable_data<float>(
make_ddim({1}), place_);
float *tmp = local_scope.FindVar(var_name)
->GetMutable<LoDTensor>()
->mutable_data<float>(make_ddim({1}), place_);
if (platform::is_cpu_place(place_)) {
*tmp = coeff_;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>
......@@ -69,8 +70,7 @@ class GradOpDescMakerBase {
" for input argument with a list of variables, "
" drop_empty_grad is not allowed because it makes"
" the correspondence bewteen a variable and its gradient"
" ambiguous. Use REGISTER_OP_EX to register the op"
" or call InputGrad(?,false) in GradOpDescMaker."
" ambiguous."
" Op type %s",
fwd_op_.Type());
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <string>
#include <tuple>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
......@@ -141,36 +143,6 @@ class OpKernelRegistrar : public Registrar {
return 0; \
}
/**
* Macro to register Operator. When the input is duplicable, you should
* use REGISTER_OP_EX with drop_empty_grad=false instead.
*/
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class, true)
// When an argument is duplicable, we need to use this version.
// Perhaps we can omit DropEmptyIG template parameter and
// only have one version of REGISTER_OP.
#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class, drop_empty_grad) \
REGISTER_OPERATOR(grad_op_type, grad_op_class); \
class _GradOpDescMaker_##grad_op_type##_ \
: public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
using ::paddle::framework::DefaultGradOpDescMaker< \
drop_empty_grad>::DefaultGradOpDescMaker; \
\
protected: \
virtual std::string GradOpType() const { return #grad_op_type; } \
}; \
REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
op_maker_class);
#define REGISTER_OP_WITH_KERNEL(op_type, ...) \
REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \
##__VA_ARGS__)
#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
REGISTER_OPERATOR(op_type, op_class, op_maker_class)
......
......@@ -44,6 +44,7 @@ class ParallelExecutorPrivate {
#endif
std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
bool own_local_scope;
};
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
......@@ -63,13 +64,16 @@ ParallelExecutor::ParallelExecutor(
// Step 1. Bcast the params to devs.
// Create local scopes
if (local_scopes.empty()) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope());
member_->own_local_scope = true;
member_->local_scopes_.emplace_back(member_->global_scope_);
for (size_t i = 1; i < member_->places_.size(); ++i) {
member_->local_scopes_.emplace_back(&scope->NewScope());
}
} else {
member_->own_local_scope = false;
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]);
member_->local_scopes_.emplace_back(local_scopes[i]);
}
}
......@@ -159,7 +163,9 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
platform::RecordBlock b(0);
// Create local scopes.
for (auto &scope : member_->local_scopes_) {
for (auto it = member_->local_scopes_.rbegin();
it != member_->local_scopes_.rend(); ++it) {
auto &scope = *it;
Scope &local_scope = scope->NewScope();
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
&local_scope;
......@@ -173,7 +179,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
} else {
InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
std::get<1>(name_type_pair));
}
}
......@@ -228,5 +234,13 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
ParallelExecutor::~ParallelExecutor() {
if (member_->own_local_scope) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
}
}
}
} // namespace framework
} // namespace paddle
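A reading of the scope-ownership convention introduced in the hunks above (an interpretation of the diff, stated here as comments):
// own_local_scope == true:  local_scopes_[0] aliases global_scope_ and is not
//                           owned; local_scopes_[1..N-1] come from NewScope()
//                           and are the ones deleted in ~ParallelExecutor()
//                           (hence the destructor loop starting at i = 1).
// own_local_scope == false: every scope is caller-provided, so the destructor
//                           deletes none of them.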
......@@ -42,6 +42,8 @@ class ParallelExecutor {
const std::vector<Scope*>& local_scopes,
bool allow_op_delay);
~ParallelExecutor();
std::vector<Scope*>& GetLocalScopes();
/**
......
......@@ -110,12 +110,12 @@ function(op_library TARGET)
# Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detailed pybind information, please see the generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
......
......@@ -469,8 +469,6 @@ REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid);
namespace ops = paddle::operators;
void DummyFunctor() {}
#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \
__macro(Sigmoid, sigmoid); \
__macro(Relu, relu); \
......
......@@ -648,7 +648,7 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"threshold", &threshold}};
}
bool Inplace() const { return IsInplace("softrelu"); }
bool Inplace() const { return IsInplace("soft_relu"); }
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
......
......@@ -153,9 +153,11 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
ops::BilinearTensorProductOpGrad);
REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp,
ops::BilinearTensorProductOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(bilinear_tensor_product_grad,
ops::BilinearTensorProductOpGrad)
REGISTER_OP_CPU_KERNEL(
bilinear_tensor_product,
ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -81,8 +81,9 @@ class ClipOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
ops::ClipOpGrad);
REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad)
REGISTER_OP_CPU_KERNEL(
clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -103,8 +103,10 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
ops::ConcatOpGrad, false)
REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
paddle::framework::DefaultGradOpDescMaker<
false> /* set false to disable empty grad */)
REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad)
REGISTER_OP_CPU_KERNEL(
concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
REGISTER_OP_CPU_KERNEL(
......
......@@ -335,14 +335,17 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad);
REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad)
// depthwise convolution op
REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);
REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad)
REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad)
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
......
......@@ -193,8 +193,9 @@ class ConvShiftGradKernel<platform::CPUPlace, T>
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
conv_shift_grad, ops::ConvShiftGradOp);
REGISTER_OPERATOR(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp)
REGISTER_OP_CPU_KERNEL(conv_shift,
ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -298,8 +298,10 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators;
REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad)
REGISTER_OP_CPU_KERNEL(
conv2d_transpose,
......@@ -311,8 +313,10 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
conv3d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
ops::Conv3DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad)
REGISTER_OP_CPU_KERNEL(
conv3d_transpose,
......
......@@ -153,8 +153,9 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
ops::CosSimOpGrad);
REGISTER_OPERATOR(cos_sim, ops::CosSimOp, ops::CosSimOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad)
REGISTER_OP_CPU_KERNEL(
cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -153,7 +153,9 @@ class CropOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
REGISTER_OP_CPU_KERNEL(
crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
......@@ -164,8 +164,9 @@ or not. But the output only shares the LoD information with input X.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp)
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
ops::CrossEntropyOpKernel<double>);
REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
......
......@@ -82,7 +82,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.memory_size();
auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
......@@ -99,7 +99,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
} else {
payload = tensor.data<void>();
}
payload_size = tensor.memory_size();
payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
case framework::proto::VarType_Type_SELECTED_ROWS: {
......@@ -118,7 +118,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->memory_size();
auto copy_size =
tensor->numel() * framework::SizeOfType(tensor->type());
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor->place()),
......@@ -133,7 +134,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
} else {
payload = slr->mutable_value()->data<void>();
}
payload_size = tensor->memory_size();
payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
default:
......
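A worked example of the payload-size computation introduced above, with hypothetical shapes and the Paddle headers and namespaces assumed:
// framework::LoDTensor t;
// t.Resize({4, 8});
// t.mutable_data<float>(platform::CPUPlace());
// // t.numel() == 32 and framework::SizeOfType(t.type()) == sizeof(float) == 4,
// // so payload_size = t.numel() * framework::SizeOfType(t.type()) == 128 bytes.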
......@@ -101,8 +101,9 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
ops::DropoutOpGrad);
REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad)
REGISTER_OP_CPU_KERNEL(
dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -30,8 +30,10 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
elementwise_div_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
ops::ElementwiseDivOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -29,8 +29,10 @@ class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker,
elementwise_max_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp,
ops::ElementwiseMaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_max,
ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -29,8 +29,10 @@ class ElementwiseMinOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker,
elementwise_min_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp,
ops::ElementwiseMinOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_min,
ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -31,8 +31,10 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
elementwise_mul_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
ops::ElementwiseMulOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -29,8 +29,10 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
elementwise_sub_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_sub, ops::ElementwiseOp,
ops::ElementwiseSubOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpGrad)
REGISTER_OP_CPU_KERNEL(
elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/expand_op.h"
#include <vector>
namespace paddle {
namespace operators {
......@@ -128,8 +130,9 @@ class ExpandGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
ops::ExpandGradOp);
REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp)
REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -98,5 +98,6 @@ FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
} // namespace operators
} // namespace paddle
REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad,
paddle::operators::FCOpGrad);
REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad)
......@@ -100,7 +100,8 @@ Out = [[3, 4],
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
ops::GatherGradOp);
REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp)
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
......@@ -216,7 +216,9 @@ class GRUGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(gru_grad, ops::GRUGradOp)
REGISTER_OP_CPU_KERNEL(
gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -198,8 +198,9 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
ops::GRUUnitGradOp);
REGISTER_OPERATOR(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp)
REGISTER_OP_CPU_KERNEL(
gru_unit, ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, float>,
ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -103,8 +103,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
hinge_loss_grad, ops::HingeLossGradOp);
REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp)
REGISTER_OP_CPU_KERNEL(
hinge_loss,
ops::HingeLossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -121,8 +121,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
huber_loss_grad, ops::HuberLossGradOp);
REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp)
REGISTER_OP_CPU_KERNEL(
huber_loss,
ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -148,8 +148,9 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
im2sequence_grad, ops::Im2SequenceGradOp);
REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp)
REGISTER_OP_CPU_KERNEL(
im2sequence,
ops::Im2SequenceKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -67,8 +67,9 @@ $$Out = \sum{|X|}$$
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad,
ops::L1NormGradOp);
REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp)
REGISTER_OP_CPU_KERNEL(
l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -117,8 +117,9 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
label_smooth_grad, ops::LabelSmoothGradOp);
REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp)
REGISTER_OP_CPU_KERNEL(
label_smooth,
ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -162,8 +162,9 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
layer_norm_grad, ops::LayerNormGradOp);
REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp)
REGISTER_OP_CPU_KERNEL(
layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -256,8 +256,10 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker,
linear_chain_crf_grad, ops::LinearChainCRFGradOp);
REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
ops::LinearChainCRFOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp)
REGISTER_OP_CPU_KERNEL(
linear_chain_crf,
ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -155,8 +155,9 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
ops::LoDResetGradOp);
REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp)
REGISTER_OP_CPU_KERNEL(
lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
......
......@@ -106,8 +106,9 @@ class LogLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
ops::LogLossGradOp);
REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp)
REGISTER_OP_CPU_KERNEL(
log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -276,7 +276,9 @@ class LRNOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker<float>, lrn_grad, ops::LRNOpGrad);
REGISTER_OPERATOR(lrn, ops::LRNOp, ops::LRNOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad)
REGISTER_OP_CPU_KERNEL(
lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -273,7 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp)
REGISTER_OP_CPU_KERNEL(
lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -97,8 +97,9 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
ops::LstmUnitGradOp);
REGISTER_OPERATOR(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp)
REGISTER_OP_CPU_KERNEL(lstm_unit,
ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
......
......@@ -322,8 +322,9 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
ops::LSTMPGradOp);
REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp)
REGISTER_OP_CPU_KERNEL(
lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -111,9 +111,10 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp,
ops::MarginRankLossOpMaker<float>, margin_rank_loss_grad,
ops::MarginRankLossGradOp);
REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
ops::MarginRankLossOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp)
REGISTER_OP_CPU_KERNEL(
margin_rank_loss,
ops::MarginRankLossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -237,8 +237,9 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
ops::MatMulOpGrad);
REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad)
REGISTER_OP_CPU_KERNEL(
matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -101,8 +101,9 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
ops::MaxOutOpGrad);
REGISTER_OPERATOR(maxout, ops::MaxOutOp, ops::MaxOutOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad)
REGISTER_OP_CPU_KERNEL(
maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -108,9 +108,10 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad,
ops::ModifiedHuberLossGradOp);
REGISTER_OPERATOR(modified_huber_loss, ops::ModifiedHuberLossOp,
ops::ModifiedHuberLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp)
REGISTER_OP_CPU_KERNEL(
modified_huber_loss,
......
......@@ -160,7 +160,9 @@ class MulGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/nce_op.h"
#include <vector>
namespace paddle {
namespace operators {
......@@ -179,7 +181,9 @@ class NCEOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad)
REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
ops::NCEKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(nce_grad,
......
......@@ -85,8 +85,9 @@ class NormOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
ops::NormOpGrad);
REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker<float>,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(norm_grad, ops::NormOpGrad)
REGISTER_OP_CPU_KERNEL(
norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
......
......@@ -333,8 +333,9 @@ Example:
namespace ops = paddle::operators;
REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
ops::PoolOpGrad);
REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad)
REGISTER_OP_CPU_KERNEL(
pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
......@@ -343,8 +344,9 @@ REGISTER_OP_CPU_KERNEL(
pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
ops::PoolOpGrad);
REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad)
REGISTER_OP_CPU_KERNEL(
pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -258,9 +258,10 @@ Example:
namespace ops = paddle::operators;
REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad,
ops::MaxPoolWithIndexOpGrad);
REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
ops::MaxPool2dWithIndexOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad)
REGISTER_OP_CPU_KERNEL(
max_pool2d_with_index,
......@@ -274,9 +275,10 @@ REGISTER_OP_CPU_KERNEL(
ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
int>)
REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
ops::MaxPoolWithIndexOpGrad);
REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
ops::MaxPool3dWithIndexOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad)
REGISTER_OP_CPU_KERNEL(
max_pool3d_with_index,
......
......@@ -83,8 +83,9 @@ class PReluGradOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
ops::PReluGradOp);
REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp)
REGISTER_OP_CPU_KERNEL(
prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -121,8 +121,9 @@ class RankLossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad,
ops::RankLossGradOp);
REGISTER_OPERATOR(rank_loss, ops::RankLossOp, ops::RankLossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp)
REGISTER_OP_CPU_KERNEL(
rank_loss, ops::RankLossKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/reduce_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
......@@ -122,18 +125,18 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
protected:
std::string comment_;
void Replace(std::string &src, std::string from, std::string to) {
void Replace(std::string *src, std::string from, std::string to) {
std::size_t len_from = std::strlen(from.c_str());
std::size_t len_to = std::strlen(to.c_str());
for (std::size_t pos = src.find(from); pos != std::string::npos;
pos = src.find(from, pos + len_to)) {
src.replace(pos, len_from, to);
for (std::size_t pos = src->find(from); pos != std::string::npos;
pos = src->find(from, pos + len_to)) {
src->replace(pos, len_from, to);
}
}
void SetComment(std::string name, std::string op) {
Replace(comment_, "{ReduceOp}", name);
Replace(comment_, "{reduce}", op);
Replace(&comment_, "{ReduceOp}", name);
Replace(&comment_, "{reduce}", op);
}
};
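A small usage sketch of the pointer-taking Replace and SetComment above, with a hypothetical comment template:
// std::string comment = "{ReduceOp} Operator computes the {reduce} of the input.";
// Replace(&comment, "{ReduceOp}", "ReduceSum");
// Replace(&comment, "{reduce}", "sum");
// // comment == "ReduceSum Operator computes the sum of the input."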
......@@ -187,20 +190,25 @@ class ReduceProdOpMaker : public ReduceOpMaker {
namespace ops = paddle::operators;
REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp)
REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
reduce_mean_grad, ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp)
REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp)
REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp)
REGISTER_OP(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
reduce_prod_grad, ops::ReduceGradOp);
REGISTER_OPERATOR(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp)
#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL(reduce_type, \
......
......@@ -113,8 +113,9 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad,
ops::ReshapeGradOp);
REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp)
REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel<CPU, float>,
ops::ReshapeKernel<CPU, double>,
ops::ReshapeKernel<CPU, int>,
......
......@@ -153,8 +153,9 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
ops::ROIPoolGradOp);
REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp)
REGISTER_OP_CPU_KERNEL(
roi_pool,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -250,8 +250,9 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad,
ops::RowConvGradOp);
REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp)
REGISTER_OP_CPU_KERNEL(
row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -102,7 +102,8 @@ $$
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
ops::ScatterGradOp);
REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp)
REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
......@@ -124,9 +124,11 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
ops::SequenceConcatOpMaker, sequence_concat_grad,
ops::SequenceConcatGradOp, false);
REGISTER_OPERATOR(sequence_concat, ops::SequenceConcatOp,
ops::SequenceConcatOpMaker,
paddle::framework::DefaultGradOpDescMaker<
false> /* set false to disable empty grad */)
REGISTER_OPERATOR(sequence_concat_grad, ops::SequenceConcatGradOp);
REGISTER_OP_CPU_KERNEL(
sequence_concat,
ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/sequence_conv_op.h"
#include <algorithm>
namespace paddle {
namespace operators {
......@@ -174,8 +176,9 @@ context_length, context_stride and context_start.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
sequence_conv_grad, ops::SequenceConvGradOp);
REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp)
REGISTER_OP_CPU_KERNEL(
sequence_conv,
......
......@@ -200,8 +200,10 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
sequence_expand_grad, ops::SequenceExpandOpGrad);
REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp,
ops::SequenceExpandOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad)
REGISTER_OP_CPU_KERNEL(
sequence_expand,
ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -120,8 +120,10 @@ NOTE: The first dimension size of input, the size of offset and Length, should b
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
sequence_slice_grad, ops::SequenceSliceGradOp);
REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp,
ops::SequenceSliceOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp)
REGISTER_OP_CPU_KERNEL(
sequence_slice,
ops::SequenceSliceOpKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -155,9 +155,10 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
ops::SequenceSoftmaxGradOp);
REGISTER_OPERATOR(sequence_softmax, ops::SequenceSoftmaxOp,
ops::SequenceSoftmaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp)
REGISTER_OP_CPU_KERNEL(
sequence_softmax,
ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -135,11 +135,12 @@ However the output only shares the LoD with input `X`.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(sigmoid_cross_entropy_with_logits,
ops::SigmoidCrossEntropyWithLogitsOp,
ops::SigmoidCrossEntropyWithLogitsOpMaker,
sigmoid_cross_entropy_with_logits_grad,
ops::SigmoidCrossEntropyWithLogitsGradOp);
REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits,
ops::SigmoidCrossEntropyWithLogitsOp,
ops::SigmoidCrossEntropyWithLogitsOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad,
ops::SigmoidCrossEntropyWithLogitsGradOp)
REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
ops::SigmoidCrossEntropyWithLogitsKernel<
paddle::platform::CPUDeviceContext, float>);
......
......@@ -132,8 +132,9 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp)
REGISTER_OP_CPU_KERNEL(
smooth_l1_loss,
ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -160,8 +160,9 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad,
ops::SoftmaxOpGrad);
REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad)
REGISTER_OP_CPU_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_byref_op.h"
#include "paddle/fluid/operators/split_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class SplitByrefOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SplitOp should not be null.");
PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
"Outputs(Out) of SplitOp should not be empty.");
auto in_dims = ctx->GetInputDim("X");
auto outs_names = ctx->Outputs("Out");
size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
std::vector<int> sections = static_cast<std::vector<int>>(
ctx->Attrs().Get<std::vector<int>>("sections"));
const size_t outs_number = outs_names.size();
std::vector<framework::DDim> outs_dims;
outs_dims.reserve(outs_number);
if (num > 0) {
int64_t in_axis_dim = in_dims[0];
PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
"tensor split does not result"
" in an equal division");
size_t out_axis_dim = in_axis_dim / num;
for (size_t i = 0; i < outs_number; ++i) {
auto dim = in_dims;
dim[0] = out_axis_dim;
outs_dims.push_back(dim);
}
} else if (sections.size() > 0) {
PADDLE_ENFORCE_EQ(sections.size(), outs_number,
"tensor split sections size"
"should be equal to output size.");
for (size_t i = 0; i < outs_number; ++i) {
auto dim = in_dims;
dim[0] = sections[i];
outs_dims.push_back(dim);
}
}
ctx->SetOutputsDim("Out", outs_dims);
}
};
class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SplitByrefOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) Input tensor of the split operator.");
AddOutput("Out", "(Tensor) Output tensors of the split operator.")
.AsDuplicable();
AddComment(R"DOC(
SplitByref operator
Split the source tensor into several tensors along axis 0. No copy is performed
in this operator; the output tensors share the same blocks of memory as the input.
)DOC");
AddAttr<std::vector<int>>("sections",
"(vector<int>) "
"the length of each output along the "
"specified axis.")
.SetDefault(std::vector<int>{});
AddAttr<int>("num",
"(int, default 0)"
"Number of sub-tensors. This must evenly divide "
"Input.dims()[axis]")
.SetDefault(0);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// NOTE: concat op default axis must be 0!
USE_CPU_ONLY_OP(concat);
REGISTER_OPERATOR(split_byref, ops::SplitByrefOp, ops::SplitByrefOpMaker,
ops::SplitGradMaker);
REGISTER_OP_CPU_KERNEL(
split_byref, ops::SplitByrefOpKernel<paddle::platform::CPUPlace, float>);
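The shape-inference rule above is compact but easy to misread in diff form. The following plain-Python sketch (a hypothetical helper, not part of the operator code) restates how the output dims are derived from either `num` or `sections`:
```python
def infer_split_dims(in_dims, outs_number, num=0, sections=None):
    # Mirrors SplitByrefOp::InferShape: split along axis 0 either into
    # `num` equal pieces or into pieces sized by `sections`.
    sections = sections or []
    outs_dims = []
    if num > 0:
        assert in_dims[0] % num == 0, "tensor split does not result in an equal division"
        out_axis_dim = in_dims[0] // num
        outs_dims = [[out_axis_dim] + list(in_dims[1:]) for _ in range(outs_number)]
    elif sections:
        assert len(sections) == outs_number, "sections size should be equal to output size"
        outs_dims = [[s] + list(in_dims[1:]) for s in sections]
    return outs_dims

# e.g. infer_split_dims([6, 4], outs_number=3, num=3) -> [[2, 4], [2, 4], [2, 4]]
```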
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_byref_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
split_byref,
ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SplitByrefOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto place = ctx.GetPlace();
size_t row_offset = 0;
for (size_t i = 0; i < outs.size(); ++i) {
// NOTE: no need to call mutable_data here to allocate memory.
auto* out = outs[i];
VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
*out = std::move(in->Slice(row_offset, row_offset + out->dims()[0]));
row_offset += out->dims()[0];
}
}
};
} // namespace operators
} // namespace paddle
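A rough way to picture the "by ref" behavior of this kernel is NumPy slicing, where each output is a view over the input's rows rather than a copy (illustrative only; the real kernel uses `Tensor::Slice`):
```python
import numpy as np

def split_byref(x, sections):
    # Each output aliases a contiguous block of rows of `x`; nothing is copied.
    outs, row_offset = [], 0
    for rows in sections:
        outs.append(x[row_offset:row_offset + rows])  # a view, not a copy
        row_offset += rows
    return outs

x = np.arange(12).reshape(6, 2)
a, b = split_byref(x, [4, 2])
assert np.shares_memory(a, x) and np.shares_memory(b, x)
```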
......@@ -108,21 +108,6 @@ Example:
}
};
class SplitGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto op = new framework::OpDesc();
op->SetType("concat");
op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X"));
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
......
......@@ -44,5 +44,20 @@ class SplitOpKernel : public framework::OpKernel<T> {
}
};
class SplitGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto op = new framework::OpDesc();
op->SetType("concat");
op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X"));
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
......@@ -92,7 +92,9 @@ class SppOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
REGISTER_OPERATOR(spp, ops::SppOp, ops::SppOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(spp_grad, ops::SppOpGrad)
REGISTER_OP_CPU_KERNEL(
spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -109,9 +109,10 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp,
ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad,
ops::SquaredL2DistanceGradOp);
REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp,
ops::SquaredL2DistanceOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp)
REGISTER_OP_CPU_KERNEL(
squared_l2_distance,
ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -67,8 +67,10 @@ $$Out = \sum_{i} X_{i}^2$$
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
squared_l2_norm_grad, ops::SquaredL2NormGradOp);
REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp,
ops::SquaredL2NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp)
REGISTER_OP_CPU_KERNEL(
squared_l2_norm,
ops::SquaredL2NormKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -24,7 +24,6 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
......@@ -36,9 +35,9 @@ class TopkKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
// Get the top k elements of each row of input tensor
// FIXME: only deal with matrix(2d tensor).
auto* input = ctx.Input<LoDTensor>("X");
auto* output = ctx.Output<LoDTensor>("Out");
auto* indices = ctx.Output<LoDTensor>("Indices");
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices");
// k is determined by Attr
const size_t k = static_cast<int>(ctx.Attr<int>("k"));
......
......@@ -118,8 +118,9 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad,
ops::TransposeOpGrad);
REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad)
REGISTER_OP_CPU_KERNEL(
transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -132,8 +132,9 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
ops::UnpoolOpGrad);
REGISTER_OPERATOR(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad)
REGISTER_OP_CPU_KERNEL(
unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>);
......
......@@ -132,8 +132,9 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad,
ops::WarpCTCGradOp);
REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>)
REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp)
REGISTER_OP_CPU_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -39,20 +39,19 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
class NCCLGroupGuard {
public:
static std::mutex &NCCLMutex() {
static std::mutex mtx;
return mtx;
}
inline NCCLGroupGuard() {
mutex().lock();
NCCLMutex().lock();
PADDLE_ENFORCE(dynload::ncclGroupStart());
}
inline ~NCCLGroupGuard() {
PADDLE_ENFORCE(dynload::ncclGroupEnd());
mutex().unlock();
}
private:
static std::mutex &mutex() {
static std::mutex mtx;
return mtx;
NCCLMutex().unlock();
}
};
......@@ -68,26 +67,6 @@ struct NCCLContext {
int device_id() const {
return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
}
static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
const std::vector<platform::Place> &places) {
std::vector<ncclComm_t> comms;
std::vector<int> devs;
comms.resize(contexts->size());
devs.reserve(contexts->size());
for (auto &p : places) {
devs.push_back(boost::get<platform::CUDAPlace>(p).device);
}
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(contexts->size()), &devs[0]));
int i = 0;
for (auto &dev_id : devs) {
contexts->at(dev_id).comm_ = comms[i++];
}
}
};
struct NCCLContextMap {
......@@ -107,12 +86,12 @@ struct NCCLContextMap {
"NCCL Context Map does not support contain two or more same device");
if (places.size() > 1) {
std::vector<ncclComm_t> comms;
comms.resize(order_.size());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(order_.size()), &order_[0]));
std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
{
std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
comms.get(), static_cast<int>(order_.size()), order_.data()));
}
int i = 0;
for (auto &dev_id : order_) {
contexts_.at(dev_id).comm_ = comms[i++];
......@@ -120,6 +99,9 @@ struct NCCLContextMap {
}
}
NCCLContextMap(const NCCLContextMap &other) = delete;
NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
CUDADeviceContext *DevCtx(platform::Place p) const {
......
......@@ -420,13 +420,14 @@ class DistributeTranspiler:
# append op to the current block
per_opt_block = append_block
for _, opt_op in enumerate(opt_op_on_pserver):
for idx, opt_op in enumerate(opt_op_on_pserver):
for _, op in enumerate(self.optimize_ops):
# optimizer is connected to itself
if ufind.is_connected(op, opt_op) and \
op not in global_ops:
__append_optimize_op__(op, per_opt_block)
per_opt_block = pserver_program.create_block(append_block.idx)
if idx == len(opt_op_on_pserver) - 1 and global_ops:
per_opt_block = pserver_program.create_block(append_block.idx)
# append global ops
for glb_op in global_ops:
......@@ -824,7 +825,7 @@ class DistributeTranspiler:
for v in splited_vars:
sections.append(v.shape[0])
program.global_block().append_op(
type="split",
type="split_byref",
inputs={"X": orig_var},
outputs={"Out": splited_vars},
attrs={"sections": sections} # assume split evenly
......
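For context, the `sections` attribute built in this hunk is simply the first dimension of each already-split variable; a small sketch with made-up shapes:
```python
# Shapes of the split variables (hypothetical numbers for illustration).
splited_shapes = [(3, 8), (3, 8), (2, 8)]
sections = [shape[0] for shape in splited_shapes]
assert sections == [3, 3, 2] and sum(sections) == 8  # recovers the original axis-0 size
```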
......@@ -32,7 +32,6 @@ __all__ = [
'Switch',
'lod_rank_table',
'max_sequence_len',
'topk',
'lod_tensor_to_array',
'array_to_lod_tensor',
'increment',
......@@ -751,43 +750,6 @@ def max_sequence_len(rank_table):
return res
def topk(input, k):
"""
**topk**
This function performs the operation that selects the k entries in the input
vector and outputs their values and indices as vectors. Thus topk_out[j] is
the j-th largest entry in input, and its index is topk_indices[j]
Args:
input (Variable|list): The input tensor that has all the data.
k (int): The number of top elements that the function will pick.
Returns:
Variable: The variable of type array that contains the k largest entries
from input.
Variable: The variable of type array that contains the indices of k
largest entries from input.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10])
k = 5
array = fluid.layers.topk(x, k)
"""
helper = LayerHelper('topk', **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype='int64')
helper.append_op(
type='top_k',
inputs={'X': [input]},
outputs={'Out': [topk_out],
'Indices': [topk_indices]},
attrs={'k': k})
return topk_out, topk_indices
def lod_tensor_to_array(x, table):
""" Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
......
......@@ -20,7 +20,7 @@ from ..initializer import init_on_cpu
__all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
'polynomial_decay', 'piecewise_decay'
'polynomial_decay', 'piecewise_decay', 'noam_decay'
]
"""
When training a model, it's often useful to decay the
......@@ -32,14 +32,41 @@ strategy according to this module.
"""
def _decay_step_counter():
def _decay_step_counter(begin=0):
# the first global step is zero in learning rate decay
global_step = nn.autoincreased_step_counter(
counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
global_step = tensor.cast(global_step, 'float32')
return global_step
def noam_decay(d_model, warmup_steps):
"""Apply decay to learning rate.
```python
lr_value = np.power(d_model, -0.5) * np.min([
np.power(current_steps, -0.5),
np.power(warmup_steps, -1.5) * current_steps
])
```
Args:
d_model(Variable): The dimensionality of input and output of model.
Reference: attention is all you need
https://arxiv.org/pdf/1706.03762.pdf
warmup_steps(Variable): A super parameter.
Returns:
The decayed learning rate.
"""
global_step = _decay_step_counter(1)
with init_on_cpu():
a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
return lr_value
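As a sanity check, the schedule can be evaluated outside the graph with plain Python arithmetic (the `d_model` and `warmup_steps` values below are arbitrary examples, not defaults of the API):
```python
def noam_lr(step, d_model=512, warmup_steps=4000):
    # Same formula as the docstring: warmup, then inverse-square-root decay.
    return d_model ** -0.5 * min(step ** -0.5, warmup_steps ** -1.5 * step)

schedule = [noam_lr(s) for s in (1, 1000, 4000, 16000)]
# The learning rate rises during warmup, peaks near step == warmup_steps,
# and decays as 1/sqrt(step) afterwards.
```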
def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
"""Applies exponential decay to the learning rate.
......
......@@ -20,6 +20,7 @@ from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr
import nn
__all__ = ['accuracy', 'auc']
......@@ -27,17 +28,10 @@ __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None):
"""
This function computes the accuracy using the input and label.
The output is the top_k inputs and their indices.
The output is the top k inputs and their indices.
"""
helper = LayerHelper("accuracy", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
topk_out, topk_indices = nn.topk(input, k=k)
acc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
......@@ -68,12 +62,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
helper = LayerHelper("auc", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
topk_out, topk_indices = nn.topk(input, k=k)
auc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
......
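The rewritten `accuracy` (and `auc`) now reuse `nn.topk`; the metric itself is the usual top-k accuracy, sketched here in NumPy with made-up scores:
```python
import numpy as np

def topk_accuracy(scores, labels, k=1):
    # A sample is correct if its label is among the k highest-scoring classes.
    topk_idx = np.argsort(-scores, axis=-1)[:, :k]
    hits = (topk_idx == labels.reshape(-1, 1)).any(axis=1)
    return hits.mean()

scores = np.array([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]])
labels = np.array([1, 2])
print(topk_accuracy(scores, labels, k=2))  # 0.5: only the first sample hits
```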
......@@ -60,6 +60,7 @@ __all__ = [
'edit_distance',
'l2_normalize',
'matmul',
'topk',
'warpctc',
'sequence_reshape',
'transpose',
......@@ -2576,6 +2577,53 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
return out
def topk(input, k):
"""
This operator is used to find values and indices of the k largest entries
for the last dimension.
If the input is a vector (rank=1), finds the k largest entries in the vector
and outputs their values and indices as vectors. Thus values[j] is the j-th
largest entry in input, and its index is indices[j].
If the input is a Tensor with higher rank, this operator computes the top k
entries along the last dimension.
Args:
input(Variable): The input variable which can be a vector or Tensor with
higher rank.
k(int): An integer value to specify the top k largest elements.
Returns:
values(Variable): The k largest elements along each last dimensional
slice.
indices(Variable): The indices of values within the last dimension of
input.
Examples:
.. code-block:: python
top5_values, top5_indices = layers.topk(input, k=5)
"""
shape = input.shape
if k < 1 or k >= shape[-1]:
raise ValueError("k must be greater than 0 and less than %d." %
(shape[-1]))
helper = LayerHelper("top_k", **locals())
values = helper.create_tmp_variable(dtype=input.dtype)
indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [values],
"Indices": [indices]},
attrs={"k": k})
values.stop_gradient = True
indices.stop_gradient = True
return values, indices
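A NumPy reference for what `topk` returns along the last dimension (illustrative only; inside a fluid program the computation is done by the `top_k` operator):
```python
import numpy as np

def topk_ref(x, k):
    idx = np.argsort(-x, axis=-1)[..., :k]       # indices of the k largest entries
    vals = np.take_along_axis(x, idx, axis=-1)   # their values
    return vals, idx

x = np.array([[1.0, 5.0, 3.0], [9.0, 2.0, 4.0]])
vals, idx = topk_ref(x, k=2)
# vals -> [[5., 3.], [9., 4.]], idx -> [[1, 2], [0, 2]]
```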
def edit_distance(input, label, normalized=True, ignored_tokens=None,
name=None):
"""
......@@ -2717,15 +2765,7 @@ def ctc_greedy_decoder(input, blank, name=None):
cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
"""
helper = LayerHelper("ctc_greedy_decoder", **locals())
# top 1 op
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": 1})
_, topk_indices = topk(input, k=1)
# ctc align op
ctc_out = helper.create_tmp_variable(dtype="int64")
......
......@@ -16,6 +16,7 @@ import core
import multiprocessing
import framework
import executor
import warnings
import sys
__all__ = ['ParallelExecutor']
......@@ -62,8 +63,8 @@ class ParallelExecutor(object):
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
test_loss, = test_exe.run([loss.name], feed=feed_dict)
"""
self._places = []
......@@ -103,8 +104,8 @@ class ParallelExecutor(object):
self.persistable_vars = [
v.name
for v in filter(lambda var: \
var.persistable and var.type != core.VarDesc.VarType.RAW,
for v in filter(
lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
main.list_vars())
]
......@@ -163,7 +164,7 @@ class ParallelExecutor(object):
Returns: fetched result list.
"""
if feed is None:
if feed is None and feed_dict is not None:
feed = feed_dict
print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
......