Commit e9233d1c authored by Jiabin Yang, committed by XiaoguangHu

Refactor dygraph (#19107)

* refactor dygraph,test=develop

* fix failed unittest,test=develop

* polish code,test=develop

* check windows ci error,test=develop
try to fix windows ci error by np.allclose,test=develop

* polish vlog and profiler, test=develop

* try to fix preceding ops order,test=develop

* test transformer in windows ci, test=develop

* use python c-api to speed up tracer.trace,test=develop

* test=develop, fix docker with paddle nccl problem

* test=develop, add ut for debug string and gradient_accumulator

* test=develop, add tests for layer/gradient_accumulator/prepared_op

* test=develop, fix compile error for test_prepared_op

* test=develop, add more ut for dygraph

* test=develop, create API.spec for dygraph api change

* test=develop, refactor name to make it easier to understand

* test=develop, refactor name to make it easier to understand

* test=develop, fix multi-gpu failed problem, add Tracer tests, change PADDLE_ENFORCE to PADDLE_ENFORCE_EQ

* test=develop, fix ut failed on parallel se-resnext

* test=develop, change one more PADDLE_ENFORCE
Parent dca9b6c5
@@ -820,11 +820,11 @@ paddle.fluid.dygraph.TreeConv.state_dict (ArgSpec(args=['self', 'destination', '
 paddle.fluid.dygraph.TreeConv.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
 paddle.fluid.dygraph.TreeConv.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.Tracer ('paddle.fluid.dygraph.tracer.Tracer', ('document', '28d72409112111274c33e1f07229d5da'))
-paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self', 'block'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.Tracer.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.Tracer.eval_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None
+paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None
-paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'op', 'inputs', 'outputs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'type', 'inputs', 'outputs', 'attrs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.Tracer.trace_var (ArgSpec(args=['self', 'name', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......
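The API change above keys the tracer on the op-type string instead of a pre-built OpBase, and `trace_op` now takes an explicit `attrs` map. The new path is exercised indirectly through any layer call under the dygraph guard; a minimal sketch (array values are illustrative, and this assumes a Paddle Fluid build containing this commit):

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable

with fluid.dygraph.guard():
    x = to_variable(np.ones([2, 2], dtype='float32'))
    y = to_variable(np.full([2, 2], 2.0, dtype='float32'))
    # Internally dispatched as Tracer.trace_op('elementwise_add', inputs,
    # outputs, attrs, stop_gradient) rather than through a Python OpBase.
    z = fluid.layers.elementwise_add(x, y)
    print(z.numpy())  # [[3. 3.] [3. 3.]]
```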
 cc_library(imperative_flag SRCS flags.cc DEPS gflags)
-if(WITH_PYTHON)
-cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag)
-cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler)
-cc_library(engine SRCS engine.cc)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits)
+cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows var_type_traits layer)
+cc_library(tracer SRCS tracer.cc DEPS layer engine)
+cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc)
 cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
-cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
-endif()
+add_subdirectory(tests)
@@ -16,17 +16,12 @@
 // Created by Jiabin on 2019-04-25.
 //
 #pragma once
-#ifndef PADDLE_BACKWARDSTRATEGY_H
-#define PADDLE_BACKWARDSTRATEGY_H
-#endif  // PADDLE_BACKWARDSTRATEGY_H
 namespace paddle {
 namespace imperative {
 namespace detail {
-class BackwardStrategy {
- public:
+struct BackwardStrategy {
  /* DyGraph now supports two kinds of backward strategy: one is sorted sum
   * gradient, the other sums gradients as soon as they are created */
  // TODO(jiabin): add more Strategy when we support
......
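The two strategies named in the comment map onto the two accumulator types introduced later in this commit: `sorted_sum_gradient_` selects SortedGradientAccumulator, otherwise EagerGradientAccumulator is used. A hedged sketch of toggling the strategy from Python, mirroring the doc examples of this era (`sort_sum_gradient` is assumed to be the pybind-exposed name of `sorted_sum_gradient_`):

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable

with fluid.dygraph.guard():
    inputs = []
    for _ in range(5):
        t = to_variable(np.ones([2, 2], dtype='float32'))
        t.stop_gradient = False   # let the var participate in autograd
        inputs.append(t)
    loss = fluid.layers.reduce_sum(fluid.layers.sums(inputs))
    strategy = fluid.dygraph.BackwardStrategy()
    strategy.sort_sum_gradient = True   # choose the sorted-sum path
    loss.backward(strategy)
    print(inputs[0].gradient())         # all ones
```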
@@ -14,40 +14,219 @@
 #include "paddle/fluid/imperative/engine.h"
-#include <mutex>  // NOLINT
+#include <algorithm>
+#include <memory>
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
 #include <vector>
-#include "glog/logging.h"
+#include "paddle/fluid/imperative/gradient_accumulator.h"
+#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace imperative {
-static std::once_flag init_engine;
-static Engine* engine;
-
-class DummyEngine : public Engine {
- public:
-  void Enqueue(Runnable* runnable) override {
-    queued_runnables_.push_back(runnable);
-  }
-
-  size_t Size() const override { return queued_runnables_.size(); }
-
-  void Sync() override {
-    for (Runnable* l : queued_runnables_) {
-      LOG(INFO) << "running " << reinterpret_cast<void*>(l);
-    }
-    queued_runnables_.clear();
-  }
-
- private:
-  std::vector<Runnable*> queued_runnables_;
-};
-
-Engine* GetEngine() {
-  std::call_once(init_engine, []() { engine = new DummyEngine(); });
-  return engine;
-}
+void Engine::RunOp(paddle::imperative::OpBase* op,
+                   const paddle::imperative::NameVarBaseMap& ins,
+                   const paddle::imperative::NameVarBaseMap& outs,
+                   const paddle::platform::Place& place) {
+  platform::RecordEvent event(op->Type());
+  op->Run(ins, outs);
+}
+
+void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) {
+  backward_strategy_ = strategy;
+  const std::vector<OpBase*> ops = var->GradVarBase()->GradOps();
+  var->ClearGradOps();
+
+  if (ops.empty()) {
+    VLOG(3) << "Skip auto grad since there is no grad op for var: "
+            << var->Name();
+    return;
+  } else {
+    bool valid = false;
+    for (const auto& op : ops) {
+      if (op) {
+        valid = true;
+      }
+    }
+    if (!valid) {
+      VLOG(3) << "Skip auto grad since all grad ops of start VarBase are nullptr";
+      return;
+    }
+  }
+
+  init_ops_ = ops;
+  platform::RecordEvent record_event("Imperative Backward");
+  VLOG(3) << "start backward";
+
+  PADDLE_ENFORCE_EQ(var->HasGradVar(), true,
+                    "Grad variable not exist for variable %s", var->Name());
+
+  auto& fwd_var = var->Var().Get<framework::LoDTensor>();
+  auto* grad_var =
+      var->GradVarBase()->MutableVar()->GetMutable<framework::LoDTensor>();
+  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place());
+  grad_var->Resize(fwd_var.dims());
+  grad_var->mutable_data(fwd_var.place(), fwd_var.type());
+  operators::math::set_constant(*dev_ctx, grad_var, 1.0);
+}
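`Init` records the starting grad ops and seeds the starting variable's gradient with ones of the forward tensor's shape, which is why a backward pass can begin from a non-scalar variable. A small sketch of the observable behavior, assuming a build containing this commit:

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable

with fluid.dygraph.guard():
    x = to_variable(np.arange(4, dtype='float32').reshape(2, 2))
    x.stop_gradient = False
    y = fluid.layers.scale(x, scale=3.0)
    y.backward()            # Init() seeds dL/dy = ones(y.shape)
    print(x.gradient())     # every entry is 3.0
```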
+bool BasicEngine::CheckBackwardInputs(OpBase* op) {
+  for (auto& pair : op->GetInsMap()) {
+    for (auto& var : pair.second) {
+      if (var && !var->StopGradient()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void BasicEngine::PrepareGradAccumulators(OpBase* op) {
+  for (const auto& pair : op->GetOutsMap()) {
+    for (const auto& var : pair.second) {
+      if (!var) continue;
+
+      auto& accumulator = accumulators_[var.get()];
+      if (!accumulator) {
+        if (backward_strategy_.sorted_sum_gradient_) {
+          accumulator.reset(new SortedGradientAccumulator(var.get()));
+        } else {
+          accumulator.reset(new EagerGradientAccumulator(var.get()));
+        }
+      }
+
+      accumulator->IncreaseRefCnt();
+
+      VLOG(3) << "Prepare to accumulate variable grad " << var->Name()
+              << " with reference count " << accumulator->RefCnt();
+    }
+  }
+}
+
+void BasicEngine::PrepareDeps() {
+  PADDLE_ENFORCE_EQ(op_deps_.empty(), true, "Op deps must be initialized here");
+  PADDLE_ENFORCE_EQ(accumulators_.empty(), true,
+                    "Accumulators must be initialized here");
+
+  std::queue<OpBase*> q;
+  std::unordered_set<OpBase*> visited;
+  for (const auto& init_op : init_ops_) {
+    q.push(init_op);
+    visited.insert(init_op);
+  }
+
+  while (!q.empty()) {
+    auto* cur_op = q.front();
+    q.pop();
+    VLOG(3) << "Checking grads of op " << cur_op->Type();
+
+    if (!CheckBackwardInputs(cur_op)) {
+      // TODO(zjl): clear ops that do not need grad before running autograd
+      VLOG(3) << "Stop checking preceding ops of " << cur_op->Type()
+              << " because all of its backward inputs are stop_gradient=True";
+      continue;
+    }
+
+    PrepareGradAccumulators(cur_op);
+
+    auto& preceding_ops = cur_op->GradPendingOps();
+    for (auto* preceding_op : preceding_ops) {
+      PADDLE_ENFORCE_NOT_NULL(preceding_op);
+      ++op_deps_[preceding_op];
+      if (visited.count(preceding_op) == 0) {
+        visited.insert(preceding_op);
+        q.push(preceding_op);
+      }
+    }
+  }
+}
+
+void BasicEngine::SumGradient(OpBase* op, std::shared_ptr<VarBase> src,
+                              VarBase* dst) {
+  auto iter = accumulators_.find(dst);
+  PADDLE_ENFORCE_EQ(iter != accumulators_.end(), true,
+                    "Cannot find gradient of variable %s", dst->Name());
+  iter->second->Add(std::move(src), op->id());
+}
+
+void BasicEngine::Execute() {
+  PrepareDeps();
+  // Start executing the computation graph
+  std::queue<OpBase*> q;
+  for (const auto& init_op : init_ops_) {
+    q.push(init_op);
+  }
+
+  while (!q.empty()) {
+    OpBase* cur_op = q.front();
+    q.pop();
+
+    // Step 1: Run Backward
+    auto& bwd_ins = cur_op->GetInsMap();
+    auto& bwd_outs = cur_op->GetOutsMap();
+
+    NameVarBaseMap tmp_outs;
+    // A var may correspond to several grad vars in one op
+    std::unordered_map<VarBase*, std::vector<std::shared_ptr<VarBase>>> var_map;
+    size_t counter = 0;
+    for (auto& bwd_out : bwd_outs) {
+      auto& tmp_var_list = tmp_outs[bwd_out.first];
+      tmp_var_list.reserve(bwd_out.second.size());
+      for (auto& var : bwd_out.second) {
+        auto tmp_var = std::make_shared<VarBase>(
+            false, "Gtmp@" + std::to_string(counter++));  // Does not need grad
+        tmp_var_list.emplace_back(tmp_var);
+        if (var) {
+          var_map[var.get()].emplace_back(std::move(tmp_var));
+          var->ClearGradOps();
+        }
+      }
+    }
+
+    VLOG(3) << "Start to execute grad op " << cur_op->Type();
+    RunOp(cur_op, bwd_ins, tmp_outs, cur_op->place());
+
+    // Step 2: Sum Gradients
+    {
+      platform::RecordEvent record_event("merge_grads");
+      for (auto& var_pair : var_map) {
+        auto* dst_var = var_pair.first;
+        if (dst_var == nullptr) continue;
+        for (auto& src_var : var_pair.second) {
+          VLOG(3) << "Sum gradient of variable " << dst_var->Name()
+                  << " after op " << cur_op->Type();
+          SumGradient(cur_op, std::move(src_var), dst_var);
+        }
+      }
+    }
+
+    // Step 3: Collect ready ops
+    for (auto* preceding_op : cur_op->GradPendingOps()) {
+      PADDLE_ENFORCE_NOT_NULL(preceding_op);
+      auto iter = op_deps_.find(preceding_op);
+      if (iter == op_deps_.end()) {
+        continue;
+      }
+
+      VLOG(3) << "Found preceding op of " << cur_op->Type();
+      // An op is ready to go when its dep count reaches zero
+      if (--(iter->second) == 0) {
+        q.push(preceding_op);
+        VLOG(3) << "Push preceding op " << preceding_op->Type()
+                << " into queue";
+      }
+    }
+
+    // Step 4: Delete op to collect unused variables
+    VLOG(3) << "Remove op after op " << cur_op->Type() << " runs";
+    RemoveOp(cur_op);
+  }
+  VLOG(3) << "Clean properties of BasicEngine";
+  CleanEngine();
+}
 }  // namespace imperative
 }  // namespace paddle
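`PrepareDeps` walks the grad-op graph once to count, for each op, how many successors must run before it; `Execute` then pops ready ops and decrements the counts of their preceding ops. A toy re-implementation of that reference-count scheduling in plain Python (the op-graph dict and op names are hypothetical):

```python
from collections import deque

def run_backward(init_ops, preceding_ops, run):
    # One BFS pass, like BasicEngine::PrepareDeps: count how many times each
    # op appears as a predecessor of an op that will run.
    deps = {}
    visited = set(init_ops)
    queue = deque(init_ops)
    while queue:
        cur = queue.popleft()
        for pre in preceding_ops[cur]:
            deps[pre] = deps.get(pre, 0) + 1
            if pre not in visited:
                visited.add(pre)
                queue.append(pre)
    # Execution pass, like BasicEngine::Execute: an op becomes ready when the
    # last op depending on it has finished.
    ready = deque(init_ops)
    while ready:
        cur = ready.popleft()
        run(cur)
        for pre in preceding_ops[cur]:
            deps[pre] -= 1
            if deps[pre] == 0:
                ready.append(pre)

# loss_grad must run before mul_grad, which must run before fc_grad.
run_backward(
    ['loss_grad'],
    {'loss_grad': ['mul_grad'], 'mul_grad': ['fc_grad'], 'fc_grad': []},
    run=print)
```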
@@ -16,24 +16,80 @@
 #include <cstddef>
 #include <cstdint>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/imperative/backward_strategy.h"
+#include "paddle/fluid/imperative/gradient_accumulator.h"
+#include "paddle/fluid/imperative/layer.h"
 namespace paddle {
 namespace imperative {
-struct Runnable {};
+// It seems there is no need for Engine to be a singleton; we can have
+// multiple engines to run multiple graphs. For future use we may expose
+// an interface to Python to support this.
 class Engine {
  public:
-  virtual ~Engine() {}
-
-  virtual void Enqueue(Runnable* runnable) = 0;
-
-  virtual size_t Size() const = 0;
-
-  virtual void Sync() = 0;
+  virtual ~Engine() = default;
+  virtual void Execute() = 0;
+  virtual void Init(VarBase* var, const detail::BackwardStrategy& strategy) = 0;
+  virtual void RunOp(imperative::OpBase* op, const NameVarBaseMap& ins,
+                     const NameVarBaseMap& outs, const platform::Place& place);
+
+  virtual void RemoveOp(OpBase* op) {
+    PADDLE_ENFORCE_NOT_NULL(op, "Cannot remove null op");
+    auto iter = grad_ops_.find(op);
+    PADDLE_ENFORCE_EQ(iter != grad_ops_.end(), true, "Op is not inside tracer");
+    grad_ops_.erase(iter);
+  }
+
+  void InsertOp(OpBase* op, std::shared_ptr<OpBase> op_shared) {
+    grad_ops_[op] = std::move(op_shared);
+  }
+
+  void Clear() { grad_ops_.clear(); }
+
+ private:
+  std::unordered_map<OpBase*, std::shared_ptr<OpBase>>
+      grad_ops_;  // OpBase for remove - grad_op
 };
-Engine* GetEngine();
+class BasicEngine : public Engine {
+ public:
+  BasicEngine() = default;
+
+  void Init(VarBase* var, const detail::BackwardStrategy& strategy) override;
+
+  ~BasicEngine() override = default;
+
+  void Execute() override;
+
+ private:
+  void PrepareDeps();
+
+  bool CheckBackwardInputs(OpBase* op);
+
+  void PrepareGradAccumulators(OpBase* op);
+
+  void SumGradient(OpBase* op, std::shared_ptr<VarBase> src, VarBase* dst);
+
+  // TODO(jiabin): maybe we can optimize the performance of engine by caching
+  // the result
+  void CleanEngine() {
+    init_ops_.clear();
+    op_deps_.clear();
+    accumulators_.clear();
+    Clear();
+  }
+
+  std::vector<OpBase*> init_ops_;
+  detail::BackwardStrategy backward_strategy_;
+  std::unordered_map<OpBase*, size_t> op_deps_;
+  std::unordered_map<VarBase*, std::unique_ptr<GradientAccumulator>>
+      accumulators_;
+};
 }  // namespace imperative
 }  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace imperative {
template <typename T>
class TensorAddFunctor : public boost::static_visitor<> {
public:
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const platform::CUDAPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
}
private:
int64_t numel_;
const T* x_;
T* y_;
};
void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
auto& src_tensor = src.Get<framework::LoDTensor>();
auto numel = src_tensor.numel();
// FIXME(minqiyang): loss_grad op will pass a zero grad of label
// ugly fix for it
if (numel == 0) {
return;
}
PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true,
"dst_numel %d vs. src_numel %d", dst_tensor->numel(),
numel);
auto data_type = src_tensor.type();
auto place = src_tensor.place();
#define PADDLE_TENSOR_ADD_MACRO(cpp_type) \
if (data_type == framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
boost::apply_visitor(func, place); \
return; \
}
PADDLE_TENSOR_ADD_MACRO(float);
PADDLE_TENSOR_ADD_MACRO(double);
#undef PADDLE_TENSOR_ADD_MACRO
PADDLE_THROW("Not supported data type %s for AddTo",
framework::DataTypeToString(data_type));
}
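`TensorAdd` accumulates `src` into `dst` in place through BLAS AXPY with alpha = 1, skipping empty gradients and rejecting anything but float and double. A numpy sketch of the same contract (the function name is hypothetical):

```python
import numpy as np

def tensor_add(src, dst):
    # Empty grads (e.g. the zero grad of a label from loss_grad) are skipped.
    if src.size == 0:
        return
    assert dst.size == src.size, "dst_numel %d vs. src_numel %d" % (dst.size, src.size)
    assert src.dtype in (np.float32, np.float64), "unsupported dtype %s" % src.dtype
    dst += src  # AXPY: dst = 1.0 * src + dst, in place

grad = np.zeros(3, dtype=np.float32)
tensor_add(np.array([1., 2., 3.], dtype=np.float32), grad)
tensor_add(np.array([1., 1., 1.], dtype=np.float32), grad)
print(grad)  # [2. 3. 4.]
```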
void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
if (cur_cnt_ == 0) {
*dst_var = std::move(*(var->MutableVar()));
} else {
TensorAdd(var->Var(), dst_var);
}
++cur_cnt_;
}
void SortedGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
if (ref_cnt_ == 1) {
*dst_var = std::move(*(var->MutableVar()));
} else {
if (tmp_grad_vars_.empty()) {
tmp_grad_vars_.reserve(ref_cnt_);
}
tmp_grad_vars_.emplace_back(std::move(var), trace_id);
if (tmp_grad_vars_.size() != ref_cnt_) {
return;
}
std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(),
[](const std::pair<std::shared_ptr<VarBase>, size_t>& p1,
const std::pair<std::shared_ptr<VarBase>, size_t>& p2) {
return p1.second > p2.second;
});
*dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) {
TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var);
}
tmp_grad_vars_.clear();
}
}
} // namespace imperative
} // namespace paddle
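The eager accumulator folds each gradient in as it arrives, while the sorted one buffers all `ref_cnt_` contributions and sums them in descending trace-id order, making the summation order deterministic across runs. A pure-Python sketch of the sorted variant (class and names are hypothetical, scalar grads stand in for tensors):

```python
class SortedAccumulator:
    """Toy analogue of SortedGradientAccumulator."""

    def __init__(self, ref_cnt):
        self.ref_cnt = ref_cnt
        self.pending = []        # like tmp_grad_vars_: (grad, trace_id)
        self.value = None

    def add(self, grad, trace_id):
        if self.ref_cnt == 1:
            self.value = grad    # single contribution: move, no sum needed
            return
        self.pending.append((grad, trace_id))
        if len(self.pending) != self.ref_cnt:
            return               # wait until every contribution has arrived
        # Deterministic order: largest trace id first, as in the C++ sort.
        self.pending.sort(key=lambda p: p[1], reverse=True)
        self.value = self.pending[0][0]
        for g, _ in self.pending[1:]:
            self.value += g
        self.pending.clear()

acc = SortedAccumulator(ref_cnt=2)
acc.add(10.0, trace_id=1)
acc.add(5.0, trace_id=7)
print(acc.value)  # 15.0
```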
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
namespace paddle {
namespace imperative {
class GradientAccumulator {
public:
explicit GradientAccumulator(VarBase* var) : var_(var) {}
virtual void Add(std::shared_ptr<VarBase> var, size_t trace_id) = 0;
virtual ~GradientAccumulator() = default;
inline void IncreaseRefCnt() { ++ref_cnt_; }
inline size_t RefCnt() const { return ref_cnt_; }
protected:
VarBase* var_;
size_t ref_cnt_{0};
};
class EagerGradientAccumulator : public GradientAccumulator {
public:
using GradientAccumulator::GradientAccumulator;
void Add(std::shared_ptr<VarBase> var, size_t trace_id) override;
private:
size_t cur_cnt_{0};
};
class SortedGradientAccumulator : public GradientAccumulator {
public:
using GradientAccumulator::GradientAccumulator;
void Add(std::shared_ptr<VarBase> var, size_t trace_id) override;
private:
std::vector<std::pair<std::shared_ptr<VarBase>, size_t>> tmp_grad_vars_;
};
} // namespace imperative
} // namespace paddle
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,27 +13,21 @@
 // limitations under the License.
 #include "paddle/fluid/imperative/layer.h"
 #include <algorithm>
-#include <deque>
-#include <limits>
-#include <map>
-#include <random>
-#include <unordered_set>
+#include <queue>
 #include <utility>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/imperative/prepared_operator.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
-#include "paddle/fluid/string/printf.h"
 namespace paddle {
 namespace imperative {
+using framework::Variable;
 void ThreadSafeNameSet::Insert(const std::string& name) {
   std::lock_guard<std::mutex> guard(mtx_);
   set_.insert(name);
@@ -42,7 +36,7 @@ void ThreadSafeNameSet::Insert(const std::string& name) {
 void ThreadSafeNameSet::Remove(const std::string& name) {
   std::lock_guard<std::mutex> guard(mtx_);
   auto iter = set_.find(name);
-  PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name);
+  PADDLE_ENFORCE_EQ(iter != set_.end(), true, "%s does not exist", name);
   set_.erase(iter);
 }
@@ -55,222 +49,161 @@ ThreadSafeNameSet VarBase::name_set_;
 std::vector<std::string> VarBase::AliveVarNames() { return name_set_.Names(); }
-using framework::Variable;
-
-namespace detail {
-
-template <typename T>
-class TensorAddToFunctor : public boost::static_visitor<> {
- public:
-  TensorAddToFunctor(int64_t numel, const T* x, T* y)
-      : numel_(numel), x_(x), y_(y) {}
-
-  void operator()(const platform::CPUPlace& place) {
-    platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
-        platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
-    blas.AXPY(numel_, 1., x_, y_);
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  void operator()(const platform::CUDAPlace& place) {
-    platform::CUDADeviceContext* ctx =
-        dynamic_cast<platform::CUDADeviceContext*>(
-            platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
-    blas.AXPY(numel_, 1., x_, y_);
-  }
-#else
-  void operator()(const platform::CUDAPlace& place) {
-    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
-  }
-#endif
-
-  // there is NO blas in CUDAPinnedPlace
-  void operator()(const platform::CUDAPinnedPlace& place) {
-    PADDLE_THROW("Do NOT support gradient merge in place %s", place);
-  }
-
- private:
-  int64_t numel_;
-  const T* x_;
-  T* y_;
-};
-
-}  // namespace detail
-
-void AddTo(std::shared_ptr<VarBase> src, std::shared_ptr<VarBase> dst,
-           platform::Place place, GradientRef* grad_ref) {
-  PADDLE_ENFORCE(grad_ref->find(dst.get()) != grad_ref->end(),
-                 "gradient %s are not found in grad_ref", dst->Name());
-  if ((*grad_ref)[dst.get()].second) {
-    PADDLE_ENFORCE(src->IsInitialize(), "Using uninitialized VarBase");
-    dst->var_ = std::move(src->var_);
-    (*grad_ref)[dst.get()].second = false;
-    if (!dst->IsInitialize()) {
-      dst->SetInitialize(true);
-    }
-    return;
-  } else {
-    framework::Tensor* dst_tensor =
-        dst->var_->GetMutable<framework::LoDTensor>();
-    framework::Tensor* src_tensor =
-        src->var_->GetMutable<framework::LoDTensor>();
-
-    // FIXME(minqiyang): loss_grad op will pass a zero grad of label
-    // ugly fix for it
-    if (src_tensor->numel() == 0) {
-      return;
-    }
-
-    PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(),
-                   "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(),
-                   src_tensor->numel());
-
-    detail::TensorAddToFunctor<float> func(
-        src_tensor->numel(), src_tensor->data<float>(),
-        dst_tensor->mutable_data<float>(place));
-    boost::apply_visitor(func, place);
-  }
-}
-
-void ZeroGrads(const std::shared_ptr<imperative::VarBase> vb,
-               const platform::Place& place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-  auto grad_t = vb->var_->GetMutable<framework::LoDTensor>();
-  operators::math::set_constant(*dev_ctx, grad_t, 0.0);
-}
-
-void AddGradBySort(BackwardSumMap* bck_map,
-                   std::shared_ptr<imperative::VarBase> target,
-                   GradientRef* grad_ref) {
-  PADDLE_ENFORCE(bck_map->find(target.get()) != bck_map->end(),
-                 "Can't find %s in backward grad map", target->Name());
-  std::pair<platform::Place,
-            std::vector<std::pair<int, std::shared_ptr<imperative::VarBase>>>>&
-      current = bck_map->at(target.get());
-  std::sort(current.second.begin(), current.second.end(),
-            [](const std::pair<int, std::shared_ptr<imperative::VarBase>>& a,
-               const std::pair<int, std::shared_ptr<imperative::VarBase>>& b) {
-              return a.first > b.first;
-            });
-  for (auto& var_pair : current.second) {
-    VLOG(10) << "add origin_grad: " << target->Name();
-    VLOG(10) << "added grad: " << var_pair.second->Name()
-             << " trace id is: " << var_pair.first;
-    AddTo(var_pair.second, target, current.first, grad_ref);
-    var_pair.second.reset();
-  }
-}
-
-class Autograd {
- public:
-  Autograd() {}
-
-  void RunBackward(VarBase* var, const detail::BackwardStrategy& bck_stratedy) {
-    if (var->IsStopGradient()) {
-      return;
-    }
-    VLOG(2) << "start autograd";
-    BackwardSumMap bck_map;
-    std::deque<OpBase*> ready;
-    ready.push_back(var->PreOp());
-
-    std::map<OpBase*, int> dep_counts =
-        ComputeDepCounts(var->PreOp(), bck_stratedy, &grad_ref);
-
-    while (!ready.empty()) {
-      OpBase* ready_op = ready.front();
-      ready.pop_front();
-      std::vector<VarBasePtrMap> grads_outputs =
-          ready_op->ApplyGrad(&bck_map, &grad_ref, bck_stratedy);
-
-      for (const auto& map : grads_outputs) {
-        for (auto it = map.rbegin(); it != map.rend(); ++it) {
-          const std::vector<std::shared_ptr<VarBase>>& grad_outs = it->second;
-          for (size_t i = 0; i < grad_outs.size(); ++i) {
-            if (!grad_outs[i] || grad_outs[i]->IsStopGradient()) continue;
-            OpBase* pre_op = grad_outs[i]->PreOp();
-            if (!pre_op) continue;
-            dep_counts[pre_op] -= 1;
-            PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
-            bool pre_op_ready = dep_counts[pre_op] == 0;
-            if (pre_op_ready) {
-              ready.push_back(pre_op);
-            }
-          }
-        }
-      }
-
-      ready_op->InvokeBackwardHooks();
-    }
-  }
-
- private:
-  std::map<OpBase*, int> ComputeDepCounts(
-      OpBase* op, const detail::BackwardStrategy& bck_stratedy,
-      GradientRef* grad_ref) {
-    if (bck_stratedy.sorted_sum_gradient_) {
-      PADDLE_ENFORCE_NOT_NULL(grad_ref,
-                              "grad_ref should not be null when "
-                              "using sorted grad backward strategy");
-    }
-    std::map<OpBase*, int> ret;
-
-    std::deque<OpBase*> queue;
-    queue.push_back(op);
-    std::unordered_set<OpBase*> visited;
-    visited.insert(op);
-    while (!queue.empty()) {
-      OpBase* candidate = queue.front();
-      queue.pop_front();
-      for (const auto& map : candidate->grad_output_vars_) {
-        for (const auto& it : map) {
-          for (const auto& vb : it.second) {
-            if (bck_stratedy.sorted_sum_gradient_) {
-              ++(*grad_ref)[vb.get()].first;
-            }
-            // init the state of the grad_
-            (*grad_ref)[vb.get()].second = true;
-          }
-        }
-      }
-      for (auto it : candidate->pre_ops_) {
-        for (OpBase* pre_op : it.second) {
-          if (!pre_op) continue;
-          VLOG(2) << "op dep " << candidate->Type() << " trace id "
-                  << candidate->trace_id_ << " <---- " << it.first << " <---- "
-                  << pre_op->Type() << " trace id " << pre_op->trace_id_;
-          if (visited.find(pre_op) == visited.end()) {
-            visited.insert(pre_op);
-            queue.push_back(pre_op);
-          }
-          ret[pre_op] += 1;
-        }
-      }
-    }
-    return ret;
-  }
-
-  GradientRef grad_ref;
-};
-
-std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
-                                             const bool blocking) const {
-  PADDLE_ENFORCE(var_->IsInitialized(),
-                 "Variable must be initialized when getting numpy tensor");
-
-  // TODO(minqiyang): change this after move unique_name generator to CXX
-  const framework::LoDTensor& self_tensor = var_->Get<framework::LoDTensor>();
-  std::unique_ptr<VarBase> new_var(new VarBase(
-      "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false));
-  framework::LoDTensor* tensor =
-      new_var->var_->GetMutable<framework::LoDTensor>();
-  tensor->set_lod(var_->Get<framework::LoDTensor>().lod());
-
-  const auto& src_tensor = var_->Get<framework::LoDTensor>();
-  framework::TensorCopy(src_tensor, dst_place, tensor);
+static framework::VariableNameMap CreateVarNameMap(
+    const framework::OpInfo& op_info, const std::string& op_type,
+    const NameVarBaseMap& varbase_map, bool is_input) {
+  if (op_info.proto_ == nullptr) {
+    return {};
+  }
+
+  framework::VariableNameMap result;
+
+  for (auto& var :
+       is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) {
+    auto it = varbase_map.find(var.name());
+    if (it == varbase_map.end()) {
+      PADDLE_ENFORCE_EQ(
+          var.dispensable(), true,
+          "Var: %s not dispensable and there are no such var in inputs",
+          var.name());
+      result[var.name()] = {};
+    } else {
+      auto& var_vector = it->second;
+      std::vector<std::string> args;
+      args.reserve(var_vector.size());
+      for (auto& var_base : var_vector) {
+        args.emplace_back(var_base->Name());
+      }
+      result[var.name()] = std::move(args);
+    }
+  }
+  return result;
+}
+
+static framework::RuntimeContext PrepareRuntimeContext(
+    const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
+  framework::VariableValueMap inputs, outputs;
+  for (auto& in_pair : ins) {
+    auto& in_ctx = inputs[in_pair.first];
+    in_ctx.reserve(in_pair.second.size());
+    for (auto& in_var : in_pair.second) {
+      in_ctx.emplace_back(in_var->MutableVar());
+    }
+  }
+
+  for (auto& out_pair : outs) {
+    auto& out_ctx = outputs[out_pair.first];
+    out_ctx.reserve(out_pair.second.size());
+    for (auto& out_var : out_pair.second) {
+      out_ctx.emplace_back(out_var->MutableVar());
+    }
+  }
+  return framework::RuntimeContext(std::move(inputs), std::move(outputs));
+}
+
+static std::string DebugString(
+    const std::string& name,
+    const std::vector<std::shared_ptr<VarBase>>& vars) {
+  std::stringstream ss;
+  ss << name << "{";
+
+  for (size_t i = 0; i < vars.size(); ++i) {
+    if (i > 0) ss << ", ";
+
+    if (vars[i] == nullptr) {
+      ss << "NULL";
+      continue;
+    }
+    ss << vars[i]->Name() << "[";
+    auto& var = vars[i]->Var();
+    if (!var.IsInitialized()) {
+      ss << "NOT_INITED_VAR";
+    } else if (var.IsType<framework::LoDTensor>()) {
+      auto& tensor = var.Get<framework::LoDTensor>();
+      ss << "LoDTensor<";
+      if (tensor.IsInitialized()) {
+        ss << framework::DataTypeToString(tensor.type()) << ", ";
+        ss << tensor.place() << ", ";
+        ss << "(" << tensor.dims() << ")";
+      } else {
+        ss << "NOT_INITED";
+      }
+      ss << ">";
+    } else {
+      ss << "UNRESOLVED_TYPE";
+    }
+    ss << "]";
+  }
+
+  ss << "}";
+  return ss.str();
+}
+
+std::string LayerDebugString(const std::string& op_type,
+                             const NameVarBaseMap& ins,
+                             const NameVarBaseMap& outs) {
+  std::stringstream ss;
+  ss << "Op(" << op_type << "): ";
+
+  ss << "Inputs: ";
+
+  size_t i = 0;
+  for (auto& pair : ins) {
+    if (i > 0) ss << ", ";
+    ss << DebugString(pair.first, pair.second);
+    ++i;
+  }
+
+  ss << ", Outputs: ";
+  i = 0;
+  for (auto& pair : outs) {
+    if (i > 0) ss << ", ";
+    ss << DebugString(pair.first, pair.second);
+    ++i;
+  }
+  return ss.str();
+}
+
+void VarBase::AddGradOps(const std::weak_ptr<OpBase>& op) {
+  if (op.lock() == nullptr) {
+    return;
+  }
+  for (const auto& cur_op : grad_ops_) {
+    if (cur_op.lock() == op.lock()) {
+      return;
+    }
+  }
+  grad_ops_.emplace_back(op);
+}
+
+void VarBase::ClearGradient() {
+  if (grad_var_) {
+    auto* grad_t = grad_var_->var_.GetMutable<framework::LoDTensor>();
+    if (grad_t->IsInitialized()) {
+      auto* dev_ctx =
+          platform::DeviceContextPool::Instance().Get(grad_t->place());
+      operators::math::set_constant(*dev_ctx, grad_t, 0.0);
+    }
+  }
+}
+
+std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
+                                             const bool blocking) const {
+  PADDLE_ENFORCE_EQ(var_.IsInitialized() && var_.IsType<framework::LoDTensor>(),
+                    true,
+                    "Variable must be initialized and type of LoDTensor when "
+                    "getting numpy tensor");
+
+  auto& src_tensor = var_.Get<framework::LoDTensor>();
+
+  // TODO(Jiabin): change this after move unique_name generator to CXX
+  auto new_var = std::make_shared<VarBase>(
+      false, "Itmp" + std::to_string(copied_counter_++));
+
+  auto* dst_tensor = new_var->var_.GetMutable<framework::LoDTensor>();
+  dst_tensor->set_lod(src_tensor.lod());
+  framework::TensorCopy(src_tensor, dst_place, dst_tensor);
   if (blocking) {
     platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
     auto src_place = src_tensor.place();
@@ -285,184 +218,66 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
   return new_var;
 }
-framework::LoDTensor& VarBase::GradValue() {
-  VLOG(3) << "get var grad " << Name();
-  PADDLE_ENFORCE_NOT_NULL(grads_,
-                          "Could not get grad value from no grad variable");
-  return *(grads_->var_->GetMutable<framework::LoDTensor>());
-}
-
-std::vector<VarBasePtrMap> OpBase::ApplyGrad(
-    BackwardSumMap* bck_map, GradientRef* grad_ref,
-    const detail::BackwardStrategy& bck_stratedy) {
-  PADDLE_ENFORCE(!grad_op_descs_.empty(), "%s has no backward implementation",
-                 Type());
-  VLOG(3) << "apply op grad: " << Type();
-  std::vector<VarBasePtrMap> tmp_grad_outputs;
-  const size_t grad_op_count = grad_op_descs_.size();
-
-  tmp_grad_outputs.resize(grad_op_count);
-  for (size_t k = 0; k < grad_op_count; ++k) {
-    framework::OpDesc* grad_op_desc = grad_op_descs_[k];
-    platform::RecordEvent record_event(grad_op_desc->Type());
-    auto& grad_output_variable_map = grad_output_vars_[k];
-    VLOG(3) << "apply grad op " << grad_op_desc->Type();
-
-    // Allocate tmp grad output variable
-    for (const auto& it : grad_output_variable_map) {
-      auto& outputs = tmp_grad_outputs[k][it.first];
-      outputs.reserve(it.second.size());
-      for (const std::shared_ptr<imperative::VarBase>& origin_grad_var_base :
-           it.second) {
-        // Allocate a new variable
-        std::shared_ptr<imperative::VarBase> tmp_grad_var_base(new VarBase(
-            string::Sprintf("%s@IGrad", origin_grad_var_base->Name()),
-            origin_grad_var_base->DataType(), origin_grad_var_base->Dims(),
-            place_, true, false));
-        outputs.emplace_back(std::move(tmp_grad_var_base));
-      }
-    }
-
-    // No need to do compile time infer shape here.
-    // grad_op_desc_->InferShape(*block_);
-    // grad_op_desc->InferVarType(block_);
-
-    std::unique_ptr<framework::OperatorBase> opbase =
-        framework::OpRegistry::CreateOp(*grad_op_desc);
-
-    auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type());
-    if (info.infer_var_type_) {
-      RuntimeInferVarTypeContext infer_var_type_ctx(
-          &grad_input_vars_[k], &tmp_grad_outputs[k], &(opbase->Attrs()));
-      info.infer_var_type_(&infer_var_type_ctx);
-    }
-
-    framework::OperatorWithKernel* op_kernel =
-        dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
-    PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
-
-    // Run grad op
-    framework::VariableValueMap grad_invars_map;
-    framework::VariableValueMap grad_outvars_map;
-
-    for (const auto& it : grad_input_vars_[k]) {
-      auto& grad_invars = grad_invars_map[it.first];
-      grad_invars.reserve(it.second.size());
-      for (const std::shared_ptr<imperative::VarBase>& grad_inp : it.second) {
-        PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr",
-                                grad_op_desc->Type(), grad_inp->Name());
-        if (!grad_inp->IsInitialize()) {
-          grad_inp->InitBuffer();
-          ZeroGrads(grad_inp, place_);
-        }
-        const std::shared_ptr<imperative::VarBase>& const_grad_inp = grad_inp;
-        grad_invars.emplace_back(const_grad_inp->var_.get());
-      }
-    }
-
-    for (const auto& it : tmp_grad_outputs[k]) {
-      auto& grad_outvars = grad_outvars_map[it.first];
-      grad_outvars.reserve(it.second.size());
-      for (const std::shared_ptr<imperative::VarBase>& grad_out : it.second) {
-        PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr",
-                                grad_op_desc->Type(), grad_out->Name());
-        grad_outvars.emplace_back(grad_out->var_.get());
-      }
-    }
-
-    framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map);
-    framework::Scope scope;
-    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
-    p.op.RuntimeInferShape(scope, place_, ctx);
-    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx,
-                                       p.kernel_configs));
-  }
-
-  platform::RecordEvent record_event("merge_grads");
-  // Add tmp grad outputs to original grad vars
-  for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
-    for (const auto& it : grad_output_vars_[k]) {
-      auto& outputs = tmp_grad_outputs[k][it.first];
-      const auto& origin_outputs = it.second;
-      PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
-
-      for (size_t i = 0; i < outputs.size(); ++i) {
-        // track outputs used by sum
-        if (bck_stratedy.sorted_sum_gradient_) {
-          if (bck_map->find(origin_outputs[i].get()) != bck_map->end()) {
-            VLOG(10) << "add sub grad to " << origin_outputs[i]->Name();
-            bck_map->at(origin_outputs[i].get())
-                .second.emplace_back(
-                    std::pair<int, std::shared_ptr<imperative::VarBase>>(
-                        this->trace_id_, std::move(outputs[i])));
-          } else {
-            VLOG(10) << "insert new map for " << origin_outputs[i]->Name();
-            std::pair<platform::Place,
-                      std::vector<
-                          std::pair<int, std::shared_ptr<imperative::VarBase>>>>
-                tmp(place_,
-                    {std::make_pair(this->trace_id_, std::move(outputs[i]))});
-            bck_map->insert(std::make_pair(origin_outputs[i].get(), tmp));
-          }
-
-          PADDLE_ENFORCE(
-              grad_ref->find(origin_outputs[i].get()) != grad_ref->end(),
-              "Can't find %s in grad_reference count map",
-              origin_outputs[i]->Name());
-          PADDLE_ENFORCE(grad_ref->at(origin_outputs[i].get()).first >= 1,
-                         "Backward error when calculate grad reference");
-          if (grad_ref->at(origin_outputs[i].get()).first > 1) {
-            VLOG(10) << "remove ref for " << origin_outputs[i]->Name();
-            grad_ref->at(origin_outputs[i].get()).first--;
-          } else {
-            VLOG(10) << "Add grad for: " << origin_outputs[i]->Name();
-            AddGradBySort(bck_map, origin_outputs[i], grad_ref);
-            grad_ref->at(origin_outputs[i].get()).first--;
-          }
-        } else {
-          VLOG(10) << "AddTo Called with orig_grad is: "
-                   << origin_outputs[i]->name_ << " Grad to be added is "
-                   << outputs[i]->name_;
-          AddTo(outputs[i], origin_outputs[i], place_, grad_ref);
-          outputs[i].reset();
-        }
-      }
-    }
-  }
-
-  return grad_output_vars_;
-}
-
-void OpBase::InvokeBackwardHooks() {
-  VLOG(3) << "call backward hooks, hooks num: " << backward_hooks_.size();
-
-  // call backward hooks
-  for (py::object& callable : backward_hooks_) {
-    callable(this);
-  }
-}
-
-void OpBase::RegisterBackwardHooks(const py::object& callable) {
-  VLOG(3) << "Register backward hooks " << trace_id_;
-
-  // TODO(minqiyang): check the callable format
-  backward_hooks_.push_back(callable);
-}
-
-void VarBase::RunBackward(const detail::BackwardStrategy& bck_stratedy) {
-  if (!pre_op_) return;
-  platform::RecordEvent record_event("Imperative Backward");
-  VLOG(3) << "start backward";
-  grads_->InitBuffer();
-  auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
-  operators::math::set_constant(
-      *(platform::DeviceContextPool::Instance().Get(
-          var_->GetMutable<framework::LoDTensor>()->place())),
-      grads_t, 1.0);
-
-  Autograd().RunBackward(this, bck_stratedy);
-}
+// create OpBase from optype
+OpBase::OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins,
+               const NameVarBaseMap& outs, framework::AttributeMap attrs,
+               const platform::Place& place)
+    : id_(id), place_(place) {
+  const auto& info = framework::OpInfoMap::Instance().Get(type);
+
+  // Step 1: Run forward
+  if (info.Checker() != nullptr) {
+    info.Checker()->Check(&attrs);
+  }
+
+  auto input_name_map = CreateVarNameMap(info, type, ins, true);
+  auto output_name_map = CreateVarNameMap(info, type, outs, false);
+  op_ = framework::OpRegistry::CreateOp(type, std::move(input_name_map),
+                                        std::move(output_name_map),
+                                        std::move(attrs));
+  VLOG(3) << "Construct Op: " << type << std::endl;
+}
+
+// create OpBase from opdesc
+OpBase::OpBase(size_t id, const framework::OpDesc& op_desc,
+               const platform::Place& place)
+    : id_(id), op_(framework::OpRegistry::CreateOp(op_desc)), place_(place) {
+  VLOG(3) << "Construct Op: " << op_desc.Type() << std::endl;
+}
+
+void OpBase::Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
+  auto* op_kernel = dynamic_cast<framework::OperatorWithKernel*>(op_.get());
+  PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+  auto& info = op_->Info();
+  if (info.infer_var_type_) {
+    RuntimeInferVarTypeContext infer_var_type_ctx(ins, &outs, op_->Attrs());
+    info.infer_var_type_(&infer_var_type_ctx);
+  }
+
+  // Initialize output var type
+  for (auto& var_pair : outs) {
+    for (auto& var : var_pair.second) {
+      InitializeVariable(var->MutableVar(), var->Type());
+    }
+  }
+
+  VLOG(3) << "Running Op " << Type();
+  VLOG(5) << LayerDebugString(Type(), ins, outs);
+  auto runtime_ctx = PrepareRuntimeContext(ins, outs);
+  auto runtime_place = PreparedOp::GetExpectedPlace(place(), ins);
+  auto prepared_op =
+      PreparedOp::Prepare(runtime_ctx, *op_kernel, runtime_place);
+  prepared_op.Run();
+  VLOG(4) << LayerDebugString(Type(), ins, outs);
+}
+
+void OpBase::ClearBackwardTrace() {
+  grad_pending_ops_.clear();
+  ins_.clear();
+  outs_.clear();
+}
 }  // namespace imperative
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -13,8 +13,10 @@ ...@@ -13,8 +13,10 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <algorithm>
#include <atomic>
#include <cstdint> #include <cstdint>
#include <list>
#include <map> // NOLINT #include <map> // NOLINT
#include <memory> // NOLINT #include <memory> // NOLINT
#include <mutex> // NOLINT #include <mutex> // NOLINT
...@@ -22,95 +24,19 @@ ...@@ -22,95 +24,19 @@
#include <string> // NOLINT #include <string> // NOLINT
#include <unordered_map> // NOLINT #include <unordered_map> // NOLINT
#include <utility> #include <utility>
#include <vector> // NOLINT #include <vector>
// clang-format off
#include "paddle/fluid/framework/python_headers.h"
// clang-format on
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/imperative/flags.h" #include "paddle/fluid/imperative/flags.h"
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
class VarBase;
namespace py = ::pybind11;
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs)
: op(op),
ctx(ctx),
func(func),
dev_ctx(dev_ctx),
kernel_configs(kernel_configs) {}
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key =
op.GetExpectedKernelType(framework::ExecutionContext(
op, framework::Scope(), *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
expected_kernel_key.library_type_ = framework::LibraryType::kPlain;
expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
std::vector<framework::KernelConfig>* kernel_configs =
op.GetKernelConfig(expected_kernel_key);
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
}
inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
const framework::OperatorBase& op;
const framework::RuntimeContext& ctx;
framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx;
std::vector<framework::KernelConfig>* kernel_configs;
};
class OpBase; class OpBase;
class ThreadSafeNameSet { class ThreadSafeNameSet {
...@@ -126,290 +52,117 @@ class ThreadSafeNameSet { ...@@ -126,290 +52,117 @@ class ThreadSafeNameSet {
mutable std::mutex mtx_; mutable std::mutex mtx_;
}; };
/* The wrapper for Variable which holds a Variable and a VarBase of its
* gradient. This object should be managed totally by Python intepreter.
*
* Nearly all interface should be implemented in C++.
*/
class VarBase { class VarBase {
DISABLE_COPY_AND_ASSIGN(VarBase);
public: public:
static std::vector<std::string> AliveVarNames(); static std::vector<std::string> AliveVarNames();
explicit VarBase(bool has_grad, const std::string& name)
// Internal interface, create VarBase from exist variable
VarBase(const std::string& name, std::unique_ptr<framework::Variable> var,
VarBase* grad, bool stop_gradient)
: VarBase(name, var->Get<framework::LoDTensor>().type(),
var->Get<framework::LoDTensor>().dims(),
var->Get<framework::LoDTensor>().place(), nullptr, grad,
stop_gradient, false, true) {
var_ = std::move(var);
}
// Python interface
VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
const std::vector<int64_t>& shape, const platform::Place& place,
bool stop_gradient, bool persistable)
: VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient,
persistable) {}
// Internal interface, create VarBase from with ddim
VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
const framework::DDim& shape, const platform::Place& place,
bool stop_gradient, bool persistable)
: VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient,
persistable, true) {}
// Grad used constructor
VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
const std::vector<int64_t>& shape, const platform::Place& place,
bool stop_gradient, bool persistable, bool need_initialize)
: VarBase(name, dtype, framework::make_ddim(shape), place, nullptr,
nullptr, stop_gradient, persistable, need_initialize) {}
private:
// TODO(minqiyang): need support SelectedRows
VarBase(const std::string& name, framework::proto::VarType::Type dtype,
const framework::DDim& shape, const platform::Place& place,
std::unique_ptr<framework::Variable> var, VarBase* grad,
bool stop_gradient, bool persistable, bool need_initialize)
: name_(name), : name_(name),
type_(framework::proto::VarType::LOD_TENSOR), grad_var_(has_grad ? new VarBase(false, GradVarName()) : nullptr) {
place_(place),
var_(std::move(var)),
grads_(grad),
dtype_(dtype),
stop_gradient_(stop_gradient),
persistable_(persistable),
pre_op_(nullptr),
pre_op_out_name_(),
pre_op_out_idx_(-1) {
if (!var_) {
var_.reset(new framework::Variable());
}
auto tensor = var_->GetMutable<framework::LoDTensor>();
tensor->Resize(shape);
if (need_initialize) {
tensor->mutable_data(place, dtype);
is_initialized_ = true;
VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype
<< " place: " << place;
} else {
is_initialized_ = false;
VLOG(8) << "not initialized varbase: " << name_;
}
VLOG(8) << "create varbase: " << name_ << " type: " << dtype
<< " place: " << place << "Stop gradient: " << stop_gradient_;
if (IsDebugEnabled()) { if (IsDebugEnabled()) {
VLOG(10) << "Construct VarBase: " << name;
name_set_.Insert(name_); name_set_.Insert(name_);
} }
} }
public: explicit VarBase(const std::string& name) : VarBase(true, name) {}
virtual ~VarBase() {
pre_op_ = nullptr; ~VarBase() {
pre_op_out_idx_ = -1; VLOG(10) << "Destruct VarBase: " << name_;
VLOG(8) << "destruct varbase: " << name_;
if (IsDebugEnabled()) { if (IsDebugEnabled()) {
name_set_.Remove(name_); name_set_.Remove(name_);
} }
} }
inline void SetName(const std::string& name) { name_ = name; } const framework::Variable& Var() const { return var_; }
inline std::string Name() const { return name_; }
inline bool IsInitialize() const { return is_initialized_; }
inline void SetInitialize(bool inited) { is_initialized_ = inited; }
inline std::vector<int64_t> Shape() const {
if (var_->IsInitialized()) {
return framework::vectorize(var_->Get<framework::LoDTensor>().dims());
} else {
return {};
}
}
inline framework::DDim Dims() const { framework::Variable* MutableVar() { return &var_; }
return var_->Get<framework::LoDTensor>().dims();
}
// data type. e.g.. FP32 bool HasGradVar() const { return grad_var_ != nullptr; }
inline void SetDataType(framework::proto::VarType::Type type) {
auto tensor = var_->GetMutable<framework::LoDTensor>();
tensor->mutable_data(tensor->place(), type);
}
inline framework::proto::VarType::Type DataType() const { return dtype_; }
// tensor type. e.g.. LoDTensor const std::shared_ptr<VarBase>& GradVarBase() const { return grad_var_; }
inline void SetType(framework::proto::VarType::Type type) { type_ = type; }
  const framework::Variable& GradVar() const {
    PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_);
    return grad_var_->var_;
  }

  framework::Variable* MutableGradVar() {
    PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_);
    return &(grad_var_->var_);
  }

  void SetStopGradient(bool stop_gradient) {
    stop_gradient_ = stop_gradient;
    if (grad_var_) {
      grad_var_->stop_gradient_ = stop_gradient;
    }
  }

  bool StopGradient() const { return stop_gradient_; }

  void SetPersistable(bool persistable) { persistable_ = persistable; }

  bool Persistable() const { return persistable_; }

  void AddGradOps(const std::weak_ptr<OpBase>& op);

  std::vector<OpBase*> GradOps() {
    std::vector<OpBase*> rlt;
    // TODO(jiabin): use a better data structure to remove nullptr when we
    // find it
    for (const auto& wk_ptr : grad_ops_) {
      OpBase* tmp_op = wk_ptr.lock().get();
      if (tmp_op) rlt.emplace_back(tmp_op);
    }
    return rlt;
  }

  void ClearGradOps() { grad_ops_.clear(); }

  const std::string& Name() const { return name_; }

  void SetName(const std::string& name) {
    name_ = name;
    if (grad_var_) {
      grad_var_->SetName(GradVarName());
    }
  }

  std::string GradVarName() { return framework::GradVarName(name_); }

  void SetType(framework::proto::VarType::Type type) { type_ = type; }

  framework::proto::VarType::Type Type() const { return type_; }

  void SetDataType(framework::proto::VarType::Type data_type) {
    data_type_ = data_type;
    if (grad_var_) {
      grad_var_->SetDataType(data_type_);
    }
  }

  framework::proto::VarType::Type DataType() const { return data_type_; }

  void ClearGradient();

  std::shared_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
                                      const bool blocking) const;

 private:
  framework::Variable var_;
  std::string name_;
  std::shared_ptr<VarBase> grad_var_;
  mutable size_t copied_counter_ = 0;

  // grad_ops_ records which grad ops will use this var as an input
  std::vector<std::weak_ptr<OpBase>> grad_ops_;

  bool stop_gradient_{false};
  bool persistable_{false};

  framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR};
  framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32};

  // A private flag to check memory leaks
  static ThreadSafeNameSet name_set_;
};
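The setters above deliberately cascade into the paired gradient VarBase. A minimal sketch of that behavior (illustrative only, not part of the patch; the two-argument VarBase constructor is the one exercised by the unit tests later in this commit):

// Sketch: how VarBase's setters propagate to its grad var.
// Assumes the test-style constructor VarBase(bool has_grad, std::string name).
std::shared_ptr<imperative::VarBase> x(new imperative::VarBase(true, "x"));
x->SetName("weight");      // grad var is renamed to "weight@GRAD"
x->SetDataType(framework::proto::VarType::FP64);  // grad dtype follows
x->SetStopGradient(true);  // stop_gradient_ propagates to the grad var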
class Layer {
...@@ -418,18 +171,16 @@ class Layer {
  virtual std::vector<std::shared_ptr<VarBase>> Forward(
      const std::vector<std::shared_ptr<VarBase>>& inputs) {
    return {};
  }
};
// infer var type context for imperative mode
class RuntimeInferVarTypeContext : public framework::InferVarTypeContext {
 public:
  RuntimeInferVarTypeContext(const NameVarBaseMap& inputs,
                             const NameVarBaseMap* outputs,
                             const framework::AttributeMap& attrs_map)
      : InferVarTypeContext(nullptr, nullptr),
        inputs_(inputs),
        outputs_(outputs),
...@@ -437,19 +188,19 @@ class RuntimeInferVarTypeContext
        input_names_(),
        output_names_(),
        var_set_() {
    input_names_.reserve(inputs_.size());
    for (auto& it : inputs_) {
      for (auto& var : it.second) {
        input_names_[it.first].emplace_back(var->Name());
        var_set_[var->Name()] = var.get();
      }
    }

    output_names_.reserve(outputs_->size());
    for (auto& it : *outputs_) {
      for (auto& var : it.second) {
        output_names_[it.first].emplace_back(var->Name());
        var_set_[var->Name()] = var.get();
      }
    }
  }
...@@ -457,8 +208,10 @@ class RuntimeInferVarTypeContext
  virtual ~RuntimeInferVarTypeContext() {}

  framework::Attribute GetAttr(const std::string& name) const override {
    auto iter = attrs_.find(name);
    PADDLE_ENFORCE_EQ(iter != attrs_.end(), true, "Cannot find attribute %s",
                      name);
    return iter->second;
  }

  bool HasVar(const std::string& name) const override {
...@@ -466,8 +219,7 @@ class RuntimeInferVarTypeContext
  }

  bool HasInput(const std::string& name) const override {
    return inputs_.count(name) > 0;
  }

  bool HasOutput(const std::string& name) const override {
...@@ -477,17 +229,26 @@ class RuntimeInferVarTypeContext
  }

  const std::vector<std::string>& Input(
      const std::string& name) const override {
    auto iter = input_names_.find(name);
    PADDLE_ENFORCE_EQ(iter != input_names_.end(), true, "Cannot find input %s",
                      name);
    return iter->second;
  }

  const std::vector<std::string>& Output(
      const std::string& name) const override {
    auto iter = output_names_.find(name);
    PADDLE_ENFORCE_EQ(iter != output_names_.end(), true,
                      "Cannot find output %s", name);
    return iter->second;
  }

  framework::proto::VarType::Type GetType(
      const std::string& name) const override {
    auto iter = var_set_.find(name);
    PADDLE_ENFORCE_EQ(iter != var_set_.end(), true,
                      "Cannot find var %s in GetType", name);
    return iter->second->Type();
  }

  void SetType(const std::string& name,
...@@ -501,7 +262,10 @@ class RuntimeInferVarTypeContext
  framework::proto::VarType::Type GetDataType(
      const std::string& name) const override {
    auto iter = var_set_.find(name);
    PADDLE_ENFORCE_EQ(iter != var_set_.end(), true,
                      "Cannot find var %s in GetDataType", name);
    return iter->second->DataType();
  }

  void SetDataType(const std::string& name,
...@@ -538,13 +302,97 @@ class RuntimeInferVarTypeContext
  }

 private:
  const NameVarBaseMap& inputs_;
  const NameVarBaseMap* outputs_;
  const framework::AttributeMap& attrs_;
  std::unordered_map<std::string, std::vector<std::string>> input_names_;
  std::unordered_map<std::string, std::vector<std::string>> output_names_;
  std::unordered_map<std::string, VarBase*> var_set_;
};
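Construction of this context mirrors the call in test_layer.cc further down: inputs and attrs are held by reference, outputs by pointer, and every VarBase is indexed by name so the InferVarType hooks can resolve slot names as well as variable names. A short sketch (vin and vout stand for already-created shared_ptr<VarBase> values, as in that test):

imperative::NameVarBaseMap ins = {{"X", {vin}}};
imperative::NameVarBaseMap outs = {{"Out", {vout}}};
framework::AttributeMap attrs;
imperative::RuntimeInferVarTypeContext ctx(ins, &outs, attrs);
bool has_slot = ctx.HasInput("X");  // slot lookup goes through inputs_
bool has_var = ctx.HasVar("vin");   // name lookup goes through var_set_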
// TODO(zjl): to support py_func layer
class OpBase : public std::enable_shared_from_this<OpBase> {
DISABLE_COPY_AND_ASSIGN(OpBase);
public:
~OpBase() { VLOG(3) << "Destruct Op: " << Type() << std::endl; }
  // Developers should not create OpBase directly; an OpBase should be
  // created by, and its lifetime fully managed by, the Tracer.
template <typename... Args>
static std::shared_ptr<OpBase> Create(Args&&... args) {
return std::shared_ptr<OpBase>(new OpBase(std::forward<Args>(args)...));
}
size_t id() const { return id_; }
const std::string& Type() const { return op_->Type(); }
void Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs);
const framework::VariableNameMap& InputNameMap() const {
return op_->Inputs();
}
const framework::VariableNameMap& OutputNameMap() const {
return op_->Outputs();
}
const framework::AttributeMap& Attrs() const { return op_->Attrs(); }
const framework::OpInfo& Info() const { return op_->Info(); }
void ClearBackwardTrace();
const std::vector<OpBase*>& GradPendingOps() const {
return grad_pending_ops_;
}
void InsertGradPendingOps(OpBase* op) { grad_pending_ops_.emplace_back(op); }
void SortGradPendingOps() {
std::sort(grad_pending_ops_.begin(), grad_pending_ops_.end(),
[](OpBase* op1, OpBase* op2) { return op1->id() > op2->id(); });
}
NameVarBaseMap* GetMutableOutsMap() { return &outs_; }
NameVarBaseMap* GetMutableInsMap() { return &ins_; }
const NameVarBaseMap& GetInsMap() { return ins_; }
const NameVarBaseMap& GetOutsMap() { return outs_; }
const platform::Place& place() const { return place_; }
// TODO(jiabin) prepare for backward hook
void RegisterBackwardHooks(const std::function<void()>& func) {
backward_hooks_.emplace_back(func);
}
void InvokeBackwardHooks() {
for (const auto& func : backward_hooks_) {
func();
VLOG(5) << "Invoke Backward Hook for: " << Type() << std::endl;
}
}
private:
OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place);
OpBase(size_t id, const framework::OpDesc& op_desc,
const platform::Place& place);
size_t id_;
std::unique_ptr<framework::OperatorBase> op_;
std::vector<std::function<void()>> backward_hooks_;
platform::Place place_;
  // These need not be std::weak_ptr, because an op is bound to the Tracer
  // that created it and will not be used by any other Tracer.
  std::vector<OpBase*> grad_pending_ops_;
std::vector<OpBase*> grad_pending_ops_;
// This part is only used for backward
NameVarBaseMap ins_;
NameVarBaseMap outs_;
};
}  // namespace imperative
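A sketch of the OpBase lifecycle, using the same calls as test_layer.cc's test_clear_backward_info below (ins, outs and place as set up there): Create is the only way to obtain an OpBase, Run executes the wrapped OperatorBase, and ClearBackwardTrace releases the backward bookkeeping once gradients have been consumed.

framework::AttributeMap attrs;
attrs["axis"] = 1;
std::shared_ptr<imperative::OpBase> op(
    imperative::OpBase::Create(0, "mul", ins, outs, attrs, place));
op->Run(ins, outs);        // forward execution through the wrapped op_
op->ClearBackwardTrace();  // drops ins_, outs_ and grad_pending_ops_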
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/prepared_operator.h"
#include <sstream>
namespace paddle {
namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
if (var.IsType<framework::LoDTensor>()) {
return &(var.Get<framework::LoDTensor>());
} else if (var.IsType<framework::SelectedRows>()) {
return &(var.Get<framework::SelectedRows>().value());
} else {
return nullptr;
}
}
platform::Place PreparedOp::GetExpectedPlace(const platform::Place& place,
const NameVarBaseMap& ins) {
bool found = false;
for (auto& name_pair : ins) {
for (auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var());
if (tensor && tensor->IsInitialized()) {
auto tmp_place = tensor->place();
PADDLE_ENFORCE_EQ(!found || tmp_place == place, true,
"Input variable should keep in the same place: %s, "
"but get place: %s of input %s instead",
place, tmp_place, name_pair.first);
}
}
}
return place;
}
PreparedOp::PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs)
: op_(op),
ctx_(ctx),
func_(std::move(func)),
dev_ctx_(dev_ctx),
kernel_configs_(kernel_configs) {}
PreparedOp PreparedOp::Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
auto& kernels = kernels_iter->second;
auto expected_kernel_key =
op.GetExpectedKernelType(framework::ExecutionContext(
op, framework::Scope(), *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
std::vector<framework::KernelConfig>* kernel_configs =
op.GetKernelConfig(expected_kernel_key);
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
}
void PreparedOp::Run() {
// TODO(zjl): remove scope in dygraph
framework::Scope scope;
op_.RuntimeInferShape(scope, dev_ctx_->GetPlace(), ctx_);
func_(framework::ExecutionContext(op_, scope, *dev_ctx_, ctx_,
kernel_configs_));
}
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
class PreparedOp {
public:
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place);
inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx_; }
void Run();
static platform::Place GetExpectedPlace(const platform::Place& place,
const NameVarBaseMap& ins);
private:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs);
private:
const framework::OperatorBase& op_;
const framework::RuntimeContext& ctx_;
framework::OperatorWithKernel::OpKernelFunc func_;
platform::DeviceContext* dev_ctx_;
std::vector<framework::KernelConfig>* kernel_configs_;
};
} // namespace imperative
} // namespace paddle
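Putting the two prepared_operator files together: kernel selection happens once in Prepare, and Run only launches the selected kernel. A sketch under the setup used by test_prepare_op.cc below (op is a framework::OperatorWithKernel and PrepareRuntimeContext is a helper defined in that test):

framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
auto place =
    imperative::PreparedOp::GetExpectedPlace(platform::CPUPlace(), ins);
auto prepared = imperative::PreparedOp::Prepare(ctx, op, place);
prepared.Run();  // RuntimeInferShape + kernel call with a throwaway scope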
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS gradient_accumulator memcpy)
cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op)
cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split)
cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/memory/memcpy.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
void TensorAdd(const framework::Variable& src, framework::Variable* dst);
#if defined(PADDLE_WITH_CUDA)
template <typename T>
int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0);
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size(), 0);
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
framework::TensorCopySync(*dst, rlt_place, &rlt);
for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
#endif
template <typename T>
int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size());
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
framework::TensorCopySync(*dst, rlt_place, &rlt);
for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
TEST(test_add_functor, add_functor) {
#if defined(PADDLE_WITH_CUDA)
platform::CUDAPlace gpu_place(0);
#endif
platform::CPUPlace cpu_place;
int cpu_res = 1;
cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0);
EXPECT_EQ(cpu_res, 0);
cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(cpu_res, 0);
#if defined(PADDLE_WITH_CUDA)
int gpu_res = 1;
gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0);
EXPECT_EQ(gpu_res, 0);
gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(gpu_res, 0);
#endif
}
} // namespace imperative
} // namespace paddle
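The contract these tests pin down: TensorAdd accumulates element-wise into its second argument, in place, on whatever device the destination tensor lives. In sketch form (src_var and dst_var are framework::Variables holding same-shaped LoDTensors):

imperative::TensorAdd(src_var, &dst_var);  // dst[i] += src[i] for all i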
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-16.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/imperative/layer.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_layer, test_runtime_context) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
var_pair in_pair = var_pair("X", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {in_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap attrs;
auto* ctx = new imperative::RuntimeInferVarTypeContext(ins, &outs, attrs);
ASSERT_TRUE(ctx->HasVar("vin"));
ASSERT_TRUE(ctx->HasInput("X"));
ASSERT_TRUE(ctx->HasOutput("Out"));
ASSERT_ANY_THROW(ctx->GetDataTypes("vin"));
std::vector<framework::proto::VarType::Type> NullType;
ASSERT_ANY_THROW(ctx->SetDataTypes("vin", NullType));
ASSERT_ANY_THROW(ctx->GetShape("vin"));
ASSERT_ANY_THROW(ctx->GetLoDLevel("vin"));
ASSERT_ANY_THROW(ctx->SetLoDLevel("vin", 2));
}
std::string LayerDebugString(const std::string& op_type,
const NameVarBaseMap& ins,
const NameVarBaseMap& outs);
TEST(test_layer, test_debug_string_test_debug_Test) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vin_error(
new imperative::VarBase(false, "vin_error"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
std::shared_ptr<imperative::VarBase> vout_error(
new imperative::VarBase(false, "vout_error"));
vin_error->MutableVar()->GetMutable<framework::LoDTensor>();
vout->MutableVar()->GetMutable<framework::LoDTensor>();
vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
var_pair in_pair = var_pair("X", vb_vector(1, vin));
vb_vector vb_in_error = {vin_error, nullptr};
var_pair vin_error_pair = var_pair("X", vb_in_error);
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
var_pair vout_error_pair = var_pair("Out2", vb_vector(1, vout_error));
imperative::NameVarBaseMap ins = {in_pair};
imperative::NameVarBaseMap ins_error = {vin_error_pair};
imperative::NameVarBaseMap outs = {out_pair};
imperative::NameVarBaseMap outs_error = {vout_error_pair};
ASSERT_NO_FATAL_FAILURE(LayerDebugString("test_op", ins, outs));
std::string res = LayerDebugString("test_op", ins, outs_error);
ASSERT_TRUE(res.find("UNRESOLVED_TYPE") != std::string::npos);
std::string res2 = LayerDebugString("test_op", ins_error, outs_error);
VLOG(3) << res2;
ASSERT_TRUE(res2.find("NOT_INITED") != std::string::npos);
ASSERT_TRUE(res2.find("NULL") != std::string::npos);
}
TEST(test_layer, test_clear_backward_info) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
framework::OpDesc desc;
platform::CPUPlace place;
var_pair x_pair = var_pair("X", vb_vector(1, vin));
var_pair y_pair = var_pair("Y", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap concat_att_map;
concat_att_map["axis"] = 1;
std::shared_ptr<imperative::OpBase> op(
OpBase::Create(0, "mul", ins, outs, concat_att_map, place));
std::shared_ptr<imperative::OpBase> preceding_op(
OpBase::Create(0, "mul", ins, outs, concat_att_map, place));
op->InsertGradPendingOps(preceding_op.get());
*(op->GetMutableInsMap()) = ins;
*(op->GetMutableOutsMap()) = outs;
ASSERT_GT(op->GetInsMap().size(), 0);
ASSERT_GT(op->GetOutsMap().size(), 0);
ASSERT_GT(op->GradPendingOps().size(), 0);
op->ClearBackwardTrace();
ASSERT_EQ(op->GetInsMap().size(), 0);
ASSERT_EQ(op->GetOutsMap().size(), 0);
ASSERT_EQ(op->GradPendingOps().size(), 0);
}
TEST(test_layer, test_varbase_basic) {
platform::CPUPlace place;
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
place);
std::shared_ptr<imperative::VarBase> vout(vin->NewVarBase(place, false));
ASSERT_EQ(vout->Name(), "Itmp0");
std::shared_ptr<imperative::VarBase> vin_with_grad(
new imperative::VarBase(true, "vin"));
ASSERT_ANY_THROW(vin->MutableGradVar());
ASSERT_NO_THROW(ASSERT_TRUE(dynamic_cast<framework::Variable*>(
vin_with_grad->MutableGradVar()) != 0));
ASSERT_TRUE(
dynamic_cast<framework::Variable*>(vin_with_grad->MutableGradVar()) != 0);
vin_with_grad->SetStopGradient(true);
ASSERT_TRUE(vin_with_grad->StopGradient());
ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetPersistable(true));
  ASSERT_TRUE(vin_with_grad->Persistable());
ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetName("new_name"));
ASSERT_EQ(vin_with_grad->Name(), "new_name");
}
// TODO(jiabin): Add more ut here for layer
} // namespace imperative
} // namespace paddle
USE_OP(mul);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-19.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/imperative/prepared_operator.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
static framework::RuntimeContext PrepareRuntimeContext(
const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
framework::VariableValueMap inputs, outputs;
for (auto& in_pair : ins) {
auto& in_ctx = inputs[in_pair.first];
in_ctx.reserve(in_pair.second.size());
for (auto& in_var : in_pair.second) {
in_ctx.emplace_back(in_var->MutableVar());
}
}
for (auto& out_pair : outs) {
auto& out_ctx = outputs[out_pair.first];
out_ctx.reserve(out_pair.second.size());
for (auto& out_var : out_pair.second) {
out_ctx.emplace_back(out_var->MutableVar());
}
}
return framework::RuntimeContext(std::move(inputs), std::move(outputs));
}
static framework::VariableNameMap CreateVarNameMap(
const framework::OpInfo& op_info, const std::string& op_type,
const NameVarBaseMap& varbase_map, bool is_input) {
if (op_info.proto_ == nullptr) {
return {};
}
framework::VariableNameMap result;
for (auto& var :
is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) {
auto it = varbase_map.find(var.name());
if (it == varbase_map.end()) {
      PADDLE_ENFORCE_EQ(
          var.dispensable(), true,
          "Var %s is not dispensable and there is no such var in the inputs",
          var.name());
result[var.name()] = {};
} else {
auto& var_vector = it->second;
std::vector<std::string> args;
args.reserve(var_vector.size());
for (auto& var_base : var_vector) {
args.emplace_back(var_base->Name());
}
result[var.name()] = std::move(args);
}
}
return result;
}
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_prepare_op, test_prepare_op) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
framework::OpDesc desc;
platform::CPUPlace place;
vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
place);
var_pair x_pair = var_pair("X", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap split_attr_map;
const auto& info = framework::OpInfoMap::Instance().Get("split");
framework::VariableNameMap var_in_map =
CreateVarNameMap(info, "split", ins, true);
framework::VariableNameMap var_out_map =
CreateVarNameMap(info, "split", outs, false);
framework::OperatorWithKernel op("split", var_in_map, var_out_map,
split_attr_map);
framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp =
PreparedOp::Prepare(ctx, op, place));
}
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
TEST(test_prepare_op, test_get_tensor_from_var) {
std::shared_ptr<imperative::VarBase> vout_error(
new imperative::VarBase(false, "vout_error"));
vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
auto* ts = GetTensorFromVar(*vout_error->MutableVar());
ASSERT_TRUE(ts != nullptr);
}
} // namespace imperative
} // namespace paddle
USE_OP(split);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-16.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/imperative/tracer.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_tracer, test_trace_op) {
  // Run a mul op
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(true, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(true, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true);
  const auto& out_tensor = vout->Var().Get<framework::LoDTensor>();
  for (int64_t i = 0; i < out_tensor.numel(); i++) {
    ASSERT_EQ(out_tensor.data<float>()[i], 20.0);
  }
}
TEST(test_tracer, test_track_backward_output) {
  // Run a mul op
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(false, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(true, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true));
}
TEST(test_tracer, test_track_backward_input) {
  // Run a mul op
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(true, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true));
}
} // namespace imperative
} // namespace paddle
USE_OP(mul);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
...@@ -11,282 +11,207 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/tracer.h"
#include <unordered_set>
#include <utility>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace imperative {
static std::vector<std::unique_ptr<framework::OpDesc>> CreateGradOpDescs(
    const framework::OpInfo& op_info, const framework::OpDesc& op_desc,
    const std::unordered_set<std::string>& no_grad_set,
    const std::vector<framework::BlockDesc*>& grad_sub_block,
    std::unordered_map<std::string, std::string>* grad_to_var) {
  if (op_info.grad_op_maker_) {
    return op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var,
                                  grad_sub_block);
  } else {
    return {};
  }
}

void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
                     const NameVarBaseMap& outs, framework::AttributeMap attrs,
                     const platform::Place& place, bool trace_backward) {
  platform::RecordEvent event(type);
  VLOG(1) << "Trace Op: " << type;
  size_t op_id = GenerateUniqueId();
  auto op = OpBase::Create(op_id, type, ins, outs, std::move(attrs), place);
  op->Run(ins, outs);

  if (ComputeRequiredGrad(ins, outs, trace_backward)) {
    TraceBackward(op, framework::OpDesc(op->Type(), op->InputNameMap(),
                                        op->OutputNameMap(), op->Attrs()),
                  ins, outs);
  }
}

bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins,
                                 const NameVarBaseMap& outs,
                                 bool trace_backward) {
  // TODO(jiabin): Implement auto prune here
  return trace_backward;
}

void Tracer::TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
                           const framework::OpDesc& fwd_op_desc,
                           const NameVarBaseMap& ins,
                           const NameVarBaseMap& outs) {
  // grad_to_var is a map of framework::GradVarName(in_var_name/out_var_name)
  // -> in_var_name/out_var_name
  std::unordered_map<std::string, std::string> grad_to_var;

  // Get the grad op descs using fwd_op_desc
  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs_ =
      CreateGradOpDescs(fwd_op->Info(), fwd_op_desc, {}, {}, &grad_to_var);

  // Create grad ops using the grad op descs
  size_t grad_op_num = grad_op_descs_.size();

  VLOG(3) << "Create " << grad_op_num << " grad op desc(s) for op "
          << fwd_op->Type();

  if (grad_op_num == 0) {
    return;
  }

  // Build a map of var_name -> std::shared_ptr<VarBase>*,
  // so that we can find the matching var for the grad op descs
  std::unordered_map<std::string, const std::shared_ptr<VarBase>*> name_to_var;
  for (auto& pair : ins) {
    for (auto& var : pair.second) {
      auto& var_ptr = name_to_var[var->Name()];
      PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(),
                        true,
                        "There are different variables with same name %s",
                        var->Name());
      var_ptr = &var;
    }
  }

  for (auto& pair : outs) {
    for (auto& var : pair.second) {
      auto& var_ptr = name_to_var[var->Name()];
      PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(),
                        true,
                        "There are different variables with same name %s",
                        var->Name());
      var_ptr = &var;
    }
  }

  // Build backward ins and outs
  for (size_t i = 0; i < grad_op_num; i++) {
    // Step 1: build the grad op and add it to the engine

    // Use the trace id to decide the order of gradient sum in sorted sum mode
    size_t trace_id = fwd_op->id();
    std::shared_ptr<OpBase> grad_op =
        OpBase::Create(trace_id, (*(grad_op_descs_[i].get())), fwd_op->place());

    // this OpBase* is just used to manage the op's lifetime
    engine_->InsertOp(grad_op.get(), grad_op);

    std::unordered_set<OpBase*> visited_preceding_ops;
    // Step 2: prepare grad_in vars and bind them with grad_op,
    // setting the inputs' grad_op to the current grad_op
    for (const auto& grad_ins : grad_op_descs_[i]->Inputs()) {
      if (grad_ins.second.empty()) continue;
      auto& bwd_in = (*grad_op->GetMutableInsMap())[grad_ins.first];
      bwd_in.reserve(grad_ins.second.size());
      for (auto& grad_in_var_name : grad_ins.second) {
        auto iter = grad_to_var.find(grad_in_var_name);
        if (iter != grad_to_var.end()) {
          // If it is a grad var, find its corresponding forward var
          auto& fwd_var_name = iter->second;
          auto fwd_var_iter = name_to_var.find(fwd_var_name);
          PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
                            "Cannot find forward variable named %s",
                            fwd_var_name);
          PADDLE_ENFORCE_NOT_NULL(
              (*(fwd_var_iter->second))->GradVarBase(),
              "Grad of %s should "
              "not be NULL when we Track_Backward Input of %s",
              (*(fwd_var_iter->second))->Name(), grad_op->Type());
          (*(fwd_var_iter->second))->GradVarBase()->AddGradOps(grad_op);
          VLOG(3) << "Add Grad Op " << grad_op->Type() << " for :"
                  << (*(fwd_var_iter->second))->GradVarBase()->Name();
          bwd_in.emplace_back((*(fwd_var_iter->second))->GradVarBase());
        } else {
          // If it is a forward var, just add it
          auto fwd_var_iter = name_to_var.find(grad_in_var_name);
          PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
                            "Cannot find forward variable named %s",
                            grad_in_var_name);
          bwd_in.emplace_back(*(fwd_var_iter->second));
        }

        VLOG(3) << "Set backward input " << grad_ins.first << " of "
                << grad_op->Type() << " to be "
                << (bwd_in.back() ? bwd_in.back()->Name() : "nullptr");
      }
    }

    // Step 3: prepare grad_out vars and use their grad ops to set the current
    // grad_op's preceding ops
    for (auto& grad_outs : grad_op_descs_[i]->Outputs()) {
      if (grad_outs.second.empty()) continue;
      auto& bwd_out = (*grad_op->GetMutableOutsMap())[grad_outs.first];
      bwd_out.reserve(grad_outs.second.size());
      for (auto& grad_out_var_name : grad_outs.second) {
        auto iter = grad_to_var.find(grad_out_var_name);
        PADDLE_ENFORCE_EQ(iter != grad_to_var.end(), true,
                          "Cannot find output of input grad %s in op %s",
                          grad_out_var_name, fwd_op->Type());
        auto fwd_var_iter = name_to_var.find(iter->second);
        PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
                          "Cannot find forward variable named %s",
                          iter->second);
        PADDLE_ENFORCE_NOT_NULL(
            (*(fwd_var_iter->second))->GradVarBase(),
            "Grad of %s should "
            "not be NULL when we Track_Backward Output of %s",
            (*(fwd_var_iter->second))->Name(), grad_op->Type());
        bwd_out.emplace_back((*(fwd_var_iter->second))->GradVarBase());
        VLOG(3) << "Set backward output " << grad_outs.first << " of "
                << grad_op->Type() << " to be "
                << (bwd_out.back() ? bwd_out.back()->Name() : "nullptr");

        auto preceding_ops =
            (*(fwd_var_iter->second))->GradVarBase()->GradOps();
        if (VLOG_IS_ON(3) && !preceding_ops.empty()) {
          VLOG(3) << "Preceding ops of "
                  << (*(fwd_var_iter->second))->GradVarBase()->Name()
                  << " are:";
          for (const auto& op : preceding_ops) {
            VLOG(3) << op->Type();
          }
        }

        if (!preceding_ops.empty()) {
          for (const auto& op : preceding_ops) {
            PADDLE_ENFORCE_NOT_NULL(op, "preceding_op should not be nullptr");
            if (visited_preceding_ops.count(op) == 0) {
              visited_preceding_ops.insert(op);
              grad_op->InsertGradPendingOps(op);
            }
          }
        } else {
          VLOG(5) << "Hit leaf VarBase "
                  << (*(fwd_var_iter->second))->GradVarBase()->Name();
        }
      }
    }
    // Sort to keep the same numeric stability as the static graph
    grad_op->SortGradPendingOps();
  }
}

}  // namespace imperative
}  // namespace paddle
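End to end, one TraceOp call runs the forward kernel and, when trace_backward is true, wires the matching grad op into the engine. A sketch that mirrors test_tracer.cc above (x, y and out are VarBases created with grad vars, as in that test):

imperative::Tracer tracer;
imperative::NameVarBaseMap ins = {{"X", {x}}, {"Y", {y}}};
imperative::NameVarBaseMap outs = {{"Out", {out}}};
framework::AttributeMap attrs;
attrs["use_mkldnn"] = false;
// Runs "mul" forward; TraceBackward then creates "mul_grad", binds the
// @GRAD vars through grad_to_var, and registers the op with the engine.
tracer.TraceOp("mul", ins, outs, attrs, platform::CPUPlace(), true);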
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
...@@ -14,46 +14,48 @@
#pragma once
#include <atomic>
#include <future>  // NOLINT
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace imperative {
class Tracer {
  DISABLE_COPY_AND_ASSIGN(Tracer);

 public:
  Tracer() : engine_(new BasicEngine()) {}

  ~Tracer() = default;

  void TraceOp(const std::string& type, const NameVarBaseMap& ins,
               const NameVarBaseMap& outs, framework::AttributeMap attrs,
               const platform::Place& place, bool trace_backward);

  bool ComputeRequiredGrad(const NameVarBaseMap& ins,
                           const NameVarBaseMap& outs, bool trace_backward);

  void TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
                     const framework::OpDesc& fwd_op_desc,
                     const NameVarBaseMap& ins, const NameVarBaseMap& outs);

  Engine* GetDefaultEngine() const { return engine_.get(); }

 private:
  static size_t GenerateUniqueId() {
    static std::atomic<size_t> id{0};
    return id.fetch_add(1);
  }

 private:
  std::unique_ptr<Engine> engine_;
};
}  // namespace imperative
......
...@@ -17,8 +17,6 @@ limitations under the License. */ ...@@ -17,8 +17,6 @@ limitations under the License. */
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <utility>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
...@@ -26,18 +24,10 @@ namespace imperative { ...@@ -26,18 +24,10 @@ namespace imperative {
class VarBase; class VarBase;
class OpBase; class OpBase;
class Tracer;
typedef std::map<std::string, std::vector<std::shared_ptr<VarBase>>> using NameVarBaseMap =
VarBasePtrMap; std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
typedef std::vector<std::weak_ptr<VarBase>> VarBaseWeakPtrList;
typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
typedef std::unordered_map<
const VarBase*,
std::pair<platform::Place,
std::vector<std::pair<int, std::shared_ptr<VarBase>>>>>
BackwardSumMap; // var_grad -> {place, {id -> var_grad@rename}}
typedef std::unordered_map<const VarBase*, std::pair<int, bool>> GradientRef;
// var_grad -> {ref_times, is_first_to_be_accumulate}
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
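The cluster of typedefs above collapses into the single NameVarBaseMap alias. On the Python side this corresponds to a plain dict from slot name to a Variable or a list/tuple of Variables, which the pybind conversion shown below flattens. A hedged sketch; the slot names 'X'/'Out' are illustrative:

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    a = fluid.dygraph.to_variable(np.ones([2], dtype='float32'))
    b = fluid.dygraph.to_variable(np.ones([2], dtype='float32'))
    ins = {'X': [a, b]}   # each slot: Variable, list[Variable] or tuple[Variable]
    outs = {'Out': a}
    # C++ side: std::map<std::string, std::vector<std::shared_ptr<VarBase>>>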
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
tracer analysis_predictor imperative_profiler nccl_context) analysis_predictor imperative_profiler nccl_context imperative_flag)
if(WITH_PYTHON) if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op) list(APPEND PYBIND_DEPS py_func_op)
......
...@@ -20,11 +20,13 @@ limitations under the License. */ ...@@ -20,11 +20,13 @@ limitations under the License. */
#include <pybind11/functional.h> #include <pybind11/functional.h>
#include <pybind11/stl.h> #include <pybind11/stl.h>
#include <memory> #include <memory>
#include <string>
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include <vector>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/type_defs.h"
...@@ -44,16 +46,27 @@ class Layer : public imperative::Layer { ...@@ -44,16 +46,27 @@ class Layer : public imperative::Layer {
const std::vector<std::shared_ptr<imperative::VarBase>> &inputs) const std::vector<std::shared_ptr<imperative::VarBase>> &inputs)
override { override {
PYBIND11_OVERLOAD(std::vector<std::shared_ptr<imperative::VarBase>>, Layer, PYBIND11_OVERLOAD(std::vector<std::shared_ptr<imperative::VarBase>>, Layer,
Forward, Forward, inputs); // NOLINT
inputs); // NOLINT
} }
}; };
class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { // wrapper for a py::object so the imperative module does not depend on Python
// TODO(jiabin) Add OpBase's pybind interface back to enable backward hook
class PYBIND11_HIDDEN PyCallableObject {
public: public:
using imperative::OpBase::OpBase; // Inherit constructors PyCallableObject(std::shared_ptr<py::object> py_obj_ptr)
: py_obj_ptr_(std::move(py_obj_ptr)) {}
~PyCallableObject() {
// Take the GIL via an RAII guard before touching the Python object;
// a bare py::call_guard<...>() temporary is only a tag type and locks nothing.
py::gil_scoped_acquire guard;
py_obj_ptr_.reset();
}
void operator()() {
py::gil_scoped_acquire guard;
py_obj_ptr_->operator()(this);
}
PyOpBase(const std::string &name) : OpBase(name) {} private:
std::shared_ptr<py::object> py_obj_ptr_;
}; };
// Function like obj.attr_name in Python. // Function like obj.attr_name in Python.
...@@ -125,33 +138,43 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { ...@@ -125,33 +138,43 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
} }
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"unsupported type %s, must be Variable, List[Variable] or " "unsupported type %s, must be Variable, list[Variable] or "
"tuple[Variable]", "tuple[Variable]",
py::str(handle)); py::str(handle));
} }
PADDLE_ENFORCE(PyErr_Occurred() == nullptr,
py::str(py::handle(PyErr_Occurred())));
return result; return result;
} }
using PyVarBaseMap = std::unordered_map<std::string, py::handle>; using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>;
static imperative::VarBasePtrMap ConvertToVarBasePtrMap( static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
const PyVarBaseMap &map) { const PyNameVarBaseMap &map) {
imperative::VarBasePtrMap result; imperative::NameVarBaseMap result;
for (auto &pair : map) { for (auto &pair : map) {
auto var_vec = GetVarBaseListFromPyHandle(pair.second); auto var_vec = GetVarBaseListFromPyHandle(pair.second);
if (!var_vec.empty()) { if (!var_vec.empty()) {
result.emplace(pair.first, std::move(var_vec)); result.emplace(pair.first, std::move(var_vec));
} }
} }
PADDLE_ENFORCE_EQ(PyErr_Occurred() == nullptr, true,
py::str(py::handle(PyErr_Occurred())));
return result; return result;
} }
static std::string GetTypeName(const imperative::VarBase &var) {
if (var.Type() == framework::proto::VarType::RAW) {
return "RAW";
} else if (!var.Var().IsInitialized()) {
return "nullptr";
} else {
return framework::ToTypeName(var.Var().Type());
}
}
// Bind Methods // Bind Methods
void BindImperative(pybind11::module *m_ptr) { void BindImperative(py::module *m_ptr) {
auto &m = *m_ptr; auto &m = *m_ptr;
py::class_<imperative::detail::BackwardStrategy> backward_strategy( py::class_<imperative::detail::BackwardStrategy> backward_strategy(
...@@ -200,69 +223,88 @@ void BindImperative(pybind11::module *m_ptr) { ...@@ -200,69 +223,88 @@ void BindImperative(pybind11::module *m_ptr) {
m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); }); m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); });
py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>( py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
m, "VarBase", R"DOC()DOC") m, "VarBase",
R"DOC()DOC")
.def_static("_alive_vars", &imperative::VarBase::AliveVarNames) .def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
.def( .def("__init__",
py::init<const std::string &, paddle::framework::proto::VarType::Type, [](imperative::VarBase &self, const std::string &name,
const std::vector<int64_t>, const paddle::platform::CPUPlace, framework::proto::VarType::Type type,
bool, bool>()) framework::proto::VarType::Type dtype,
.def( const std::vector<int> &dims, bool stop_gradient,
py::init<const std::string &, paddle::framework::proto::VarType::Type, bool persistable) {
const std::vector<int64_t>, new (&self) imperative::VarBase(name);
const paddle::platform::CUDAPlace, bool, bool>()) self.SetPersistable(persistable);
self.SetType(type);
self.SetDataType(dtype);
self.SetStopGradient(stop_gradient);
if (type == framework::proto::VarType::LOD_TENSOR) {
auto *tensor =
self.MutableVar()->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dims));
}
})
.def("_run_backward", .def("_run_backward",
[](imperative::VarBase &self, [](imperative::VarBase &self,
const imperative::detail::BackwardStrategy &bckst) { const imperative::detail::BackwardStrategy &bckst,
self.RunBackward(bckst); const imperative::Tracer &tracer) {
}) // TODO(jiabin): when we impl more backward execution we can select
.def("_grad_name", &imperative::VarBase::GradName) // them
.def("_grad_value", &imperative::VarBase::GradValue)
imperative::Engine *engine = tracer.GetDefaultEngine();
VLOG(3) << "Start backward";
engine->Init(&self, bckst);
engine->Execute();
VLOG(3) << "Finish backward";
},
py::call_guard<py::gil_scoped_release>())
.def("_grad_name", &imperative::VarBase::GradVarName)
.def("_grad_value",
[](imperative::VarBase &self) {
return self.MutableGradVar()->Get<framework::LoDTensor>();
},
py::return_value_policy::reference)
.def("_clear_gradient", &imperative::VarBase::ClearGradient) .def("_clear_gradient", &imperative::VarBase::ClearGradient)
.def("_grad_ivar", .def("_grad_ivar",
[](const imperative::VarBase &self) { return self.grads_; }, [](const imperative::VarBase &self) {
py::return_value_policy::reference) auto &grad_var = self.GradVarBase();
if (grad_var && grad_var->Var().IsInitialized()) {
return grad_var;
} else {
return std::shared_ptr<imperative::VarBase>(nullptr);
}
},
py::return_value_policy::copy)
.def("_copy_to", .def("_copy_to",
[](const imperative::VarBase &self, const platform::CPUPlace &place, [](const imperative::VarBase &self, const platform::CPUPlace &place,
bool blocking) { bool blocking) { return self.NewVarBase(place, blocking); },
return self.NewVarBase(place, blocking).release(); py::return_value_policy::copy)
},
py::return_value_policy::take_ownership)
.def("_copy_to", .def("_copy_to",
[](const imperative::VarBase &self, const platform::CUDAPlace &place, [](const imperative::VarBase &self, const platform::CUDAPlace &place,
bool blocking) { bool blocking) { return self.NewVarBase(place, blocking); },
return self.NewVarBase(place, blocking).release(); py::return_value_policy::copy)
}, .def("value", [](imperative::VarBase &self) { return self.MutableVar(); },
py::return_value_policy::take_ownership)
.def("value",
[](const imperative::VarBase &self) { return self.var_.get(); },
py::return_value_policy::reference) py::return_value_policy::reference)
.def_property("name", &imperative::VarBase::Name, .def_property("name", &imperative::VarBase::Name,
&imperative::VarBase::SetName) &imperative::VarBase::SetName)
.def_property_readonly("shape", &imperative::VarBase::Shape) .def_property_readonly(
"shape",
[](imperative::VarBase &self) {
if (self.Var().IsType<framework::LoDTensor>()) {
return framework::vectorize2int(
self.Var().Get<framework::LoDTensor>().dims());
} else {
VLOG(2) << "It is meaningless to get shape of variable type "
<< GetTypeName(self);
return std::vector<int>();
}
})
.def_property_readonly("type", &imperative::VarBase::Type)
.def_property_readonly("dtype", &imperative::VarBase::DataType) .def_property_readonly("dtype", &imperative::VarBase::DataType)
.def_property("persistable", &imperative::VarBase::IsPersistable, .def_property("persistable", &imperative::VarBase::Persistable,
&imperative::VarBase::SetPersistable) &imperative::VarBase::SetPersistable)
.def_property("stop_gradient", &imperative::VarBase::IsStopGradient, .def_property("stop_gradient", &imperative::VarBase::StopGradient,
&imperative::VarBase::SetStopGradient); &imperative::VarBase::SetStopGradient);
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
.def(py::init<const std::string &>())
.def("register_backward_hooks",
[](imperative::OpBase &self, const py::object &callable) {
self.RegisterBackwardHooks(callable);
})
.def_property("_trace_id",
[](const imperative::OpBase &self) {
py::gil_scoped_release release;
return self.trace_id_;
},
[](imperative::OpBase &self, int trace_id) {
py::gil_scoped_release release;
self.trace_id_ = trace_id;
},
py::return_value_policy::reference)
.def_property_readonly("type", &imperative::OpBase::Type);
py::class_<imperative::Layer, Layer /* <--- trampoline*/> layer(m, "Layer"); py::class_<imperative::Layer, Layer /* <--- trampoline*/> layer(m, "Layer");
layer.def(py::init<>()) layer.def(py::init<>())
.def("forward", .def("forward",
...@@ -271,40 +313,33 @@ void BindImperative(pybind11::module *m_ptr) { ...@@ -271,40 +313,33 @@ void BindImperative(pybind11::module *m_ptr) {
return self.Forward(inputs); return self.Forward(inputs);
}); });
// NOTE(zjl): Tracer use PyVarBaseMap as its parameter but not VarBasePtrMap.
// We call Python C-API to convert PyVarBaseMap to VarBasePtrMap, instead
// making conversion in Python code. This speed up Tracer.trace() about 6%
// in ptb model and make time cost in Python to be nearly zero.
py::class_<imperative::Tracer>(m, "Tracer", "") py::class_<imperative::Tracer>(m, "Tracer", "")
.def("__init__", .def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) { [](imperative::Tracer &self) { new (&self) imperative::Tracer(); })
new (&self) imperative::Tracer(root_block);
})
.def("trace", .def("trace",
[](imperative::Tracer &self, imperative::OpBase *op, [](imperative::Tracer &self, const std::string &type,
const PyVarBaseMap &inputs, const PyVarBaseMap &outputs, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
framework::AttributeMap attrs_map, framework::AttributeMap attrs, const platform::CUDAPlace &place,
const platform::CPUPlace expected_place, bool trace_backward) {
const bool stop_gradient = false) { auto ins_map = ConvertToNameVarBaseMap(ins);
auto ins = ConvertToVarBasePtrMap(inputs); auto outs_map = ConvertToNameVarBaseMap(outs);
auto outs = ConvertToVarBasePtrMap(outputs);
{ {
py::gil_scoped_release release; py::gil_scoped_release release;
self.Trace(op, std::move(ins), &outs, attrs_map, expected_place, self.TraceOp(type, std::move(ins_map), std::move(outs_map),
stop_gradient); std::move(attrs), place, trace_backward);
} }
}) })
.def("trace", [](imperative::Tracer &self, imperative::OpBase *op, .def("trace",
const PyVarBaseMap &inputs, const PyVarBaseMap &outputs, [](imperative::Tracer &self, const std::string &type,
framework::AttributeMap attrs_map, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
const platform::CUDAPlace expected_place, framework::AttributeMap attrs, const platform::CPUPlace &place,
const bool stop_gradient = false) { bool trace_backward) {
auto ins = ConvertToVarBasePtrMap(inputs); auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs = ConvertToVarBasePtrMap(outputs); auto outs_map = ConvertToNameVarBaseMap(outs);
{ {
py::gil_scoped_release release; py::gil_scoped_release release;
self.Trace(op, std::move(ins), &outs, attrs_map, expected_place, self.TraceOp(type, std::move(ins_map), std::move(outs_map),
stop_gradient); std::move(attrs), place, trace_backward);
} }
}); });
......
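The binding registers one trace overload per place type, so pybind11 dispatches on the concrete place object, and the GIL is released around TraceOp. A sketch of selecting the place the overloads will receive, assuming only APIs shown in this diff:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core

place = core.CUDAPlace(0) if core.is_compiled_with_cuda() else core.CPUPlace()
with fluid.dygraph.guard(place):
    x = fluid.dygraph.to_variable(np.ones([2, 2], dtype='float32'))
    y = fluid.layers.scale(x, scale=3.0)  # trace() receives this place type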
...@@ -14,10 +14,6 @@ limitations under the License. */ ...@@ -14,10 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#include <Python.h> #include <Python.h>
#include <string>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
......
...@@ -18,6 +18,7 @@ from paddle.fluid import core ...@@ -18,6 +18,7 @@ from paddle.fluid import core
from paddle.fluid import framework from paddle.fluid import framework
from .tracer import Tracer from .tracer import Tracer
import logging import logging
import objgraph
__all__ = [ __all__ = [
'no_grad', 'no_grad',
...@@ -123,7 +124,7 @@ def guard(place=None): ...@@ -123,7 +124,7 @@ def guard(place=None):
""" """
train = framework.Program() train = framework.Program()
startup = framework.Program() startup = framework.Program()
tracer = Tracer(train.current_block().desc) tracer = Tracer()
if place is None: if place is None:
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
...@@ -138,19 +139,22 @@ def guard(place=None): ...@@ -138,19 +139,22 @@ def guard(place=None):
yield yield
def _print_debug_msg(): def _print_debug_msg(limit=5, is_test=False):
if not core._is_dygraph_debug_enabled(): if not core._is_dygraph_debug_enabled():
logging.warn( logging.warn(
'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug' 'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
) )
return return
unique_name_size = len(framework.unique_name.generator.ids) unique_name_size = len(framework.unique_name.generator.ids)
tracer_var_size = len(framework._dygraph_tracer()._vars) tracer_var_size = len(framework._dygraph_tracer()._vars)
alive_cpp_var_size = len(core.VarBase._alive_vars()) alive_cpp_var_size = len(core.VarBase._alive_vars())
if not is_test:
logging.warn( logging.warn(
'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}' 'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
.format(unique_name_size, tracer_var_size, alive_cpp_var_size)) .format(unique_name_size, tracer_var_size, alive_cpp_var_size))
objgraph.show_growth(limit=limit)
else:
return unique_name_size, tracer_var_size, alive_cpp_var_size
def to_variable(value, block=None, name=None): def to_variable(value, block=None, name=None):
......
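_print_debug_msg now takes limit/is_test; with is_test=True it returns the three counters instead of logging and dumping objgraph growth, which is what the new debug-string unit test below relies on. A sketch, noting that FLAGS_dygraph_debug=1 must be set for the counters to be returned:

import paddle.fluid as fluid
from paddle.fluid import core

with fluid.dygraph.guard():
    if core._is_dygraph_debug_enabled():  # requires FLAGS_dygraph_debug=1
        counters = fluid.dygraph.base._print_debug_msg(is_test=True)
        unique_names, tracer_vars, alive_cpp_vars = counters
        print(unique_names, tracer_vars, alive_cpp_vars)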
...@@ -20,7 +20,7 @@ from . import layers ...@@ -20,7 +20,7 @@ from . import layers
from . import parallel_helper from . import parallel_helper
from .. import framework from .. import framework
from ..layers import collective from ..layers import collective
from . import to_variable from . import to_variable, no_grad
__all__ = ["prepare_context"] __all__ = ["prepare_context"]
...@@ -197,6 +197,7 @@ class DataParallel(layers.Layer): ...@@ -197,6 +197,7 @@ class DataParallel(layers.Layer):
for g_var, g_shape in zip(origin_grad_vars, grad_shapes): for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
nn.reshape(x=g_var, shape=g_shape, inplace=True) nn.reshape(x=g_var, shape=g_shape, inplace=True)
@no_grad
def apply_collective_grads(self): def apply_collective_grads(self):
""" """
AllReduce the Parameters' gradient. AllReduce the Parameters' gradient.
......
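apply_collective_grads is now wrapped in @no_grad so the reshape/allreduce ops it emits are traced without backward ops. A sketch of the same pattern on a hypothetical helper (double is not part of DataParallel):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable, no_grad

@no_grad
def double(x):
    # ops emitted here are traced with gradients suppressed
    return fluid.layers.scale(x, scale=2.0)

with fluid.dygraph.guard():
    out = double(to_variable(np.ones([4], dtype='float32')))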
...@@ -23,21 +23,15 @@ from paddle.fluid import framework ...@@ -23,21 +23,15 @@ from paddle.fluid import framework
__all__ = ['Tracer'] __all__ = ['Tracer']
def release_op(op):
del framework._dygraph_tracer()._ops[op._trace_id]
class Tracer(core.Tracer): class Tracer(core.Tracer):
""" """
Python wrapper of dygraph tracer Python wrapper of dygraph tracer
""" """
def __init__(self, block): def __init__(self):
super(Tracer, self).__init__(block) super(Tracer, self).__init__()
self._ops = defaultdict()
self._vars = defaultdict() self._vars = defaultdict()
self._trace_id = 0
self._train_mode = True self._train_mode = True
def trace_var(self, name, var): def trace_var(self, name, var):
...@@ -47,23 +41,10 @@ class Tracer(core.Tracer): ...@@ -47,23 +41,10 @@ class Tracer(core.Tracer):
return list((item for name, item in six.iteritems(self._vars) return list((item for name, item in six.iteritems(self._vars)
if isinstance(item, framework.Parameter))) if isinstance(item, framework.Parameter)))
def _clear_ops(self): def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False):
self._ops = defaultdict() self.trace(type, inputs, outputs, attrs,
self._trace_id = 0 framework._current_expected_place(), self._train_mode and
not stop_gradient)
def trace_op(self, op, inputs, outputs, stop_gradient=False):
# record op's trace id
op.iop._trace_id = self._trace_id
self.trace(op.iop, inputs, outputs, op.attrs,
framework._current_expected_place(), stop_gradient)
if not stop_gradient and self._train_mode:
self._trace_id += 1
self._ops[op.iop._trace_id] = op
# register backward hooks and variables if needed
op.iop.register_backward_hooks(release_op)
def train_mode(self): def train_mode(self):
self._train_mode = True self._train_mode = True
......
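The Python tracer no longer keeps per-op bookkeeping (_ops, _trace_id, release_op hooks); trace_op derives the backward flag as self._train_mode and not stop_gradient. A standalone truth-table check of that expression, not Paddle code:

def trace_backward(train_mode, stop_gradient):
    return train_mode and not stop_gradient

assert trace_backward(True, False) is True    # training, gradients traced
assert trace_backward(True, True) is False    # stop_gradient wins
assert trace_backward(False, False) is False  # eval mode never traces backward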
...@@ -458,9 +458,10 @@ class Variable(object): ...@@ -458,9 +458,10 @@ class Variable(object):
self._ivar = kwargs.get("ivar", None) self._ivar = kwargs.get("ivar", None)
if not self._ivar: if not self._ivar:
self._ivar = core.VarBase( self._ivar = core.VarBase(
name, dtype if dtype else core.VarDesc.VarType.FP32, name, type
list(shape) if shape else [], if type else core.VarDesc.VarType.LOD_TENSOR, dtype
_current_expected_place(), stop_gradient, True if dtype else core.VarDesc.VarType.FP32,
list(shape) if shape else [], stop_gradient, True
if persistable else False) if persistable else False)
if persistable: if persistable:
_dygraph_tracer().trace_var(name, self) _dygraph_tracer().trace_var(name, self)
...@@ -582,13 +583,16 @@ class Variable(object): ...@@ -582,13 +583,16 @@ class Variable(object):
return np.array(new_ivar.value().get_tensor()) return np.array(new_ivar.value().get_tensor())
def backward(self, backward_strategy=None): def backward(self, backward_strategy=None):
if in_dygraph_mode():
from .dygraph import BackwardStrategy from .dygraph import BackwardStrategy
if backward_strategy is None: if backward_strategy is None:
backward_strategy = BackwardStrategy() backward_strategy = BackwardStrategy()
backward_strategy.sort_sum_gradient = False backward_strategy.sort_sum_gradient = False
self._ivar._run_backward(backward_strategy) self._ivar._run_backward(backward_strategy, _dygraph_tracer())
_dygraph_tracer()._clear_ops() else:
raise ValueError(
"Variable.backward() is only avaliable in DyGraph mode")
def gradient(self): def gradient(self):
new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True) new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
...@@ -616,9 +620,13 @@ class Variable(object): ...@@ -616,9 +620,13 @@ class Variable(object):
""" """
if in_dygraph_mode(): if in_dygraph_mode():
# TODO(panyx0718): add more dygraph debug info. # TODO(panyx0718): add more dygraph debug info.
tensor = self._ivar.value().get_tensor()
if tensor._is_initialized():
return 'name %s, dtype: %s shape: %s %s' % ( return 'name %s, dtype: %s shape: %s %s' % (
self.name, self.dtype, self.shape, self.name, self.dtype, self.shape, str(tensor))
str(self._ivar.value().get_tensor())) else:
return 'name %s, shape: %s, not inited' % (self.name,
self.shape)
assert isinstance(throw_on_error, bool) and isinstance(with_details, assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool) bool)
...@@ -713,7 +721,7 @@ class Variable(object): ...@@ -713,7 +721,7 @@ class Variable(object):
@property @property
def type(self): def type(self):
if in_dygraph_mode(): if in_dygraph_mode():
return self._ivar.dtype return self._ivar.type
else: else:
return self.desc.type() return self.desc.type()
...@@ -1085,9 +1093,7 @@ class Operator(object): ...@@ -1085,9 +1093,7 @@ class Operator(object):
if type is None: if type is None:
raise ValueError( raise ValueError(
"`type` to initialized an Operator can not be None.") "`type` to initialized an Operator can not be None.")
self.iop = core.OpBase(type) self._type = type
self.previous_ops = []
self.attrs = attrs if attrs else {} self.attrs = attrs if attrs else {}
else: else:
self.block = block self.block = block
...@@ -1233,7 +1239,7 @@ class Operator(object): ...@@ -1233,7 +1239,7 @@ class Operator(object):
@property @property
def type(self): def type(self):
if in_dygraph_mode(): if in_dygraph_mode():
return self.iop.type return self._type
else: else:
return self.desc.type() return self.desc.type()
...@@ -1787,10 +1793,12 @@ class Block(object): ...@@ -1787,10 +1793,12 @@ class Block(object):
else: else:
attrs['is_test'] = False attrs['is_test'] = False
type = kwargs.get("type", None)
op = Operator( op = Operator(
block=self, block=self,
desc=None, desc=None,
type=kwargs.get("type", None), type=type,
inputs=None, inputs=None,
outputs=None, outputs=None,
attrs=attrs) attrs=attrs)
...@@ -1799,9 +1807,11 @@ class Block(object): ...@@ -1799,9 +1807,11 @@ class Block(object):
# #
# TODO(minqiyang): add op stop_gradient support in static mode too. # TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in dygraph mode. # currently, we only support stop_gradient in dygraph mode.
_dygraph_tracer().trace_op(op,
_dygraph_tracer().trace_op(type,
kwargs.get("inputs", {}), kwargs.get("inputs", {}),
kwargs.get("outputs", {}), kwargs.get("outputs", {}), attrs
if attrs else {},
kwargs.get("stop_gradient", False)) kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
...@@ -1862,17 +1872,15 @@ class Block(object): ...@@ -1862,17 +1872,15 @@ class Block(object):
def _prepend_op(self, *args, **kwargs): def _prepend_op(self, *args, **kwargs):
if in_dygraph_mode(): if in_dygraph_mode():
type = kwargs.get("type", None)
attrs = kwargs.get("attrs", {})
op = Operator( op = Operator(
self, self, None, type=type, inputs=None, outputs=None, attrs=attrs)
None,
type=kwargs.get("type", None),
inputs=None,
outputs=None,
attrs=kwargs.get("attrs", {}))
_dygraph_tracer().trace_op(op, _dygraph_tracer().trace_op(type,
kwargs.get("inputs", {}), kwargs.get("inputs", {}),
kwargs.get("outputs", {}), kwargs.get("outputs", {}), attrs
if attrs else {},
kwargs.get("stop_gradient", False)) kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc._prepend_op() op_desc = self.desc._prepend_op()
......
...@@ -615,9 +615,6 @@ class Optimizer(object): ...@@ -615,9 +615,6 @@ class Optimizer(object):
optimize_ops = self.apply_optimize( optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads) loss, startup_program=startup_program, params_grads=params_grads)
if framework.in_dygraph_mode():
framework._dygraph_tracer()._clear_ops()
return optimize_ops, params_grads return optimize_ops, params_grads
......
...@@ -177,7 +177,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api) ...@@ -177,7 +177,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op) list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
list(REMOVE_ITEM TEST_OPS test_basic_lstm_api) list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
list(REMOVE_ITEM TEST_OPS test_imperative_debug_string)
# Some ops need to check results when gc is enabled # Some ops need to check results when gc is enabled
# Currently, only ops that register NoNeedBufferVarsInference need to do this test # Currently, only ops that register NoNeedBufferVarsInference need to do this test
set(TEST_OPS_WITH_GC set(TEST_OPS_WITH_GC
...@@ -240,6 +240,7 @@ py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_ ...@@ -240,6 +240,7 @@ py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_
py_test_modules(test_install_check MODULES test_install_check ENVS py_test_modules(test_install_check MODULES test_install_check ENVS
FLAGS_cudnn_deterministic=1 SERIAL) FLAGS_cudnn_deterministic=1 SERIAL)
set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST") set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS}) py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS})
py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS}) py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS})
......
...@@ -27,17 +27,40 @@ import paddle.fluid as fluid ...@@ -27,17 +27,40 @@ import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph import paddle.fluid.dygraph as dygraph
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, LayerNorm
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
import math
from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
momentum_rate = 0.9
l2_decay = 1.2e-4
def optimizer_setting(params):
ls = params["learning_strategy"]
if "total_images" not in params:
total_images = 6149
else:
total_images = params["total_images"]
batch_size = ls["batch_size"]
step = int(math.ceil(float(total_images) / batch_size))
bd = [step * e for e in ls["epochs"]]
lr = params["lr"]
num_epochs = params["num_epochs"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay))
return optimizer
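A worked example of the step computation above, under the default total_images=6149 and batch_size=32:

import math

step = int(math.ceil(6149.0 / 32))  # mini-batches per epoch
assert step == 193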
class ConvBNLayer(fluid.dygraph.Layer): class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
num_channels,
num_filters, num_filters,
filter_size, filter_size,
stride=1, stride=1,
...@@ -46,26 +69,21 @@ class ConvBNLayer(fluid.dygraph.Layer): ...@@ -46,26 +69,21 @@ class ConvBNLayer(fluid.dygraph.Layer):
super(ConvBNLayer, self).__init__(name_scope) super(ConvBNLayer, self).__init__(name_scope)
self._conv = Conv2D( self._conv = Conv2D(
self.full_name(), "conv2d",
num_filters=num_filters, num_filters=num_filters,
filter_size=filter_size, filter_size=filter_size,
stride=stride, stride=stride,
padding=(filter_size - 1) // 2, padding=(filter_size - 1) // 2,
groups=groups, groups=groups,
act=None, act=None,
bias_attr=None) bias_attr=False,
param_attr=fluid.ParamAttr(name="weights"))
self._batch_norm = BatchNorm( self._layer_norm = LayerNorm(self.full_name(), begin_norm_axis=1)
self.full_name(), num_filters, act=act, momentum=0.1)
self._layer_norm = fluid.dygraph.nn.LayerNorm(
self.full_name(), begin_norm_axis=1)
def forward(self, inputs): def forward(self, inputs):
y = self._conv(inputs) y = self._conv(inputs)
# FIXME(zcd): when compare the result of multi-card and single-card,
# we should replace batch_norm with layer_norm.
y = self._layer_norm(y) y = self._layer_norm(y)
# y = self._batch_norm(y)
return y return y
...@@ -76,17 +94,19 @@ class SqueezeExcitation(fluid.dygraph.Layer): ...@@ -76,17 +94,19 @@ class SqueezeExcitation(fluid.dygraph.Layer):
super(SqueezeExcitation, self).__init__(name_scope) super(SqueezeExcitation, self).__init__(name_scope)
self._pool = Pool2D( self._pool = Pool2D(
self.full_name(), pool_size=0, pool_type='avg', global_pooling=True) self.full_name(), pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(num_channels * 1.0)
self._squeeze = FC( self._squeeze = FC(
self.full_name(), self.full_name(),
size=num_channels // reduction_ratio, size=num_channels // reduction_ratio,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)), initializer=fluid.initializer.Uniform(-stdv, stdv)),
act='relu') act='relu')
stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0)
self._excitation = FC( self._excitation = FC(
self.full_name(), self.full_name(),
size=num_channels, size=num_channels,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)), initializer=fluid.initializer.Uniform(-stdv, stdv)),
act='sigmoid') act='sigmoid')
def forward(self, input): def forward(self, input):
...@@ -110,39 +130,37 @@ class BottleneckBlock(fluid.dygraph.Layer): ...@@ -110,39 +130,37 @@ class BottleneckBlock(fluid.dygraph.Layer):
self.conv0 = ConvBNLayer( self.conv0 = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=num_channels,
num_filters=num_filters, num_filters=num_filters,
filter_size=1) filter_size=1,
act="relu")
self.conv1 = ConvBNLayer( self.conv1 = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=num_filters,
num_filters=num_filters, num_filters=num_filters,
filter_size=3, filter_size=3,
stride=stride, stride=stride,
groups=cardinality) groups=cardinality,
act="relu")
self.conv2 = ConvBNLayer( self.conv2 = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=num_filters, num_filters=num_filters * 2,
num_filters=num_filters * 4,
filter_size=1, filter_size=1,
act='relu') act=None)
self.scale = SqueezeExcitation( self.scale = SqueezeExcitation(
self.full_name(), self.full_name(),
num_channels=num_filters * 4, num_channels=num_filters * 2,
reduction_ratio=reduction_ratio) reduction_ratio=reduction_ratio)
if not shortcut: if not shortcut:
self.short = ConvBNLayer( self.short = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=num_channels, num_filters=num_filters * 2,
num_filters=num_filters * 4,
filter_size=1, filter_size=1,
stride=stride) stride=stride)
self.shortcut = shortcut self.shortcut = shortcut
self._num_channels_out = num_filters * 4 self._num_channels_out = num_filters * 2
def forward(self, inputs): def forward(self, inputs):
y = self.conv0(inputs) y = self.conv0(inputs)
...@@ -155,10 +173,7 @@ class BottleneckBlock(fluid.dygraph.Layer): ...@@ -155,10 +173,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
else: else:
short = self.short(inputs) short = self.short(inputs)
y = fluid.layers.elementwise_add(x=short, y=scale) y = fluid.layers.elementwise_add(x=short, y=scale, act='relu')
layer_helper = LayerHelper(self.full_name(), act='relu')
y = layer_helper.append_activation(y)
return y return y
...@@ -178,7 +193,6 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -178,7 +193,6 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024] num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer( self.conv0 = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=3,
num_filters=64, num_filters=64,
filter_size=7, filter_size=7,
stride=2, stride=2,
...@@ -196,8 +210,7 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -196,8 +210,7 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024] num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer( self.conv0 = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=3, num_filters=64,
num_filters=3,
filter_size=7, filter_size=7,
stride=2, stride=2,
act='relu') act='relu')
...@@ -214,24 +227,21 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -214,24 +227,21 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024] num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer( self.conv0 = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=3, num_filters=64,
num_filters=3, filter_size=3,
filter_size=7,
stride=2, stride=2,
act='relu') act='relu')
self.conv1 = ConvBNLayer( self.conv1 = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=64, num_filters=64,
num_filters=3, filter_size=3,
filter_size=7, stride=1,
stride=2,
act='relu') act='relu')
self.conv2 = ConvBNLayer( self.conv2 = ConvBNLayer(
self.full_name(), self.full_name(),
num_channels=64, num_filters=128,
num_filters=3, filter_size=3,
filter_size=7, stride=1,
stride=2,
act='relu') act='relu')
self.pool = Pool2D( self.pool = Pool2D(
self.full_name(), self.full_name(),
...@@ -261,16 +271,14 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -261,16 +271,14 @@ class SeResNeXt(fluid.dygraph.Layer):
self.pool2d_avg = Pool2D( self.pool2d_avg = Pool2D(
self.full_name(), pool_size=7, pool_type='avg', global_pooling=True) self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)
import math
stdv = 1.0 / math.sqrt(2048 * 1.0) stdv = 1.0 / math.sqrt(2048 * 1.0)
self.fc = FC(self.full_name(), self.out = FC(self.full_name(),
size=class_dim, size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr( param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv))) initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs, label): def forward(self, inputs):
if self.layers == 50 or self.layers == 101: if self.layers == 50 or self.layers == 101:
y = self.conv0(inputs) y = self.conv0(inputs)
y = self.pool(y) y = self.pool(y)
...@@ -283,13 +291,8 @@ class SeResNeXt(fluid.dygraph.Layer): ...@@ -283,13 +291,8 @@ class SeResNeXt(fluid.dygraph.Layer):
for bottleneck_block in self.bottleneck_block_list: for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y) y = bottleneck_block(y)
y = self.pool2d_avg(y) y = self.pool2d_avg(y)
# FIXME(zcd): the dropout should be removed when compare the y = self.out(y)
# result of multi-card and single-card. return y
# y = fluid.layers.dropout(y, dropout_prob=0.2, seed=1)
cost = self.fc(y)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
return avg_loss
class TestSeResNeXt(TestParallelDyGraphRunnerBase): class TestSeResNeXt(TestParallelDyGraphRunnerBase):
...@@ -312,8 +315,11 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase): ...@@ -312,8 +315,11 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase):
label = to_variable(y_data) label = to_variable(y_data)
label.stop_gradient = True label.stop_gradient = True
loss = model(img, label) out = model(img)
return loss softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss)
return avg_loss
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import numpy as np
class MLP(fluid.Layer):
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._fc1 = fluid.dygraph.FC(
self.full_name(),
3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
self._fc2 = fluid.dygraph.FC(
self.full_name(),
4,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
def forward(self, inputs):
x = self._fc1(inputs)
x = self._fc2(x)
x = fluid.layers.reduce_sum(x)
return x
class TestDygraphDebugString(unittest.TestCase):
def test_dygraph_debug_string(self):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
unique_name = 0
trace_var = 0
alive_var = 0
with fluid.dygraph.guard():
mlp = MLP("mlp")
for i in range(10):
var_inp = fluid.dygraph.base.to_variable(np_inp)
out = mlp(var_inp)
out.backward()
mlp.clear_gradients()
unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg(
is_test=True)
if i > 0:
self.assertGreaterEqual(unique_name, unique_name_tmp)
self.assertGreaterEqual(trace_var, trace_var_tmp)
self.assertGreaterEqual(alive_var, alive_var_tmp)
else:
unique_name = unique_name_tmp
trace_var = trace_var_tmp
alive_var = alive_var_tmp
try:
fluid.dygraph.base._print_debug_msg()
except Exception as e:
raise RuntimeError(
"No Exception is accepted in _print_debug_msg, but we got: {}".
format(e))
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import numpy as np
from test_imperative_base import new_program_scope
class MLP(fluid.Layer):
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._fc1 = fluid.dygraph.FC(
self.full_name(),
3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
self._fc2 = fluid.dygraph.FC(
self.full_name(),
4,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
def forward(self, inputs):
x = self._fc1(inputs)
x = self._fc2(x)
x = fluid.layers.reduce_sum(x)
return x
class TestDygraphFramework(unittest.TestCase):
def test_dygraph_backward(self):
with new_program_scope():
mlp = MLP("mlp")
var_inp = fluid.layers.data(
"input", shape=[2, 2], dtype="float32", append_batch_size=False)
out = mlp(var_inp)
try:
out.backward()
raise AssertionError(
"backward should not be usable in static graph mode")
except ValueError as e:
self.assertTrue((e is not None))
def test_dygraph_to_string(self):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
with fluid.dygraph.guard():
var_inp = fluid.dygraph.base.to_variable(np_inp)
var_inp.to_string(throw_on_error=True)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.fluid import Embedding, LayerNorm, FC, Layer
from paddle.fluid.dygraph import to_variable, guard
from test_imperative_base import new_program_scope
from paddle.fluid import core
import numpy as np
import six
np.set_printoptions(suppress=True)
# Copy from models
class TrainTaskConfig(object):
# support both CPU and GPU now.
use_gpu = True
# the epoch number to train.
pass_num = 30
# the number of sequences contained in a mini-batch.
# deprecated, set batch_size in args.
batch_size = 32
# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied by the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate = 2.0
beta1 = 0.9
beta2 = 0.997
eps = 1e-9
# the parameters for learning rate scheduling.
warmup_steps = 8000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps = 0.1
# the directory for saving trained models.
model_dir = "trained_models"
# the directory for saving checkpoints.
ckpt_dir = "trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path = None
# the parameter to initialize the learning rate scheduler.
# It should be provided if use checkpoints, since the checkpoint doesn't
# include the training step counter currently.
start_step = 0
# the frequency to save trained models.
save_freq = 10000
class InferTaskConfig(object):
use_gpu = True
# the number of examples in one run for sequence generation.
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
output_bos = False
output_eos = False
output_unk = True
# the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model"
class ModelHyperParams(object):
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
# size of target word dictionary
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
# index for <eos> token
eos_idx = 1
# index for <unk> token
unk_idx = 2
# max length of sequences deciding the size of position encoding table.
max_length = 4
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 2048
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of head used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rates of different modules.
prepostprocess_dropout = 0.1
attention_dropout = 0.1
relu_dropout = 0.1
# to process before each sub-layer
preprocess_cmd = "n" # layer normalization
# to process after each sub-layer
postprocess_cmd = "da" # dropout + residual connection
# random seed used in dropout for CE.
dropout_seed = None
# the flag indicating whether to share embedding and softmax weights.
# vocabularies in source and target should be same for weight sharing.
weight_sharing = True
def merge_cfg_from_list(cfg_list, g_cfgs):
"""
Set the above global configurations using the cfg_list.
"""
assert len(cfg_list) % 2 == 0
for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
for g_cfg in g_cfgs:
if hasattr(g_cfg, key):
try:
value = eval(value)
except Exception: # for file path
pass
setattr(g_cfg, key, value)
break
def position_encoding_init(n_position, d_pos_vec):
"""
Generate the initial values for the sinusoid position encoding table.
"""
channels = d_pos_vec
position = np.arange(n_position)
num_timescales = channels // 2
log_timescale_increment = (np.log(float(1e4) / float(1)) /
(num_timescales - 1))
inv_timescales = np.exp(np.arange(
num_timescales)) * -log_timescale_increment
scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
0)
signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
position_enc = signal
return position_enc.astype("float32")
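A quick sanity check for position_encoding_init, assuming it runs after the definitions above: the table is [n_position, d_pos_vec], the first half of each row is sin and the second half cos, so all entries lie in [-1, 1].

import numpy as np

tbl = position_encoding_init(ModelHyperParams.max_length, ModelHyperParams.d_model)
assert tbl.shape == (ModelHyperParams.max_length, ModelHyperParams.d_model)
assert tbl.dtype == np.float32 and np.all(np.abs(tbl) <= 1.0)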
def create_data(is_static=False):
if is_static:
return [
src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np,
trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np,
lbl_weight_np
]
else:
enc_inputs = [
to_variable(
src_word_np, name='src_word'), to_variable(
src_pos_np, name='src_pos'), to_variable(
src_slf_attn_bias_np, name='src_slf_attn_bias')
]
dec_inputs = [
to_variable(
trg_word_np, name='trg_word'), to_variable(
trg_pos_np, name='trg_pos'), to_variable(
trg_slf_attn_bias_np, name='trg_slf_attn_bias'),
to_variable(
trg_src_attn_bias_np, name='trg_src_attn_bias')
]
label = to_variable(lbl_word_np, name='lbl_word')
weight = to_variable(lbl_weight_np, name='lbl_weight')
return enc_inputs, dec_inputs, label, weight
def create_feed_dict_list(data, init=False):
if init:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names
else:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields
feed_dict_list = dict()
for i in range(len(data_input_names)):
feed_dict_list[data_input_names[i]] = data[i]
return feed_dict_list
def make_all_inputs(input_fields):
"""
Define the input data layers for the transformer model.
"""
inputs = []
for input_field in input_fields:
input_var = fluid.layers.data(
name=input_field,
shape=input_descs[input_field][0],
dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False)
inputs.append(input_var)
return inputs
# The placeholder for batch_size at compile time. Must be -1 currently to be
# consistent with some ops' infer-shape output at compile time, such as the
# sequence_expand op used in the beam-search decoder.
batch_size = -1
# The placeholder for sequence length at compile time.
seq_len = ModelHyperParams.max_length
# Listed here are the data shapes and data types of all inputs.
# The shapes act as placeholders and are set to pass the infer-shape
# check at compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size, max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size, max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# The actual data shape of trg_word is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(batch_size * seq_len, 1), "int64"],
# This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(batch_size * seq_len, 1), "float32"],
# This input is used in beam-search decoder.
"init_score": [(batch_size, 1), "float32", 2],
# This input is used in beam-search decoder for the first gather
# (cell state update)
"init_idx": [(batch_size, ), "int32"],
}
# Names of word embedding table which might be reused for weight sharing.
word_emb_param_names = (
"src_word_emb_table",
"trg_word_emb_table", )
# Names of position encoding table which will be initialized externally.
pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table", )
# separated inputs for different usages.
encoder_data_input_fields = (
"src_word",
"src_pos",
"src_slf_attn_bias", )
decoder_data_input_fields = (
"trg_word",
"trg_pos",
"trg_slf_attn_bias",
"trg_src_attn_bias",
"enc_output", )
label_data_input_fields = (
"lbl_word",
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"init_idx",
"trg_src_attn_bias", )
# if we use py_reader
use_py_reader = False
# if we run sync mode
sync = False
# how many batches we use
batch_num = 5
np.random.seed(90)  # call the seeder; assigning to np.random.seed silently does nothing
src_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape(
[TrainTaskConfig.batch_size, seq_len, 1]).astype('int64')
src_pos_np = np.random.randint(
1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
trg_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape(
[TrainTaskConfig.batch_size, seq_len, 1]).astype('int64')
trg_pos_np = np.random.randint(
1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
trg_src_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
lbl_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(TrainTaskConfig.batch_size * seq_len, 1),
dtype='int64')
lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len,
1).astype('float32')
pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
class PrePostProcessLayer(Layer):
def __init__(self, name_scope, process_cmd, shape_len=None):
super(PrePostProcessLayer, self).__init__(name_scope)
for cmd in process_cmd:
if cmd == "n":
self._layer_norm = LayerNorm(
name_scope=self.full_name(),
begin_norm_axis=shape_len - 1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out = self._layer_norm(out)
elif cmd == "d": # add dropout
if dropout_rate:
out = fluid.layers.dropout(
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
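The process_cmd mini-language interpreted by forward() above, spelled out as a standalone table (not Paddle code): each character selects one step, so preprocess_cmd="n" normalizes the input and postprocess_cmd="da" applies dropout and then the residual add.

def describe(process_cmd):
    names = {'a': 'residual add', 'n': 'layer norm', 'd': 'dropout'}
    return [names[c] for c in process_cmd]

assert describe('n') == ['layer norm']
assert describe('da') == ['dropout', 'residual add']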
class PositionwiseFeedForwardLayer(Layer):
def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
self._i2h = FC(name_scope=self.full_name(),
size=d_inner_hid,
num_flatten_dims=2,
act="relu")
self._h2o = FC(name_scope=self.full_name(),
size=d_hid,
num_flatten_dims=2)
self._dropout_rate = dropout_rate
def forward(self, x):
hidden = self._i2h(x)
if self._dropout_rate:
hidden = fluid.layers.dropout(
hidden,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = self._h2o(hidden)
return out
class MultiHeadAttentionLayer(Layer):
def __init__(self,
name_scope,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
gather_idx=None,
static_kv=False):
super(MultiHeadAttentionLayer, self).__init__(name_scope)
self._n_head = n_head
self._d_key = d_key
self._d_value = d_value
self._d_model = d_model
self._dropout_rate = dropout_rate
self._q_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._k_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._v_fc = FC(name_scope=self.full_name(),
size=d_value * n_head,
bias_attr=False,
num_flatten_dims=2)
self._proj_fc = FC(name_scope=self.full_name(),
size=self._d_model,
bias_attr=False,
num_flatten_dims=2)
def forward(self, queries, keys, values, attn_bias):
# compute q ,k ,v
keys = queries if keys is None else keys
values = keys if values is None else values
q = self._q_fc(queries)
k = self._k_fc(keys)
v = self._v_fc(values)
# split head
reshaped_q = fluid.layers.reshape(
x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
reshaped_k = fluid.layers.reshape(
x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
reshaped_v = fluid.layers.reshape(
x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scale dot product attention
product = fluid.layers.matmul(
x=transpose_q,
y=transpose_k,
transpose_y=True,
alpha=self._d_model**-0.5)
if attn_bias:
product += attn_bias
weights = fluid.layers.softmax(product)
if self._dropout_rate:
weights_droped = fluid.layers.dropout(
weights,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = fluid.layers.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
final_out = fluid.layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=False)
# fc to output
proj_out = self._proj_fc(final_out)
return proj_out
class EncoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderSubLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(), d_key, d_value, d_model, n_head,
attention_dropout)
self._postprocess_layer = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._postprocess_layer2 = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
def forward(self, enc_input, attn_bias):
pre_process_multihead = self._preprocess_layer(
None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
attn_output = self._multihead_attention_layer(pre_process_multihead,
None, None, attn_bias)
attn_output = self._postprocess_layer(enc_input, attn_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
pre_process2_output = self._preprocess_layer2(
None, attn_output, self._preprocess_cmd,
self._prepostprocess_dropout)
ffd_output = self._positionwise_feed_forward(pre_process2_output)
return self._postprocess_layer2(attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
class EncoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout
self._n_layer = n_layer
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
for i in range(n_layer):
self._encoder_sublayers.append(
self.add_sublayer(
'esl_%d' % i,
EncoderSubLayer(
self.full_name(), n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
def forward(self, enc_input, attn_bias):
for i in range(self._n_layer):
enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
enc_input = enc_output
return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
self._prepostprocess_dropout)
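# Note: since preprocess_cmd defaults to "n", the trailing _preprocess_layer
# call applies one final layer normalization to the stacked sub-layer output,
# the usual pre-norm Transformer arrangement.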
class PrepareEncoderDecoderLayer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
self._src_max_len = src_max_len
self._src_emb_dim = src_emb_dim
self._src_vocab_size = src_vocab_size
self._dropout_rate = dropout_rate
self._input_emb = Embedding(
name_scope=self.full_name(),
size=[src_vocab_size, src_emb_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
if pos_enc_param_name == pos_enc_param_names[0]:
pos_inp = pos_inp1
else:
pos_inp = pos_inp2
self._pos_emb = Embedding(
name_scope=self.full_name(),
size=[self._src_max_len, src_emb_dim],
param_attr=fluid.ParamAttr(
name=pos_enc_param_name,
initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
trainable=False))
# used in dygraph mode to handle batches of different lengths
# self._pos_emb._w = to_variable(
# position_encoding_init(self._src_max_len, self._src_emb_dim))
def forward(self, src_word, src_pos):
src_word_emb = self._input_emb(src_word)
src_word_emb = fluid.layers.scale(
x=src_word_emb, scale=self._src_emb_dim**0.5)
# TODO: change this to handle dynamic-length input
src_pos_emb = self._pos_emb(src_pos)
src_pos_emb.stop_gradient = True
enc_input = src_word_emb + src_pos_emb
return fluid.layers.dropout(
enc_input,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if self._dropout_rate else enc_input
class WrapEncoderLayer(Layer):
def __init__(self, name_scope, src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
postprocess_cmd, weight_sharing):
"""
The wrapper assembles together all needed layers for the encoder.
"""
super(WrapEncoderLayer, self).__init__(name_scope)
self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
src_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer(
self.full_name(), n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)
def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs
enc_input = self._prepare_encoder_layer(src_word, src_pos)
enc_output = self._encoder(enc_input, src_slf_attn_bias)
return enc_output
class DecoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None,
gather_idx=None):
super(DecoderSubLayer, self).__init__(name_scope)
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
self._prepostprcess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx)
self._post_process_layer = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer2 = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx,
static_kv=True)
self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
pre_process_rlt = self._pre_process_layer(
None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
None, slf_attn_bias)
slf_attn_output_pp = self._post_process_layer(
dec_input, slf_attn_output, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
self._preprocess_cmd,
self._prepostprcess_dropout)
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2(
slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd,
self._prepostprcess_dropout)
ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprcess_dropout)
return dec_output
class DecoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=None,
gather_idx=None):
super(DecoderLayer, self).__init__(name_scope)
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._decoder_sub_layers = list()
self._n_layer = n_layer
self._preprocess_cmd = preprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
for i in range(n_layer):
self._decoder_sub_layers.append(
self.add_sublayer(
'dsl_%d' % i,
DecoderSubLayer(
self.full_name(),
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None if caches is None else caches[i],
gather_idx=gather_idx)))
def forward(self, dec_input, enc_output, dec_slf_attn_bias,
dec_enc_attn_bias):
for i in range(self._n_layer):
tmp_dec_output = self._decoder_sub_layers[i](
dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
dec_input = tmp_dec_output
dec_output = self._pre_process_layer(None, tmp_dec_output,
self._preprocess_cmd,
self._prepostprocess_dropout)
return dec_output
class WrapDecoderLayer(Layer):
def __init__(self,
name_scope,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
caches=None,
gather_idx=None):
"""
The wrapper assembles together all needed layers for the decoder.
"""
super(WrapDecoderLayer, self).__init__(name_scope)
self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
trg_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer(
self.full_name(),
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=caches,
gather_idx=gather_idx)
self._weight_sharing = weight_sharing
if not weight_sharing:
self._fc = FC(self.full_name(),
size=trg_vocab_size,
bias_attr=False)
def forward(self, dec_inputs=None, enc_output=None):
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
dec_output = self._decoder_layer(dec_input, enc_output,
trg_slf_attn_bias, trg_src_attn_bias)
dec_output_reshape = fluid.layers.reshape(
dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
if self._weight_sharing:
predict = fluid.layers.matmul(
x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb._w,
transpose_y=True)
else:
predict = self._fc(dec_output_reshape)
if dec_inputs is None:
# Return probs for independent decoder program.
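# (This branch is unreachable here: dec_inputs was already unpacked above.)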
predict_out = fluid.layers.softmax(predict)
return predict_out
return predict
class TransFormer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
label_smooth_eps,
use_py_reader=False,
is_test=False):
super(TransFormer, self).__init__(name_scope)
self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size
if weight_sharing:
assert src_vocab_size == trg_vocab_size, (
"Vocabularies in source and target should be same for weight sharing."
)
self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(), src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
self._wrap_decoder_layer = WrapDecoderLayer(
self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
if weight_sharing:
self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
def forward(self, enc_inputs, dec_inputs, label, weights):
enc_output = self._wrap_encoder_layer(enc_inputs)
predict = self._wrap_decoder_layer(dec_inputs, enc_output)
if self._label_smooth_eps:
label_out = fluid.layers.label_smooth(
label=fluid.layers.one_hot(
input=label, depth=self._trg_vocab_size),
epsilon=self._label_smooth_eps)
cost = fluid.layers.softmax_with_cross_entropy(
logits=predict,
label=label_out,
soft_label=True if self._label_smooth_eps else False)
weighted_cost = cost * weights
sum_cost = fluid.layers.reduce_sum(weighted_cost)
token_num = fluid.layers.reduce_sum(weights)
token_num.stop_gradient = True
avg_cost = sum_cost / token_num
return sum_cost, avg_cost, predict, token_num
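# Loss sketch (a reading of the forward pass above): label_smooth relaxes
# the one-hot target to (1 - eps) * one_hot + eps / trg_vocab_size, the
# cross entropy is computed with soft_label=True, and sum_cost is divided by
# the number of non-padding target tokens (reduce_sum of weights) to obtain
# avg_cost.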
class TestDygraphTransformer(unittest.TestCase):
def test_transformer_float32(self):
seed = 90
with guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
if sync:
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
with fluid.default_main_program()._lr_schedule_guard():
learning_rate = lr_decay * TrainTaskConfig.learning_rate
optimizer = fluid.optimizer.Adam(
learning_rate=learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
dy_param_init = dict()
dy_param_updated = dict()
for i in range(batch_num):
enc_inputs, dec_inputs, label, weights = create_data()
dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
if i == 0:
for param in transformer.parameters():
dy_param_init[param.name] = param.numpy()
dy_avg_cost.backward()
optimizer.minimize(dy_avg_cost)
transformer.clear_gradients()
if i == batch_num - 1:
for param in transformer.parameters():
dy_param_updated[param.name] = param.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
all_inputs = make_all_inputs(data_input_names)
enc_inputs_len = len(encoder_data_input_fields)
dec_inputs_len = len(decoder_data_input_fields[:-1])
enc_inputs = all_inputs[0:enc_inputs_len]
dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
dec_inputs_len]
label = all_inputs[-2]
weights = all_inputs[-1]
static_param_updated = dict()
static_param_init = dict()
static_param_name_list = list()
static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
optimizer.minimize(static_avg_cost)
for param in transformer.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init[static_param_name_list[i]] = out[i]
static_sum_cost_value = None
static_avg_cost_value = None
static_predict_value = None
static_token_num_value = None
for i in range(batch_num):
feed_dict = create_feed_dict_list(create_data(True))
fetch_list = [
static_sum_cost, static_avg_cost, static_predict,
static_token_num
]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=fetch_list)
static_sum_cost_value = out[0]
static_avg_cost_value = out[1]
static_predict_value = out[2]
static_token_num_value = out[3]
if i == batch_num - 1:
for k in range(4, len(out)):
static_param_updated[static_param_name_list[k -
4]] = out[k]
self.assertTrue(
np.array_equal(static_avg_cost_value, dy_avg_cost.numpy()))
self.assertTrue(
np.array_equal(static_sum_cost_value, dy_sum_cost.numpy()))
self.assertTrue(
np.array_equal(static_predict_value, dy_predict.numpy()))
self.assertTrue(
np.array_equal(static_token_num_value, dy_token_num.numpy()))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(np.array_equal(value, dy_param_updated[key]))
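# Note: np.array_equal demands bit-for-bit agreement between the dygraph
# and static-graph runs; np.allclose with a tolerance is the conventional
# looser check when platform-specific kernels make exact equality too strict.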
if __name__ == '__main__':
unittest.main()
...@@ -20,12 +20,144 @@ from paddle.fluid import Embedding, LayerNorm, FC, Layer
from paddle.fluid.dygraph import to_variable, guard
from test_imperative_base import new_program_scope
from paddle.fluid import core
from test_imperative_transformer import TransFormer, TrainTaskConfig, ModelHyperParams
import numpy as np
import six
np.set_printoptions(suppress=True)
# Copied from models
class TrainTaskConfig(object):
# support both CPU and GPU now.
use_gpu = True
# the epoch number to train.
pass_num = 30
# the number of sequences contained in a mini-batch.
# deprecated, set batch_size in args.
batch_size = 32
# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied by the learning rate
# derived from the LearningRateScheduler to get the final learning rate.
learning_rate = 2.0
beta1 = 0.9
beta2 = 0.997
eps = 1e-9
# the parameters for learning rate scheduling.
warmup_steps = 8000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps = 0.1
# the directory for saving trained models.
model_dir = "trained_models"
# the directory for saving checkpoints.
ckpt_dir = "trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path = None
# the parameter to initialize the learning rate scheduler.
# It should be provided if use checkpoints, since the checkpoint doesn't
# include the training step counter currently.
start_step = 0
# the frequency to save trained models.
save_freq = 10000
class InferTaskConfig(object):
use_gpu = True
# the number of examples in one run for sequence generation.
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
output_bos = False
output_eos = False
output_unk = True
# the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model"
class ModelHyperParams(object):
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
# size of target word dictionary.
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
# index for <eos> token
eos_idx = 1
# index for <unk> token
unk_idx = 2
# max length of sequences, which decides the size of the position encoding table.
max_length = 4
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 2048
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of head used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rates of different modules.
prepostprocess_dropout = 0.1
attention_dropout = 0.1
relu_dropout = 0.1
# to process before each sub-layer
preprocess_cmd = "n" # layer normalization
# to process after each sub-layer
postprocess_cmd = "da" # dropout + residual connection
# random seed used in dropout for CE.
dropout_seed = None
# the flag indicating whether to share embedding and softmax weights.
# vocabularies in source and target should be the same for weight sharing.
weight_sharing = True
def merge_cfg_from_list(cfg_list, g_cfgs):
"""
Set the above global configurations using the cfg_list.
"""
assert len(cfg_list) % 2 == 0
for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
for g_cfg in g_cfgs:
if hasattr(g_cfg, key):
try:
value = eval(value)
except Exception: # for file path
pass
setattr(g_cfg, key, value)
break
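# Minimal usage sketch (hypothetical flag list, not from the original
# source). Calling it mutates the global config classes, so the helper is
# deliberately left uncalled by the test below.
def _demo_merge_cfg():
    merge_cfg_from_list(["batch_size", "16", "n_head", "4"],
                        [TrainTaskConfig, ModelHyperParams])
    assert TrainTaskConfig.batch_size == 16
    assert ModelHyperParams.n_head == 4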
def position_encoding_init(n_position, d_pos_vec):
"""
Generate the initial values for the sinusoid position encoding table.
"""
channels = d_pos_vec
position = np.arange(n_position)
num_timescales = channels // 2
log_timescale_increment = (np.log(float(1e4) / float(1)) /
(num_timescales - 1))
inv_timescales = np.exp(np.arange(
num_timescales)) * -log_timescale_increment
scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
0)
signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
position_enc = signal
return position_enc.astype("float32")
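# Sanity sketch (illustrative, not part of the original test): the table has
# one row per position, sine values in the first channels // 2 columns and
# cosine values in the rest, so position 0 encodes all zeros then all ones.
def _demo_position_encoding():
    pe = position_encoding_init(4, 8)
    assert pe.shape == (4, 8)
    assert np.allclose(pe[0, :4], 0.) and np.allclose(pe[0, 4:], 1.)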
def create_data(is_static=False):
if is_static:
return [
...@@ -208,6 +340,598 @@ lbl_word_np = np.random.randint(
lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len,
1).astype('float32')
pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
class PrePostProcessLayer(Layer):
def __init__(self, name_scope, process_cmd, shape_len=None):
super(PrePostProcessLayer, self).__init__(name_scope)
for cmd in process_cmd:
if cmd == "n":
self._layer_norm = LayerNorm(
name_scope=self.full_name(),
begin_norm_axis=shape_len - 1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out = self._layer_norm(out)
elif cmd == "d": # add dropout
if dropout_rate:
out = fluid.layers.dropout(
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
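# The process_cmd string is interpreted one character at a time: "a" adds
# the residual branch, "n" applies layer normalization, "d" applies dropout.
# The layers below pass preprocess_cmd="n" and postprocess_cmd="da", i.e.
# pre-norm followed by dropout plus residual, matching the comments in
# ModelHyperParams.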
class PositionwiseFeedForwardLayer(Layer):
def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
self._i2h = FC(name_scope=self.full_name(),
size=d_inner_hid,
num_flatten_dims=2,
act="relu")
self._h2o = FC(name_scope=self.full_name(),
size=d_hid,
num_flatten_dims=2)
self._dropout_rate = dropout_rate
def forward(self, x):
hidden = self._i2h(x)
if self._dropout_rate:
hidden = fluid.layers.dropout(
hidden,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = self._h2o(hidden)
return out
class MultiHeadAttentionLayer(Layer):
def __init__(self,
name_scope,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
gather_idx=None,
static_kv=False):
super(MultiHeadAttentionLayer, self).__init__(name_scope)
self._n_head = n_head
self._d_key = d_key
self._d_value = d_value
self._d_model = d_model
self._dropout_rate = dropout_rate
self._q_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._k_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._v_fc = FC(name_scope=self.full_name(),
size=d_value * n_head,
bias_attr=False,
num_flatten_dims=2)
self._proj_fc = FC(name_scope=self.full_name(),
size=self._d_model,
bias_attr=False,
num_flatten_dims=2)
def forward(self, queries, keys, values, attn_bias):
# compute q, k, v
keys = queries if keys is None else keys
values = keys if values is None else values
q = self._q_fc(queries)
k = self._k_fc(keys)
v = self._v_fc(values)
# split head
reshaped_q = fluid.layers.reshape(
x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
reshaped_k = fluid.layers.reshape(
x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
reshaped_v = fluid.layers.reshape(
x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scale dot product attention
product = fluid.layers.matmul(
x=transpose_q,
y=transpose_k,
transpose_y=True,
alpha=self._d_model**-0.5)
if attn_bias:
product += attn_bias
weights = fluid.layers.softmax(product)
if self._dropout_rate:
weights_droped = fluid.layers.dropout(
weights,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = fluid.layers.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
final_out = fluid.layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=False)
# fc to output
proj_out = self._proj_fc(final_out)
return proj_out
class EncoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderSubLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(), d_key, d_value, d_model, n_head,
attention_dropout)
self._postprocess_layer = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._postprocess_layer2 = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
def forward(self, enc_input, attn_bias):
pre_process_multihead = self._preprocess_layer(
None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
attn_output = self._multihead_attention_layer(pre_process_multihead,
None, None, attn_bias)
attn_output = self._postprocess_layer(enc_input, attn_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
pre_process2_output = self._preprocess_layer2(
None, attn_output, self._preprocess_cmd,
self._prepostprocess_dropout)
ffd_output = self._positionwise_feed_forward(pre_process2_output)
return self._postprocess_layer2(attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
class EncoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout
self._n_layer = n_layer
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
for i in range(n_layer):
self._encoder_sublayers.append(
self.add_sublayer(
'esl_%d' % i,
EncoderSubLayer(
self.full_name(), n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
def forward(self, enc_input, attn_bias):
for i in range(self._n_layer):
enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
enc_input = enc_output
return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
self._prepostprocess_dropout)
class PrepareEncoderDecoderLayer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
self._src_max_len = src_max_len
self._src_emb_dim = src_emb_dim
self._src_vocab_size = src_vocab_size
self._dropout_rate = dropout_rate
self._input_emb = Embedding(
name_scope=self.full_name(),
size=[src_vocab_size, src_emb_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
if pos_enc_param_name == pos_enc_param_names[0]:
pos_inp = pos_inp1
else:
pos_inp = pos_inp2
self._pos_emb = Embedding(
name_scope=self.full_name(),
size=[self._src_max_len, src_emb_dim],
param_attr=fluid.ParamAttr(
name=pos_enc_param_name,
initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
trainable=False))
# used in dygraph mode to handle batches of different lengths
# self._pos_emb._w = to_variable(
# position_encoding_init(self._src_max_len, self._src_emb_dim))
def forward(self, src_word, src_pos):
src_word_emb = self._input_emb(src_word)
src_word_emb = fluid.layers.scale(
x=src_word_emb, scale=self._src_emb_dim**0.5)
# TODO: change this to handle dynamic-length input
src_pos_emb = self._pos_emb(src_pos)
src_pos_emb.stop_gradient = True
enc_input = src_word_emb + src_pos_emb
return fluid.layers.dropout(
enc_input,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if self._dropout_rate else enc_input
class WrapEncoderLayer(Layer):
def __init__(self, name_scope, src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
postprocess_cmd, weight_sharing):
"""
The wrapper assembles together all needed layers for the encoder.
"""
super(WrapEncoderLayer, self).__init__(name_scope)
self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
src_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer(
self.full_name(), n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)
def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs
enc_input = self._prepare_encoder_layer(src_word, src_pos)
enc_output = self._encoder(enc_input, src_slf_attn_bias)
return enc_output
class DecoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None,
gather_idx=None):
super(DecoderSubLayer, self).__init__(name_scope)
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
self._prepostprcess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx)
self._post_process_layer = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer2 = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx,
static_kv=True)
self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
pre_process_rlt = self._pre_process_layer(
None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
None, slf_attn_bias)
slf_attn_output_pp = self._post_process_layer(
dec_input, slf_attn_output, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
self._preprocess_cmd,
self._prepostprcess_dropout)
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2(
slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd,
self._prepostprcess_dropout)
ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprcess_dropout)
return dec_output
class DecoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=None,
gather_idx=None):
super(DecoderLayer, self).__init__(name_scope)
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._decoder_sub_layers = list()
self._n_layer = n_layer
self._preprocess_cmd = preprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
for i in range(n_layer):
self._decoder_sub_layers.append(
self.add_sublayer(
'dsl_%d' % i,
DecoderSubLayer(
self.full_name(),
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None if caches is None else caches[i],
gather_idx=gather_idx)))
def forward(self, dec_input, enc_output, dec_slf_attn_bias,
dec_enc_attn_bias):
for i in range(self._n_layer):
tmp_dec_output = self._decoder_sub_layers[i](
dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
dec_input = tmp_dec_output
dec_output = self._pre_process_layer(None, tmp_dec_output,
self._preprocess_cmd,
self._prepostprocess_dropout)
return dec_output
class WrapDecoderLayer(Layer):
def __init__(self,
name_scope,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
caches=None,
gather_idx=None):
"""
The wrapper assembles together all needed layers for the decoder.
"""
super(WrapDecoderLayer, self).__init__(name_scope)
self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
trg_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer(
self.full_name(),
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=caches,
gather_idx=gather_idx)
self._weight_sharing = weight_sharing
if not weight_sharing:
self._fc = FC(self.full_name(),
size=trg_vocab_size,
bias_attr=False)
def forward(self, dec_inputs=None, enc_output=None):
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
dec_output = self._decoder_layer(dec_input, enc_output,
trg_slf_attn_bias, trg_src_attn_bias)
dec_output_reshape = fluid.layers.reshape(
dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
if self._weight_sharing:
predict = fluid.layers.matmul(
x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb._w,
transpose_y=True)
else:
predict = self._fc(dec_output_reshape)
if dec_inputs is None:
# Return probs for independent decoder program.
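# (This branch is unreachable here: dec_inputs was already unpacked above.)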
predict_out = fluid.layers.softmax(predict)
return predict_out
return predict
class TransFormer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
label_smooth_eps,
use_py_reader=False,
is_test=False):
super(TransFormer, self).__init__(name_scope)
self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size
if weight_sharing:
assert src_vocab_size == trg_vocab_size, (
"Vocabularies in source and target should be same for weight sharing."
)
self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(), src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
self._wrap_decoder_layer = WrapDecoderLayer(
self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
if weight_sharing:
self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
def forward(self, enc_inputs, dec_inputs, label, weights):
enc_output = self._wrap_encoder_layer(enc_inputs)
predict = self._wrap_decoder_layer(dec_inputs, enc_output)
if self._label_smooth_eps:
label_out = fluid.layers.label_smooth(
label=fluid.layers.one_hot(
input=label, depth=self._trg_vocab_size),
epsilon=self._label_smooth_eps)
cost = fluid.layers.softmax_with_cross_entropy(
logits=predict,
label=label_out,
soft_label=True if self._label_smooth_eps else False)
weighted_cost = cost * weights
sum_cost = fluid.layers.reduce_sum(weighted_cost)
token_num = fluid.layers.reduce_sum(weights)
token_num.stop_gradient = True
avg_cost = sum_cost / token_num
return sum_cost, avg_cost, predict, token_num
class TestDygraphTransformerSortGradient(unittest.TestCase):
def test_transformer_sort_gradient_float32(self):
...
...@@ -16,3 +16,4 @@ funcsigs
pyyaml
decorator
prettytable
objgraph