Commit e9233d1c authored by Jiabin Yang, committed by XiaoguangHu

Refactor dygraph (#19107)

* refactor dygraph,test=develop

* fix failed unittest,test=develop

* polish code,test=develop

* check windows ci error,test=develop
try to fix windows ci error by np.allclose,test=develop

* polish vlog and profiler, test=develop

* try to fix preceding ops order,test=develop

* test transformer in windows ci, test=develop

* use python c-api to speed up tracer.trace,test=develop

* test=develop, fix docker with paddle nccl problem

* test=develop, add ut for debug string and gradient_accumulator

* test=develop, add tests for layer/gradient_accumulator/prepared_op

* test=develop, fix compile error for test_prepared_op

* test=develop, add more ut for dygraph

* test=develop, create API.spec for dygraph api change

* test=develop, refactor name to make it easier to understand

* test=develop, refactor name to make it easier to understand

* test=develop, fix multi-gpu failure, add Tracer tests, change PADDLE_ENFORCE to PADDLE_ENFORCE_EQ

* test=develop, fix ut failed on parallel se-resnext

* test=develop, change one more PADDLE_ENFORCE
Parent dca9b6c5
......@@ -820,11 +820,11 @@ paddle.fluid.dygraph.TreeConv.state_dict (ArgSpec(args=['self', 'destination', '
paddle.fluid.dygraph.TreeConv.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
paddle.fluid.dygraph.TreeConv.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer ('paddle.fluid.dygraph.tracer.Tracer', ('document', '28d72409112111274c33e1f07229d5da'))
paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self', 'block'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.eval_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None
paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'op', 'inputs', 'outputs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None
paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'type', 'inputs', 'outputs', 'attrs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.trace_var (ArgSpec(args=['self', 'name', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
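The new `Tracer.trace` entries above are pybind11-generated docstrings: the method is now bound directly from C++, with one overload per device place. A minimal, self-contained sketch of how such per-place overloads are typically declared (all names here are illustrative stand-ins, not Paddle's actual binding code):

```cpp
// sketch_bindings.cc -- hypothetical pybind11 module with per-place overloads.
#include <pybind11/pybind11.h>
#include <string>

namespace py = pybind11;

struct CPUPlace {};
struct CUDAPlace {};

struct Tracer {
  // One C++ overload per place type; pybind11 dispatches on the Python
  // argument type at call time, producing the "1. trace(...) 2. trace(...)"
  // docstring shown in the spec above.
  void Trace(const std::string& type, const CPUPlace&, bool stop_gradient) {}
  void Trace(const std::string& type, const CUDAPlace&, bool stop_gradient) {}
};

PYBIND11_MODULE(sketch_core, m) {
  py::class_<CPUPlace>(m, "CPUPlace").def(py::init<>());
  py::class_<CUDAPlace>(m, "CUDAPlace").def(py::init<>());
  py::class_<Tracer>(m, "Tracer")
      .def(py::init<>())
      .def("trace", py::overload_cast<const std::string&, const CPUPlace&,
                                      bool>(&Tracer::Trace))
      .def("trace", py::overload_cast<const std::string&, const CUDAPlace&,
                                      bool>(&Tracer::Trace));
}
```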
......
cc_library(imperative_flag SRCS flags.cc DEPS gflags)
if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler)
cc_library(engine SRCS engine.cc)
cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits)
cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows var_type_traits layer)
cc_library(tracer SRCS tracer.cc DEPS layer engine)
cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator)
cc_library(imperative_profiler SRCS profiler.cc)
cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
endif()
add_subdirectory(tests)
......@@ -16,17 +16,12 @@
// Created by Jiabin on 2019-04-25.
//
#pragma once
#ifndef PADDLE_BACKWARDSTRATEGY_H
#define PADDLE_BACKWARDSTRATEGY_H
#endif // PADDLE_BACKWARDSTRATEGY_H
namespace paddle {
namespace imperative {
namespace detail {
class BackwardStrategy {
public:
struct BackwardStrategy {
/* DyGraph now supports two kinds of backward strategy: one is sorted-sum
* gradient, the other sums gradients as soon as they are created */
// TODO(jiabin): add more strategies when we support them
......
......@@ -14,40 +14,219 @@
#include "paddle/fluid/imperative/engine.h"
#include <mutex> // NOLINT
#include <algorithm>
#include <memory>
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace imperative {
static std::once_flag init_engine;
static Engine* engine;
void Engine::RunOp(paddle::imperative::OpBase* op,
const paddle::imperative::NameVarBaseMap& ins,
const paddle::imperative::NameVarBaseMap& outs,
const paddle::platform::Place& place) {
platform::RecordEvent event(op->Type());
op->Run(ins, outs);
}
class DummyEngine : public Engine {
public:
void Enqueue(Runnable* runnable) override {
queued_runnables_.push_back(runnable);
void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) {
backward_strategy_ = strategy;
const std::vector<OpBase*> ops = var->GradVarBase()->GradOps();
var->ClearGradOps();
if (ops.empty()) {
VLOG(3) << "Skip auto grad since there is no grad op for var: "
<< var->Name();
return;
} else {
bool valid = false;
for (const auto& op : ops) {
if (op) {
valid = true;
}
}
if (!valid) {
VLOG(3) << "Skip auto grad since all grad op of start VarBase is nullptr";
return;
}
}
init_ops_ = ops;
platform::RecordEvent record_event("Imperative Backward");
VLOG(3) << "start backward";
PADDLE_ENFORCE_EQ(var->HasGradVar(), true,
"Grad variable not exist for variable %s", var->Name());
size_t Size() const override { return queued_runnables_.size(); }
auto& fwd_var = var->Var().Get<framework::LoDTensor>();
auto* grad_var =
var->GradVarBase()->MutableVar()->GetMutable<framework::LoDTensor>();
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place());
grad_var->Resize(fwd_var.dims());
grad_var->mutable_data(fwd_var.place(), fwd_var.type());
operators::math::set_constant(*dev_ctx, grad_var, 1.0);
}
void Sync() override {
for (Runnable* l : queued_runnables_) {
LOG(INFO) << "running " << reinterpret_cast<void*>(l);
bool BasicEngine::CheckBackwardInputs(OpBase* op) {
for (auto& pair : op->GetInsMap()) {
for (auto& var : pair.second) {
if (var && !var->StopGradient()) {
return true;
}
}
queued_runnables_.clear();
}
return false;
}
void BasicEngine::PrepareGradAccumulators(OpBase* op) {
for (const auto& pair : op->GetOutsMap()) {
for (const auto& var : pair.second) {
if (!var) continue;
auto& accumulator = accumulators_[var.get()];
if (!accumulator) {
if (backward_strategy_.sorted_sum_gradient_) {
accumulator.reset(new SortedGradientAccumulator(var.get()));
} else {
accumulator.reset(new EagerGradientAccumulator(var.get()));
}
}
accumulator->IncreaseRefCnt();
VLOG(3) << "Prepare to acccumulate variable grad " << var->Name()
<< "with reference count " << accumulator->RefCnt();
}
}
}
void BasicEngine::PrepareDeps() {
PADDLE_ENFORCE_EQ(op_deps_.empty(), true, "Op deps must be initialized here");
PADDLE_ENFORCE_EQ(accumulators_.empty(), true,
"Accumulators must be initialized here");
std::queue<OpBase*> q;
std::unordered_set<OpBase*> visited;
for (const auto& init_op : init_ops_) {
q.push(init_op);
visited.insert(init_op);
}
private:
std::vector<Runnable*> queued_runnables_;
};
while (!q.empty()) {
auto* cur_op = q.front();
q.pop();
VLOG(3) << "Checking grads of op " << cur_op->Type();
Engine* GetEngine() {
std::call_once(init_engine, []() { engine = new DummyEngine(); });
return engine;
if (!CheckBackwardInputs(cur_op)) {
// TODO(zjl): clear ops that do not need grad before running autograd
VLOG(3) << "Stop checking preceding ops of " << cur_op->Type()
<< " because all of its backward inputs is stop_gradient=True";
continue;
}
PrepareGradAccumulators(cur_op);
auto& preceding_ops = cur_op->GradPendingOps();
for (auto* preceding_op : preceding_ops) {
PADDLE_ENFORCE_NOT_NULL(preceding_op);
++op_deps_[preceding_op];
if (visited.count(preceding_op) == 0) {
visited.insert(preceding_op);
q.push(preceding_op);
}
}
}
}
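PrepareDeps above is a reverse BFS over the grad-op graph that counts, for every op, how many successors must run before it may run. A standalone sketch of the same counting scheme (plain C++, stand-in Node type, no Paddle dependencies):

```cpp
#include <cstddef>
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node*> preceding;  // ops that produce this op's inputs
};

// Count how many times each node is reached from the start set; this is the
// number of dependencies that must be consumed before the node may run.
std::unordered_map<Node*, size_t> CountDeps(const std::vector<Node*>& starts) {
  std::unordered_map<Node*, size_t> deps;
  std::queue<Node*> q;
  std::unordered_set<Node*> visited(starts.begin(), starts.end());
  for (Node* s : starts) q.push(s);
  while (!q.empty()) {
    Node* cur = q.front();
    q.pop();
    for (Node* pre : cur->preceding) {
      ++deps[pre];  // one more consumer waits on `pre`
      if (visited.insert(pre).second) q.push(pre);
    }
  }
  return deps;
}
```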
void BasicEngine::SumGradient(OpBase* op, std::shared_ptr<VarBase> src,
VarBase* dst) {
auto iter = accumulators_.find(dst);
PADDLE_ENFORCE_EQ(iter != accumulators_.end(), true,
"Cannot find gradient of variable %s", dst->Name());
iter->second->Add(std::move(src), op->id());
}
void BasicEngine::Execute() {
PrepareDeps();
// Start executing the computation graph
std::queue<OpBase*> q;
for (const auto& init_op : init_ops_) {
q.push(init_op);
}
while (!q.empty()) {
OpBase* cur_op = q.front();
q.pop();
// Step 1: Run Backward
auto& bwd_ins = cur_op->GetInsMap();
auto& bwd_outs = cur_op->GetOutsMap();
NameVarBaseMap tmp_outs;
// A var may correspond to several grad vars in one op
std::unordered_map<VarBase*, std::vector<std::shared_ptr<VarBase>>> var_map;
size_t counter = 0;
for (auto& bwd_out : bwd_outs) {
auto& tmp_var_list = tmp_outs[bwd_out.first];
tmp_var_list.reserve(bwd_out.second.size());
for (auto& var : bwd_out.second) {
auto tmp_var = std::make_shared<VarBase>(
false, "Gtmp@" + std::to_string(counter++)); // Do not need grad
tmp_var_list.emplace_back(tmp_var);
if (var) {
var_map[var.get()].emplace_back(std::move(tmp_var));
var->ClearGradOps();
}
}
}
VLOG(3) << "Start to execute grad op " << cur_op->Type();
RunOp(cur_op, bwd_ins, tmp_outs, cur_op->place());
// Step 2: Sum Gradient
{
platform::RecordEvent record_event("merge_grads");
for (auto& var_pair : var_map) {
auto* dst_var = var_pair.first;
if (dst_var == nullptr) continue;
for (auto& src_var : var_pair.second) {
VLOG(3) << "Sum gradient of variable " << dst_var->Name()
<< " after op " << cur_op->Type();
SumGradient(cur_op, std::move(src_var), dst_var);
}
}
}
// Step 3: Collect ready ops
for (auto* preceding_op : cur_op->GradPendingOps()) {
PADDLE_ENFORCE_NOT_NULL(preceding_op);
auto iter = op_deps_.find(preceding_op);
if (iter == op_deps_.end()) {
continue;
}
VLOG(3) << "Found preceding op of " << cur_op->Type();
// An op is ready to run once its dep count drops to zero
if (--(iter->second) == 0) {
q.push(preceding_op);
VLOG(3) << "Push preceding op " << preceding_op->Type()
<< " into queue";
}
}
// Step 4: Delete op to collect unused variables
VLOG(3) << "Remove op after op " << cur_op->Type() << " runs";
RemoveOp(cur_op);
}
VLOG(3) << "Clean properties of BasicEngine";
CleanEngine();
}
} // namespace imperative
} // namespace paddle
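Execute then consumes those counts with a ready queue: run an op, merge its gradients, decrement each predecessor's count, and enqueue predecessors that reach zero. A continuation of the CountDeps sketch above (same stand-in Node type; Run() is a placeholder for executing one grad op):

```cpp
// Continues the CountDeps sketch above (reuses Node and CountDeps).
#include <queue>
#include <vector>

void Run(Node* /*op*/) { /* placeholder for running one grad op */ }

void ExecuteSketch(const std::vector<Node*>& starts) {
  auto deps = CountDeps(starts);
  std::queue<Node*> ready;
  for (Node* s : starts) ready.push(s);
  while (!ready.empty()) {
    Node* cur = ready.front();
    ready.pop();
    Run(cur);  // Steps 1-2 in the engine: run backward, sum gradients
    for (Node* pre : cur->preceding) {
      auto it = deps.find(pre);
      if (it == deps.end()) continue;  // pruned (e.g. all inputs stop_gradient)
      if (--(it->second) == 0) ready.push(pre);  // Step 3: op is now ready
    }
  }
}
```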
......@@ -16,24 +16,80 @@
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/imperative/layer.h"
namespace paddle {
namespace imperative {
struct Runnable {};
// It seems there is no need for Engine to be a
// singleton; we can have multiple engines running
// multiple graphs. For future use we may expose an
// interface to Python to support this.
class Engine {
public:
virtual ~Engine() {}
virtual ~Engine() = default;
virtual void Execute() = 0;
virtual void Init(VarBase* var, const detail::BackwardStrategy& strategy) = 0;
virtual void RunOp(imperative::OpBase* op, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, const platform::Place& place);
virtual void Enqueue(Runnable* runnable) = 0;
virtual void RemoveOp(OpBase* op) {
PADDLE_ENFORCE_NOT_NULL(op, "Cannot remove null op");
auto iter = grad_ops_.find(op);
PADDLE_ENFORCE_EQ(iter != grad_ops_.end(), true, "Op is not inside tracer");
grad_ops_.erase(iter);
}
virtual size_t Size() const = 0;
void InsertOp(OpBase* op, std::shared_ptr<OpBase> op_shared) {
grad_ops_[op] = std::move(op_shared);
}
void Clear() { grad_ops_.clear(); }
virtual void Sync() = 0;
private:
std::unordered_map<OpBase*, std::shared_ptr<OpBase>>
grad_ops_;  // raw grad-op pointer -> owning shared_ptr, released by RemoveOp
};
Engine* GetEngine();
class BasicEngine : public Engine {
public:
BasicEngine() = default;
void Init(VarBase* var, const detail::BackwardStrategy& strategy) override;
~BasicEngine() override = default;
void Execute() override;
private:
void PrepareDeps();
bool CheckBackwardInputs(OpBase* op);
void PrepareGradAccumulators(OpBase* op);
void SumGradient(OpBase* op, std::shared_ptr<VarBase> src, VarBase* dst);
// TODO(jiabin): maybe we can optimize the performance of the engine by
// caching the result
void CleanEngine() {
init_ops_.clear();
op_deps_.clear();
accumulators_.clear();
Clear();
}
std::vector<OpBase*> init_ops_;
detail::BackwardStrategy backward_strategy_;
std::unordered_map<OpBase*, size_t> op_deps_;
std::unordered_map<VarBase*, std::unique_ptr<GradientAccumulator>>
accumulators_;
};
} // namespace imperative
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace imperative {
template <typename T>
class TensorAddFunctor : public boost::static_visitor<> {
public:
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const platform::CUDAPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
}
private:
int64_t numel_;
const T* x_;
T* y_;
};
void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
auto& src_tensor = src.Get<framework::LoDTensor>();
auto numel = src_tensor.numel();
// FIXME(minqiyang): loss_grad op will pass a zero grad of label
// ugly fix for it
if (numel == 0) {
return;
}
PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true,
"dst_numel %d vs. src_numel %d", dst_tensor->numel(),
numel);
auto data_type = src_tensor.type();
auto place = src_tensor.place();
#define PADDLE_TENSOR_ADD_MACRO(cpp_type) \
if (data_type == framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
boost::apply_visitor(func, place); \
return; \
}
PADDLE_TENSOR_ADD_MACRO(float);
PADDLE_TENSOR_ADD_MACRO(double);
#undef PADDLE_TENSOR_ADD_MACRO
PADDLE_THROW("Not supported data type %s for AddTo",
framework::DataTypeToString(data_type));
}
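PADDLE_TENSOR_ADD_MACRO above is a type-dispatch ladder: compare the runtime dtype against each supported element type, run the typed functor, and return early on a match. A self-contained sketch of the same pattern (the DType enum and AxpyAdd helper are illustrative stand-ins, not Paddle's types):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

enum class DType { kFP32, kFP64 };

template <typename T>
void AxpyAdd(int64_t n, const void* x, void* y) {
  const T* xs = static_cast<const T*>(x);
  T* ys = static_cast<T*>(y);
  for (int64_t i = 0; i < n; ++i) ys[i] += xs[i];  // y += 1.0 * x, like blas.AXPY
}

// Same shape as PADDLE_TENSOR_ADD_MACRO: one branch per supported type,
// early return on a match, hard error otherwise.
#define TENSOR_ADD_MACRO(cpp_type, dtype_tag) \
  if (dtype == DType::dtype_tag) {            \
    AxpyAdd<cpp_type>(numel, src, dst);       \
    return;                                   \
  }

void TensorAddSketch(DType dtype, int64_t numel, const void* src, void* dst) {
  TENSOR_ADD_MACRO(float, kFP32);
  TENSOR_ADD_MACRO(double, kFP64);
  assert(false && "unsupported dtype for AddTo");
}
#undef TENSOR_ADD_MACRO

int main() {
  float a[2] = {1.f, 2.f}, b[2] = {10.f, 20.f};
  TensorAddSketch(DType::kFP32, 2, a, b);
  std::printf("%g %g\n", b[0], b[1]);  // prints: 11 22
}
```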
void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
if (cur_cnt_ == 0) {
*dst_var = std::move(*(var->MutableVar()));
} else {
TensorAdd(var->Var(), dst_var);
}
++cur_cnt_;
}
void SortedGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
if (ref_cnt_ == 1) {
*dst_var = std::move(*(var->MutableVar()));
} else {
if (tmp_grad_vars_.empty()) {
tmp_grad_vars_.reserve(ref_cnt_);
}
tmp_grad_vars_.emplace_back(std::move(var), trace_id);
if (tmp_grad_vars_.size() != ref_cnt_) {
return;
}
std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(),
[](const std::pair<std::shared_ptr<VarBase>, size_t>& p1,
const std::pair<std::shared_ptr<VarBase>, size_t>& p2) {
return p1.second > p2.second;
});
*dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) {
TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var);
}
tmp_grad_vars_.clear();
}
}
} // namespace imperative
} // namespace paddle
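The two accumulators differ only in when and in what order contributions are reduced: Eager adds each incoming gradient immediately (the first is moved, the rest go through TensorAdd), while Sorted buffers (grad, trace_id) pairs and reduces them in descending trace_id order once all ref_cnt_ contributions have arrived. A minimal scalar sketch of the difference (plain C++, doubles standing in for tensors):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

// Eager: fold each contribution in as soon as it arrives.
struct EagerSum {
  double value = 0.0;
  void Add(double grad, size_t /*trace_id*/) { value += grad; }
};

// Sorted: buffer everything, then reduce in a deterministic order once the
// expected number of contributions (ref_cnt_) has arrived.
struct SortedSum {
  explicit SortedSum(size_t ref_cnt) : ref_cnt_(ref_cnt) {}
  void Add(double grad, size_t trace_id) {
    buffer_.emplace_back(grad, trace_id);
    if (buffer_.size() != ref_cnt_) return;  // wait for all contributions
    std::sort(buffer_.begin(), buffer_.end(),
              [](const std::pair<double, size_t>& a,
                 const std::pair<double, size_t>& b) {
                return a.second > b.second;  // descending trace_id
              });
    for (const auto& p : buffer_) value += p.first;
    buffer_.clear();
  }
  double value = 0.0;
  size_t ref_cnt_;
  std::vector<std::pair<double, size_t>> buffer_;
};

int main() {
  EagerSum eager;
  SortedSum sorted(2);
  eager.Add(0.5, 7);  eager.Add(0.25, 3);
  sorted.Add(0.5, 7); sorted.Add(0.25, 3);
  std::printf("%g %g\n", eager.value, sorted.value);  // 0.75 0.75
}
```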
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
namespace paddle {
namespace imperative {
class GradientAccumulator {
public:
explicit GradientAccumulator(VarBase* var) : var_(var) {}
virtual void Add(std::shared_ptr<VarBase> var, size_t trace_id) = 0;
virtual ~GradientAccumulator() = default;
inline void IncreaseRefCnt() { ++ref_cnt_; }
inline size_t RefCnt() const { return ref_cnt_; }
protected:
VarBase* var_;
size_t ref_cnt_{0};
};
class EagerGradientAccumulator : public GradientAccumulator {
public:
using GradientAccumulator::GradientAccumulator;
void Add(std::shared_ptr<VarBase> var, size_t trace_id) override;
private:
size_t cur_cnt_{0};
};
class SortedGradientAccumulator : public GradientAccumulator {
public:
using GradientAccumulator::GradientAccumulator;
void Add(std::shared_ptr<VarBase> var, size_t trace_id) override;
private:
std::vector<std::pair<std::shared_ptr<VarBase>, size_t>> tmp_grad_vars_;
};
} // namespace imperative
} // namespace paddle
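How the engine drives this interface, per the engine.cc changes above: during dependency preparation it calls IncreaseRefCnt() once per expected gradient contribution, and during execution it calls Add() once per produced gradient. A condensed usage sketch mirroring BasicEngine (the two free functions here are illustrative, not Paddle API):

```cpp
#include <memory>
#include <unordered_map>
#include <utility>

// Hypothetical driver code over the interface declared above.
std::unordered_map<VarBase*, std::unique_ptr<GradientAccumulator>> accumulators_;

void ExpectGrad(VarBase* var, bool sorted) {
  auto& acc = accumulators_[var];
  if (!acc) {
    if (sorted)
      acc.reset(new SortedGradientAccumulator(var));
    else
      acc.reset(new EagerGradientAccumulator(var));
  }
  acc->IncreaseRefCnt();  // one more grad contribution is expected
}

void DeliverGrad(VarBase* var, std::shared_ptr<VarBase> grad, size_t trace_id) {
  accumulators_.at(var)->Add(std::move(grad), trace_id);  // reduces when ready
}
```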
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -13,27 +13,21 @@
// limitations under the License.
#include "paddle/fluid/imperative/layer.h"
#include <algorithm>
#include <deque>
#include <limits>
#include <map>
#include <random>
#include <unordered_set>
#include <queue>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/imperative/prepared_operator.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace imperative {
using framework::Variable;
void ThreadSafeNameSet::Insert(const std::string& name) {
std::lock_guard<std::mutex> guard(mtx_);
set_.insert(name);
......@@ -42,7 +36,7 @@ void ThreadSafeNameSet::Insert(const std::string& name) {
void ThreadSafeNameSet::Remove(const std::string& name) {
std::lock_guard<std::mutex> guard(mtx_);
auto iter = set_.find(name);
PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name);
PADDLE_ENFORCE_EQ(iter != set_.end(), true, "%s does not exist", name);
set_.erase(iter);
}
......@@ -55,222 +49,161 @@ ThreadSafeNameSet VarBase::name_set_;
std::vector<std::string> VarBase::AliveVarNames() { return name_set_.Names(); }
using framework::Variable;
namespace detail {
template <typename T>
class TensorAddToFunctor : public boost::static_visitor<> {
public:
TensorAddToFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
static framework::VariableNameMap CreateVarNameMap(
const framework::OpInfo& op_info, const std::string& op_type,
const NameVarBaseMap& varbase_map, bool is_input) {
if (op_info.proto_ == nullptr) {
return {};
}
#ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const platform::CUDAPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
framework::VariableNameMap result;
for (auto& var :
is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) {
auto it = varbase_map.find(var.name());
if (it == varbase_map.end()) {
PADDLE_ENFORCE_EQ(
var.dispensable(), true,
"Var: %s not dispensable and there are no such var in inputs",
var.name());
result[var.name()] = {};
} else {
auto& var_vector = it->second;
std::vector<std::string> args;
args.reserve(var_vector.size());
for (auto& var_base : var_vector) {
args.emplace_back(var_base->Name());
}
result[var.name()] = std::move(args);
}
}
#endif
return result;
}
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
static framework::RuntimeContext PrepareRuntimeContext(
const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
framework::VariableValueMap inputs, outputs;
for (auto& in_pair : ins) {
auto& in_ctx = inputs[in_pair.first];
in_ctx.reserve(in_pair.second.size());
for (auto& in_var : in_pair.second) {
in_ctx.emplace_back(in_var->MutableVar());
}
}
private:
int64_t numel_;
const T* x_;
T* y_;
};
} // namespace detail
void AddTo(std::shared_ptr<VarBase> src, std::shared_ptr<VarBase> dst,
platform::Place place, GradientRef* grad_ref) {
PADDLE_ENFORCE(grad_ref->find(dst.get()) != grad_ref->end(),
"gradient %s are not found in grad_ref", dst->Name());
if ((*grad_ref)[dst.get()].second) {
PADDLE_ENFORCE(src->IsInitialize(), "Using uninitialized VarBase");
dst->var_ = std::move(src->var_);
(*grad_ref)[dst.get()].second = false;
if (!dst->IsInitialize()) {
dst->SetInitialize(true);
}
return;
} else {
framework::Tensor* dst_tensor =
dst->var_->GetMutable<framework::LoDTensor>();
framework::Tensor* src_tensor =
src->var_->GetMutable<framework::LoDTensor>();
// FIXME(minqiyang): loss_grad op will pass a zero grad of label
// ugly fix for it
if (src_tensor->numel() == 0) {
return;
for (auto& out_pair : outs) {
auto& out_ctx = outputs[out_pair.first];
out_ctx.reserve(out_pair.second.size());
for (auto& out_var : out_pair.second) {
out_ctx.emplace_back(out_var->MutableVar());
}
}
return framework::RuntimeContext(std::move(inputs), std::move(outputs));
}
static std::string DebugString(
const std::string& name,
const std::vector<std::shared_ptr<VarBase>>& vars) {
std::stringstream ss;
ss << name << "{";
PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(),
"dst_numel %lld vs. src_numel %lld", dst_tensor->numel(),
src_tensor->numel());
for (size_t i = 0; i < vars.size(); ++i) {
if (i > 0) ss << ", ";
detail::TensorAddToFunctor<float> func(
src_tensor->numel(), src_tensor->data<float>(),
dst_tensor->mutable_data<float>(place));
boost::apply_visitor(func, place);
if (vars[i] == nullptr) {
ss << "NULL";
continue;
}
ss << vars[i]->Name() << "[";
auto& var = vars[i]->Var();
if (!var.IsInitialized()) {
ss << "NOT_INITED_VAR";
} else if (var.IsType<framework::LoDTensor>()) {
auto& tensor = var.Get<framework::LoDTensor>();
ss << "LoDTensor<";
if (tensor.IsInitialized()) {
ss << framework::DataTypeToString(tensor.type()) << ", ";
ss << tensor.place() << ", ";
ss << "(" << tensor.dims() << ")";
} else {
ss << "NOT_INITED";
}
ss << ">";
} else {
ss << "UNRESOLVED_TYPE";
}
ss << "]";
}
}
void ZeroGrads(const std::shared_ptr<imperative::VarBase> vb,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
auto grad_t = vb->var_->GetMutable<framework::LoDTensor>();
operators::math::set_constant(*dev_ctx, grad_t, 0.0);
ss << "}";
return ss.str();
}
void AddGradBySort(BackwardSumMap* bck_map,
std::shared_ptr<imperative::VarBase> target,
GradientRef* grad_ref) {
PADDLE_ENFORCE(bck_map->find(target.get()) != bck_map->end(),
"Can't find %s in backward grad map", target->Name());
std::pair<platform::Place,
std::vector<std::pair<int, std::shared_ptr<imperative::VarBase>>>>&
current = bck_map->at(target.get());
std::sort(current.second.begin(), current.second.end(),
[](const std::pair<int, std::shared_ptr<imperative::VarBase>>& a,
const std::pair<int, std::shared_ptr<imperative::VarBase>>& b) {
return a.first > b.first;
});
for (auto& var_pair : current.second) {
VLOG(10) << "add origin_grad: " << target->Name();
VLOG(10) << "added grad: " << var_pair.second->Name()
<< " trace id is: " << var_pair.first;
AddTo(var_pair.second, target, current.first, grad_ref);
var_pair.second.reset();
std::string LayerDebugString(const std::string& op_type,
const NameVarBaseMap& ins,
const NameVarBaseMap& outs) {
std::stringstream ss;
ss << "Op(" << op_type << "): ";
ss << "Inputs: ";
size_t i = 0;
for (auto& pair : ins) {
if (i > 0) ss << ", ";
ss << DebugString(pair.first, pair.second);
++i;
}
}
class Autograd {
public:
Autograd() {}
ss << ", Outputs: ";
i = 0;
for (auto& pair : outs) {
if (i > 0) ss << ", ";
ss << DebugString(pair.first, pair.second);
++i;
}
return ss.str();
}
void RunBackward(VarBase* var, const detail::BackwardStrategy& bck_stratedy) {
if (var->IsStopGradient()) {
void VarBase::AddGradOps(const std::weak_ptr<OpBase>& op) {
if (op.lock() == nullptr) {
return;
}
for (const auto& cur_op : grad_ops_) {
if (cur_op.lock() == op.lock()) {
return;
}
VLOG(2) << "start autograd";
BackwardSumMap bck_map;
std::deque<OpBase*> ready;
ready.push_back(var->PreOp());
std::map<OpBase*, int> dep_counts =
ComputeDepCounts(var->PreOp(), bck_stratedy, &grad_ref);
while (!ready.empty()) {
OpBase* ready_op = ready.front();
ready.pop_front();
std::vector<VarBasePtrMap> grads_outputs =
ready_op->ApplyGrad(&bck_map, &grad_ref, bck_stratedy);
for (const auto& map : grads_outputs) {
for (auto it = map.rbegin(); it != map.rend(); ++it) {
const std::vector<std::shared_ptr<VarBase>>& grad_outs = it->second;
for (size_t i = 0; i < grad_outs.size(); ++i) {
if (!grad_outs[i] || grad_outs[i]->IsStopGradient()) continue;
OpBase* pre_op = grad_outs[i]->PreOp();
if (!pre_op) continue;
dep_counts[pre_op] -= 1;
PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
bool pre_op_ready = dep_counts[pre_op] == 0;
if (pre_op_ready) {
ready.push_back(pre_op);
}
}
}
}
ready_op->InvokeBackwardHooks();
}
}
grad_ops_.emplace_back(op);
}
private:
std::map<OpBase*, int> ComputeDepCounts(
OpBase* op, const detail::BackwardStrategy& bck_stratedy,
GradientRef* grad_ref) {
if (bck_stratedy.sorted_sum_gradient_) {
PADDLE_ENFORCE_NOT_NULL(grad_ref,
"grad_ref should not be null when "
"using sorted grad backward strategy");
}
std::map<OpBase*, int> ret;
std::deque<OpBase*> queue;
queue.push_back(op);
std::unordered_set<OpBase*> visited;
visited.insert(op);
while (!queue.empty()) {
OpBase* candidate = queue.front();
queue.pop_front();
for (const auto& map : candidate->grad_output_vars_) {
for (const auto& it : map) {
for (const auto& vb : it.second) {
if (bck_stratedy.sorted_sum_gradient_) {
++(*grad_ref)[vb.get()].first;
}
// init the state of the grad_
(*grad_ref)[vb.get()].second = true;
}
}
}
for (auto it : candidate->pre_ops_) {
for (OpBase* pre_op : it.second) {
if (!pre_op) continue;
VLOG(2) << "op dep " << candidate->Type() << " trace id "
<< candidate->trace_id_ << " <---- " << it.first << " <---- "
<< pre_op->Type() << " trace id " << pre_op->trace_id_;
if (visited.find(pre_op) == visited.end()) {
visited.insert(pre_op);
queue.push_back(pre_op);
}
ret[pre_op] += 1;
}
}
void VarBase::ClearGradient() {
if (grad_var_) {
auto* grad_t = grad_var_->var_.GetMutable<framework::LoDTensor>();
if (grad_t->IsInitialized()) {
auto* dev_ctx =
platform::DeviceContextPool::Instance().Get(grad_t->place());
operators::math::set_constant(*dev_ctx, grad_t, 0.0);
}
return ret;
}
}
GradientRef grad_ref;
};
std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
const bool blocking) const {
PADDLE_ENFORCE(var_->IsInitialized(),
"Variable must be initialized when getting numpy tensor");
// TODO(minqiyang): change this after move unique_name generator to CXX
const framework::LoDTensor& self_tensor = var_->Get<framework::LoDTensor>();
std::unique_ptr<VarBase> new_var(new VarBase(
"Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false));
framework::LoDTensor* tensor =
new_var->var_->GetMutable<framework::LoDTensor>();
tensor->set_lod(var_->Get<framework::LoDTensor>().lod());
const auto& src_tensor = var_->Get<framework::LoDTensor>();
framework::TensorCopy(src_tensor, dst_place, tensor);
PADDLE_ENFORCE_EQ(var_.IsInitialized() && var_.IsType<framework::LoDTensor>(),
true,
"Variable must be initialized and type of LoDTensor when "
"getting numpy tensor");
auto& src_tensor = var_.Get<framework::LoDTensor>();
// TODO(Jiabin): change this after move unique_name generator to CXX
auto new_var = std::make_shared<VarBase>(
false, "Itmp" + std::to_string(copied_counter_++));
auto* dst_tensor = new_var->var_.GetMutable<framework::LoDTensor>();
dst_tensor->set_lod(src_tensor.lod());
framework::TensorCopy(src_tensor, dst_place, dst_tensor);
if (blocking) {
platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
auto src_place = src_tensor.place();
......@@ -285,184 +218,66 @@ std::unique_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
return new_var;
}
// create OpBase from op type
OpBase::OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place)
: id_(id), place_(place) {
const auto& info = framework::OpInfoMap::Instance().Get(type);
// Step 1: Run forward
if (info.Checker() != nullptr) {
info.Checker()->Check(&attrs);
}
framework::LoDTensor& VarBase::GradValue() {
VLOG(3) << "get var grad " << Name();
PADDLE_ENFORCE_NOT_NULL(grads_,
"Could not get grad value from no grad variable");
return *(grads_->var_->GetMutable<framework::LoDTensor>());
auto input_name_map = CreateVarNameMap(info, type, ins, true);
auto output_name_map = CreateVarNameMap(info, type, outs, false);
op_ = framework::OpRegistry::CreateOp(type, std::move(input_name_map),
std::move(output_name_map),
std::move(attrs));
VLOG(3) << "Construct Op: " << type << std::endl;
}
std::vector<VarBasePtrMap> OpBase::ApplyGrad(
BackwardSumMap* bck_map, GradientRef* grad_ref,
const detail::BackwardStrategy& bck_stratedy) {
PADDLE_ENFORCE(!grad_op_descs_.empty(), "%s has no backward implementation",
Type());
VLOG(3) << "apply op grad: " << Type();
std::vector<VarBasePtrMap> tmp_grad_outputs;
const size_t grad_op_count = grad_op_descs_.size();
tmp_grad_outputs.resize(grad_op_count);
for (size_t k = 0; k < grad_op_count; ++k) {
framework::OpDesc* grad_op_desc = grad_op_descs_[k];
platform::RecordEvent record_event(grad_op_desc->Type());
auto& grad_output_variable_map = grad_output_vars_[k];
VLOG(3) << "apply grad op " << grad_op_desc->Type();
// Allocate tmp grad output variable
for (const auto& it : grad_output_variable_map) {
auto& outputs = tmp_grad_outputs[k][it.first];
outputs.reserve(it.second.size());
for (const std::shared_ptr<imperative::VarBase>& origin_grad_var_base :
it.second) {
// Allocate a new variable
std::shared_ptr<imperative::VarBase> tmp_grad_var_base(new VarBase(
string::Sprintf("%s@IGrad", origin_grad_var_base->Name()),
origin_grad_var_base->DataType(), origin_grad_var_base->Dims(),
place_, true, false));
outputs.emplace_back(std::move(tmp_grad_var_base));
}
}
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
// grad_op_desc->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc);
auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type());
if (info.infer_var_type_) {
RuntimeInferVarTypeContext infer_var_type_ctx(
&grad_input_vars_[k], &tmp_grad_outputs[k], &(opbase->Attrs()));
info.infer_var_type_(&infer_var_type_ctx);
}
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
// Run grad op
framework::VariableValueMap grad_invars_map;
framework::VariableValueMap grad_outvars_map;
for (const auto& it : grad_input_vars_[k]) {
auto& grad_invars = grad_invars_map[it.first];
grad_invars.reserve(it.second.size());
for (const std::shared_ptr<imperative::VarBase>& grad_inp : it.second) {
PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr",
grad_op_desc->Type(), grad_inp->Name());
if (!grad_inp->IsInitialize()) {
grad_inp->InitBuffer();
ZeroGrads(grad_inp, place_);
}
const std::shared_ptr<imperative::VarBase>& const_grad_inp = grad_inp;
grad_invars.emplace_back(const_grad_inp->var_.get());
}
}
for (const auto& it : tmp_grad_outputs[k]) {
auto& grad_outvars = grad_outvars_map[it.first];
grad_outvars.reserve(it.second.size());
for (const std::shared_ptr<imperative::VarBase>& grad_out : it.second) {
PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr",
grad_op_desc->Type(), grad_out->Name());
grad_outvars.emplace_back(grad_out->var_.get());
}
}
// create OpBase from OpDesc
OpBase::OpBase(size_t id, const framework::OpDesc& op_desc,
const platform::Place& place)
: id_(id), op_(framework::OpRegistry::CreateOp(op_desc)), place_(place) {
VLOG(3) << "Construct Op: " << op_desc.Type() << std::endl;
}
framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map);
framework::Scope scope;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
p.op.RuntimeInferShape(scope, place_, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx,
p.kernel_configs));
void OpBase::Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
auto* op_kernel = dynamic_cast<framework::OperatorWithKernel*>(op_.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
auto& info = op_->Info();
if (info.infer_var_type_) {
RuntimeInferVarTypeContext infer_var_type_ctx(ins, &outs, op_->Attrs());
info.infer_var_type_(&infer_var_type_ctx);
}
platform::RecordEvent record_event("merge_grads");
// Add tmp grad outputs to original grad vars
for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (const auto& it : grad_output_vars_[k]) {
auto& outputs = tmp_grad_outputs[k][it.first];
const auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) {
// track outputs used by sum
if (bck_stratedy.sorted_sum_gradient_) {
if (bck_map->find(origin_outputs[i].get()) != bck_map->end()) {
VLOG(10) << "add sub grad to " << origin_outputs[i]->Name();
bck_map->at(origin_outputs[i].get())
.second.emplace_back(
std::pair<int, std::shared_ptr<imperative::VarBase>>(
this->trace_id_, std::move(outputs[i])));
} else {
VLOG(10) << "insert new map for " << origin_outputs[i]->Name();
std::pair<platform::Place,
std::vector<
std::pair<int, std::shared_ptr<imperative::VarBase>>>>
tmp(place_,
{std::make_pair(this->trace_id_, std::move(outputs[i]))});
bck_map->insert(std::make_pair(origin_outputs[i].get(), tmp));
}
PADDLE_ENFORCE(
grad_ref->find(origin_outputs[i].get()) != grad_ref->end(),
"Can't find %s in grad_reference count map",
origin_outputs[i]->Name());
PADDLE_ENFORCE(grad_ref->at(origin_outputs[i].get()).first >= 1,
"Backward error when calculate grad reference");
if (grad_ref->at(origin_outputs[i].get()).first > 1) {
VLOG(10) << "remove ref for " << origin_outputs[i]->Name();
grad_ref->at(origin_outputs[i].get()).first--;
} else {
VLOG(10) << "Add grad for: " << origin_outputs[i]->Name();
AddGradBySort(bck_map, origin_outputs[i], grad_ref);
grad_ref->at(origin_outputs[i].get()).first--;
}
} else {
VLOG(10) << "AddTo Called with orig_grad is: "
<< origin_outputs[i]->name_ << " Grad to be added is "
<< outputs[i]->name_;
AddTo(outputs[i], origin_outputs[i], place_, grad_ref);
outputs[i].reset();
}
}
// Initialize output var type
for (auto& var_pair : outs) {
for (auto& var : var_pair.second) {
InitializeVariable(var->MutableVar(), var->Type());
}
}
return grad_output_vars_;
}
VLOG(3) << "Running Op " << Type();
VLOG(5) << LayerDebugString(Type(), ins, outs);
auto runtime_ctx = PrepareRuntimeContext(ins, outs);
auto runtime_place = PreparedOp::GetExpectedPlace(place(), ins);
void OpBase::InvokeBackwardHooks() {
VLOG(3) << "call backward hooks, hooks num: " << backward_hooks_.size();
// call backward hooks
for (py::object& callable : backward_hooks_) {
callable(this);
}
}
auto prepared_op =
PreparedOp::Prepare(runtime_ctx, *op_kernel, runtime_place);
void OpBase::RegisterBackwardHooks(const py::object& callable) {
VLOG(3) << "Register backward hooks " << trace_id_;
prepared_op.Run();
// TODO(minqiyang): check the callable format
backward_hooks_.push_back(callable);
VLOG(4) << LayerDebugString(Type(), ins, outs);
}
void VarBase::RunBackward(const detail::BackwardStrategy& bck_stratedy) {
if (!pre_op_) return;
platform::RecordEvent record_event("Imperative Backward");
VLOG(3) << "start backward";
grads_->InitBuffer();
auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
operators::math::set_constant(
*(platform::DeviceContextPool::Instance().Get(
var_->GetMutable<framework::LoDTensor>()->place())),
grads_t, 1.0);
Autograd().RunBackward(this, bck_stratedy);
void OpBase::ClearBackwardTrace() {
grad_pending_ops_.clear();
ins_.clear();
outs_.clear();
}
} // namespace imperative
......
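ClearBackwardTrace above drops the op's strong references (ins_, outs_, grad_pending_ops_) once its gradient has been consumed. This matters because ownership is deliberately asymmetric: VarBase holds only weak_ptr<OpBase> (see grad_ops_ in the header below), while OpBase holds shared_ptr<VarBase>, so clearing the op side frees intermediate variables without reference cycles. A tiny stand-in sketch of that ownership shape:

```cpp
#include <memory>
#include <vector>

struct Op;

struct Var {
  // Weak back-references: a Var never keeps its grad ops alive.
  std::vector<std::weak_ptr<Op>> grad_ops;
};

struct Op {
  // Strong forward references: an op keeps its inputs/outputs alive
  // until its gradient has been consumed.
  std::vector<std::shared_ptr<Var>> ins, outs;
  void ClearBackwardTrace() {
    ins.clear();   // after this, unreferenced intermediate Vars are freed
    outs.clear();
  }
};
```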
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -13,8 +13,10 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <list>
#include <map> // NOLINT
#include <memory> // NOLINT
#include <mutex> // NOLINT
......@@ -22,95 +24,19 @@
#include <string> // NOLINT
#include <unordered_map> // NOLINT
#include <utility>
#include <vector> // NOLINT
// clang-format off
#include "paddle/fluid/framework/python_headers.h"
// clang-format on
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/flags.h"
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace imperative {
class VarBase;
namespace py = ::pybind11;
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs)
: op(op),
ctx(ctx),
func(func),
dev_ctx(dev_ctx),
kernel_configs(kernel_configs) {}
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second;
auto expected_kernel_key =
op.GetExpectedKernelType(framework::ExecutionContext(
op, framework::Scope(), *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if (kernel_iter == kernels.end() &&
expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) {
VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
expected_kernel_key.library_type_ = framework::LibraryType::kPlain;
expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
std::vector<framework::KernelConfig>* kernel_configs =
op.GetKernelConfig(expected_kernel_key);
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
}
inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
const framework::OperatorBase& op;
const framework::RuntimeContext& ctx;
framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx;
std::vector<framework::KernelConfig>* kernel_configs;
};
class OpBase;
class ThreadSafeNameSet {
......@@ -126,290 +52,117 @@ class ThreadSafeNameSet {
mutable std::mutex mtx_;
};
/* The wrapper for Variable which holds a Variable and a VarBase of its
* gradient. This object should be managed entirely by the Python interpreter.
*
* Nearly all interfaces should be implemented in C++.
*/
class VarBase {
DISABLE_COPY_AND_ASSIGN(VarBase);
public:
static std::vector<std::string> AliveVarNames();
// Internal interface, create VarBase from exist variable
VarBase(const std::string& name, std::unique_ptr<framework::Variable> var,
VarBase* grad, bool stop_gradient)
: VarBase(name, var->Get<framework::LoDTensor>().type(),
var->Get<framework::LoDTensor>().dims(),
var->Get<framework::LoDTensor>().place(), nullptr, grad,
stop_gradient, false, true) {
var_ = std::move(var);
}
// Python interface
VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
const std::vector<int64_t>& shape, const platform::Place& place,
bool stop_gradient, bool persistable)
: VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient,
persistable) {}
// Internal interface, create VarBase from with ddim
VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
const framework::DDim& shape, const platform::Place& place,
bool stop_gradient, bool persistable)
: VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient,
persistable, true) {}
// Grad used constructor
VarBase(const std::string& name, const framework::proto::VarType::Type dtype,
const std::vector<int64_t>& shape, const platform::Place& place,
bool stop_gradient, bool persistable, bool need_initialize)
: VarBase(name, dtype, framework::make_ddim(shape), place, nullptr,
nullptr, stop_gradient, persistable, need_initialize) {}
private:
// TODO(minqiyang): need support SelectedRows
VarBase(const std::string& name, framework::proto::VarType::Type dtype,
const framework::DDim& shape, const platform::Place& place,
std::unique_ptr<framework::Variable> var, VarBase* grad,
bool stop_gradient, bool persistable, bool need_initialize)
explicit VarBase(bool has_grad, const std::string& name)
: name_(name),
type_(framework::proto::VarType::LOD_TENSOR),
place_(place),
var_(std::move(var)),
grads_(grad),
dtype_(dtype),
stop_gradient_(stop_gradient),
persistable_(persistable),
pre_op_(nullptr),
pre_op_out_name_(),
pre_op_out_idx_(-1) {
if (!var_) {
var_.reset(new framework::Variable());
}
auto tensor = var_->GetMutable<framework::LoDTensor>();
tensor->Resize(shape);
if (need_initialize) {
tensor->mutable_data(place, dtype);
is_initialized_ = true;
VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype
<< " place: " << place;
} else {
is_initialized_ = false;
VLOG(8) << "not initialized varbase: " << name_;
}
VLOG(8) << "create varbase: " << name_ << " type: " << dtype
<< " place: " << place << "Stop gradient: " << stop_gradient_;
grad_var_(has_grad ? new VarBase(false, GradVarName()) : nullptr) {
if (IsDebugEnabled()) {
VLOG(10) << "Construct VarBase: " << name;
name_set_.Insert(name_);
}
}
public:
virtual ~VarBase() {
pre_op_ = nullptr;
pre_op_out_idx_ = -1;
VLOG(8) << "destruct varbase: " << name_;
explicit VarBase(const std::string& name) : VarBase(true, name) {}
~VarBase() {
VLOG(10) << "Destruct VarBase: " << name_;
if (IsDebugEnabled()) {
name_set_.Remove(name_);
}
}
inline void SetName(const std::string& name) { name_ = name; }
inline std::string Name() const { return name_; }
inline bool IsInitialize() const { return is_initialized_; }
inline void SetInitialize(bool inited) { is_initialized_ = inited; }
inline std::vector<int64_t> Shape() const {
if (var_->IsInitialized()) {
return framework::vectorize(var_->Get<framework::LoDTensor>().dims());
} else {
return {};
}
}
const framework::Variable& Var() const { return var_; }
inline framework::DDim Dims() const {
return var_->Get<framework::LoDTensor>().dims();
}
framework::Variable* MutableVar() { return &var_; }
// data type. e.g.. FP32
inline void SetDataType(framework::proto::VarType::Type type) {
auto tensor = var_->GetMutable<framework::LoDTensor>();
tensor->mutable_data(tensor->place(), type);
bool HasGradVar() const { return grad_var_ != nullptr; }
const std::shared_ptr<VarBase>& GradVarBase() const { return grad_var_; }
const framework::Variable& GradVar() const {
PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_);
return grad_var_->var_;
}
inline framework::proto::VarType::Type DataType() const { return dtype_; }
// tensor type. e.g.. LoDTensor
inline void SetType(framework::proto::VarType::Type type) { type_ = type; }
inline framework::proto::VarType::Type Type() const { return type_; }
framework::Variable* MutableGradVar() {
PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_);
return &(grad_var_->var_);
}
inline void SetStopGradient(bool stop_gradient) {
void SetStopGradient(bool stop_gradient) {
stop_gradient_ = stop_gradient;
if (grads_) {
grads_->stop_gradient_ = stop_gradient;
if (grad_var_) {
grad_var_->stop_gradient_ = stop_gradient;
}
}
inline bool IsStopGradient() const { return stop_gradient_; }
inline void SetPersistable(bool persistable) { persistable_ = persistable; }
inline bool IsPersistable() const { return persistable_; }
inline void SetPreOp(OpBase* op) { pre_op_ = op; }
inline platform::Place GetPlace() { return place_; }
inline OpBase* PreOp() const { return pre_op_; }
inline int PreOpOutIdx() const { return pre_op_out_idx_; }
bool StopGradient() const { return stop_gradient_; }
void RunBackward(const detail::BackwardStrategy& bck_stratedy);
void SetPersistable(bool persistable) { persistable_ = persistable; }
inline void ResetPreOp(OpBase* op) {
if (op == pre_op_) {
// clear pre_op info when op equals to var's pre_op
pre_op_ = nullptr;
pre_op_out_idx_ = -1;
}
}
void InitBuffer() {
if (!is_initialized_) {
var_->GetMutable<framework::LoDTensor>()->mutable_data(place_, dtype_);
is_initialized_ = true;
VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype_
<< " place: " << place_;
} else {
VLOG(8) << "var: " << name_ << " has already been initialized ";
}
}
bool Persistable() const { return persistable_; }
void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name,
int pre_op_out_idx, bool pre_op_stop_gradient) {
pre_op_ = pre_op;
pre_op_out_name_ = pre_op_out_name;
pre_op_out_idx_ = pre_op_out_idx;
if (pre_op_stop_gradient) {
stop_gradient_ = pre_op_stop_gradient;
}
}
void AddGradOps(const std::weak_ptr<OpBase>& op);
void ClearGradient() {
VLOG(1) << "clear gradient of " << Name();
if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) {
auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
operators::math::set_constant(
*(platform::DeviceContextPool::Instance().Get(
grads_->var_->Get<framework::LoDTensor>().place())),
grads_t, 0.0);
std::vector<OpBase*> GradOps() {
std::vector<OpBase*> rlt;
// TODO(jiabin): use better data structure to remove nullptr when we find it
for (const auto& wk_ptr : grad_ops_) {
OpBase* tmp_op = wk_ptr.lock().get();
if (tmp_op) rlt.emplace_back(tmp_op);
}
return rlt;
}
void ClearGradOps() { grad_ops_.clear(); }
framework::LoDTensor& GradValue();
std::unique_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
const bool blocking) const;
const std::string& Name() const { return name_; }
inline std::string GradName() const {
return string::Sprintf("%s@IGrad", Name());
void SetName(const std::string& name) {
name_ = name;
if (grad_var_) {
grad_var_->SetName(GradVarName());
}
}
std::string name_;
framework::proto::VarType::Type type_;
platform::Place place_;
std::unique_ptr<framework::Variable> var_;
std::shared_ptr<VarBase> grads_;
std::string GradVarName() { return framework::GradVarName(name_); }
private:
framework::proto::VarType::Type dtype_;
bool stop_gradient_;
bool persistable_;
bool is_initialized_;
OpBase* pre_op_;
std::string pre_op_out_name_;
int pre_op_out_idx_;
// A private set used to check for memory leaks
static ThreadSafeNameSet name_set_;
};
void SetType(framework::proto::VarType::Type type) { type_ = type; }
/* The wrapper for OpDesc which holds an OpDesc and an OpDesc of its
* gradient. This object should be managed entirely by the Python interpreter.
*/
class PYBIND11_HIDDEN OpBase {
public:
OpBase(const std::string& type)
: type_(type),
trace_id_(-1),
place_(platform::CPUPlace()),
backward_hooks_() {}
virtual ~OpBase() {
for (const auto& it : outputs_ref) {
auto vb = it.lock();
if (vb) {
VLOG(3) << "Op reset by" << vb->name_;
vb->ResetPreOp(this);
}
}
// TODO(minqiyang): remove op_desc from block_desc in tracer
// release resource
for (framework::OpDesc* desc : grad_op_descs_) {
delete desc;
}
}
framework::proto::VarType::Type Type() const { return type_; }
std::vector<VarBasePtrMap> ApplyGrad(
BackwardSumMap* bck_map, GradientRef* grad_ref,
const detail::BackwardStrategy& bck_stratedy);
inline std::string Type() const { return type_; }
inline std::string GradOpType(size_t index) const {
PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]);
return grad_op_descs_[index]->Type();
}
void RegisterBackwardHooks(const py::object& callable);
void InvokeBackwardHooks();
void TrackPreOp(
const std::string& inp_name,
const std::vector<std::shared_ptr<imperative::VarBase>>& inputs) {
auto& pre_ops_list = pre_ops_[inp_name];
pre_ops_list.reserve(inputs.size());
auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
for (std::shared_ptr<imperative::VarBase> inp_var : inputs) {
if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
<< inp_name;
pre_ops_list.emplace_back(inp_var->PreOp());
pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
} else {
VLOG(3) << "no pre op in slot " << inp_name
<< " input var stop_gradient: " << inp_var->IsStopGradient();
pre_ops_list.emplace_back(nullptr);
// pre_ops_out_idx_list.push_back(-1);
}
void SetDataType(framework::proto::VarType::Type data_type) {
data_type_ = data_type;
if (grad_var_) {
grad_var_->SetDataType(data_type_);
}
}
std::string type_;
int trace_id_;
framework::proto::VarType::Type DataType() const { return data_type_; }
// Note: each fwd op corresponds to a vector of bwd ops.
std::vector<framework::OpDesc*> grad_op_descs_;
void ClearGradient();
platform::Place place_;
std::shared_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
const bool blocking) const;
OpBasePtrMap pre_ops_;
std::map<std::string, std::vector<int>> pre_ops_out_idx_;
private:
framework::Variable var_;
std::string name_;
std::shared_ptr<VarBase> grad_var_;
mutable size_t copied_counter_ = 0;
VarBaseWeakPtrList outputs_ref;
// Inputs to a vector of bwd ops.
std::vector<VarBasePtrMap> grad_input_vars_;
// Outputs to a vector of bwd ops.
std::vector<VarBasePtrMap> grad_output_vars_;
// grad_ops_ records the grad ops that will use this var as an input
std::vector<std::weak_ptr<OpBase>> grad_ops_;
std::vector<py::object> backward_hooks_;
bool stop_gradient_{false};
bool persistable_{false};
framework::AttributeMap attrs_;
framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR};
framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32};
static ThreadSafeNameSet name_set_;
};
class Layer {
......@@ -418,18 +171,16 @@ class Layer {
virtual std::vector<std::shared_ptr<VarBase>> Forward(
const std::vector<std::shared_ptr<VarBase>>& inputs) {
std::vector<std::shared_ptr<VarBase>> vars;
return vars;
return {};
}
};
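Layer::Forward is the virtual hook a user-defined layer overrides; the default now simply returns an empty vector. A minimal override sketch (MyIdentityLayer is hypothetical and assumes only the Layer interface shown above):

```cpp
// Hypothetical subclass: forwards its inputs unchanged.
class MyIdentityLayer : public paddle::imperative::Layer {
 public:
  std::vector<std::shared_ptr<paddle::imperative::VarBase>> Forward(
      const std::vector<std::shared_ptr<paddle::imperative::VarBase>>& inputs)
      override {
    return inputs;  // a real layer would trace ops and return new outputs
  }
};
```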
// infer var type context for imperative mode
class PYBIND11_HIDDEN RuntimeInferVarTypeContext
: public framework::InferVarTypeContext {
class RuntimeInferVarTypeContext : public framework::InferVarTypeContext {
public:
RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs,
imperative::VarBasePtrMap* outputs,
const framework::AttributeMap* attrs_map)
RuntimeInferVarTypeContext(const NameVarBaseMap& inputs,
const NameVarBaseMap* outputs,
const framework::AttributeMap& attrs_map)
: InferVarTypeContext(nullptr, nullptr),
inputs_(inputs),
outputs_(outputs),
......@@ -437,19 +188,19 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext
input_names_(),
output_names_(),
var_set_() {
input_names_.reserve(inputs_->size());
for (auto& it : *inputs_) {
for (std::shared_ptr<imperative::VarBase> var : it.second) {
input_names_.reserve(inputs_.size());
for (auto& it : inputs_) {
for (auto& var : it.second) {
input_names_[it.first].emplace_back(var->Name());
var_set_[var->Name()] = var;
var_set_[var->Name()] = var.get();
}
}
output_names_.reserve(outputs_->size());
for (auto& it : *outputs_) {
for (std::shared_ptr<imperative::VarBase> var : it.second) {
for (auto& var : it.second) {
output_names_[it.first].emplace_back(var->Name());
var_set_[var->Name()] = var;
var_set_[var->Name()] = var.get();
}
}
}
@@ -457,8 +208,10 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext
virtual ~RuntimeInferVarTypeContext() {}
framework::Attribute GetAttr(const std::string& name) const override {
PADDLE_ENFORCE_NOT_NULL(attrs_);
return attrs_->at(name);
auto iter = attrs_.find(name);
PADDLE_ENFORCE_EQ(iter != attrs_.end(), true, "Cannot find attribute %s",
name);
return iter->second;
}
bool HasVar(const std::string& name) const override {
@@ -466,8 +219,7 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext
}
bool HasInput(const std::string& name) const override {
PADDLE_ENFORCE_NOT_NULL(inputs_);
return inputs_->count(name) > 0;
return inputs_.count(name) > 0;
}
bool HasOutput(const std::string& name) const override {
@@ -477,17 +229,26 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext
const std::vector<std::string>& Input(
const std::string& name) const override {
return input_names_.at(name);
auto iter = input_names_.find(name);
PADDLE_ENFORCE_EQ(iter != input_names_.end(), true, "Cannot find input %s",
name);
return iter->second;
}
const std::vector<std::string>& Output(
const std::string& name) const override {
return output_names_.at(name);
auto iter = output_names_.find(name);
PADDLE_ENFORCE_EQ(iter != output_names_.end(), true,
"Cannot find output %s", name);
return iter->second;
}
framework::proto::VarType::Type GetType(
const std::string& name) const override {
return var_set_.at(name)->Type();
auto iter = var_set_.find(name);
PADDLE_ENFORCE_EQ(iter != var_set_.end(), true,
"Cannot find var %s in GetType", name);
return iter->second->Type();
}
void SetType(const std::string& name,
@@ -501,7 +262,10 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext
framework::proto::VarType::Type GetDataType(
const std::string& name) const override {
return var_set_.at(name)->DataType();
auto iter = var_set_.find(name);
PADDLE_ENFORCE_EQ(iter != var_set_.end(), true,
"Cannot find var %s in GetDataType", name);
return iter->second->DataType();
}
void SetDataType(const std::string& name,
@@ -538,13 +302,97 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext
}
private:
const imperative::VarBasePtrMap* inputs_;
imperative::VarBasePtrMap* outputs_;
const framework::AttributeMap* attrs_;
const NameVarBaseMap& inputs_;
const NameVarBaseMap* outputs_;
const framework::AttributeMap& attrs_;
std::unordered_map<std::string, std::vector<std::string>> input_names_;
std::unordered_map<std::string, std::vector<std::string>> output_names_;
std::unordered_map<std::string, std::shared_ptr<imperative::VarBase>>
var_set_;
std::unordered_map<std::string, VarBase*> var_set_;
};
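// Usage sketch (illustration only, not part of this change): a typical
// infer_var_type callback consumes this context through the accessors above.
// The callback name and the "X"/"Out" slot names below are hypothetical.
static void SketchPassThroughVarTypeInference(
    framework::InferVarTypeContext* ctx) {
  const auto& in_name = ctx->Input("X")[0];      // name of the first "X" var
  const auto& out_name = ctx->Output("Out")[0];  // name of the first "Out" var
  ctx->SetType(out_name, ctx->GetType(in_name));          // propagate type
  ctx->SetDataType(out_name, ctx->GetDataType(in_name));  // propagate dtype
}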
// TODO(zjl): to support py_func layer
class OpBase : public std::enable_shared_from_this<OpBase> {
DISABLE_COPY_AND_ASSIGN(OpBase);
public:
~OpBase() { VLOG(3) << "Destruct Op: " << Type() << std::endl; }
// Developers should not rely on this method to create an OpBase; an OpBase
// should be created by the Tracer and managed entirely by it.
template <typename... Args>
static std::shared_ptr<OpBase> Create(Args&&... args) {
return std::shared_ptr<OpBase>(new OpBase(std::forward<Args>(args)...));
}
size_t id() const { return id_; }
const std::string& Type() const { return op_->Type(); }
void Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs);
const framework::VariableNameMap& InputNameMap() const {
return op_->Inputs();
}
const framework::VariableNameMap& OutputNameMap() const {
return op_->Outputs();
}
const framework::AttributeMap& Attrs() const { return op_->Attrs(); }
const framework::OpInfo& Info() const { return op_->Info(); }
void ClearBackwardTrace();
const std::vector<OpBase*>& GradPendingOps() const {
return grad_pending_ops_;
}
void InsertGradPendingOps(OpBase* op) { grad_pending_ops_.emplace_back(op); }
void SortGradPendingOps() {
std::sort(grad_pending_ops_.begin(), grad_pending_ops_.end(),
[](OpBase* op1, OpBase* op2) { return op1->id() > op2->id(); });
}
NameVarBaseMap* GetMutableOutsMap() { return &outs_; }
NameVarBaseMap* GetMutableInsMap() { return &ins_; }
const NameVarBaseMap& GetInsMap() { return ins_; }
const NameVarBaseMap& GetOutsMap() { return outs_; }
const platform::Place& place() const { return place_; }
// TODO(jiabin) prepare for backward hook
void RegisterBackwardHooks(const std::function<void()>& func) {
backward_hooks_.emplace_back(func);
}
void InvokeBackwardHooks() {
for (const auto& func : backward_hooks_) {
func();
VLOG(5) << "Invoke Backward Hook for: " << Type() << std::endl;
}
}
private:
OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place);
OpBase(size_t id, const framework::OpDesc& op_desc,
const platform::Place& place);
size_t id_;
std::unique_ptr<framework::OperatorBase> op_;
std::vector<std::function<void()>> backward_hooks_;
platform::Place place_;
// No need for std::weak_ptr here, because an op is bound to a certain Tracer
// and would not be used by a Tracer that did not create it.
std::vector<OpBase*> grad_pending_ops_;
// This part is only used for backward
NameVarBaseMap ins_;
NameVarBaseMap outs_;
};
} // namespace imperative
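// Lifecycle sketch (assumption, mirroring Tracer::TraceOp in this change):
// the Tracer owns OpBase end to end --
//   auto op = OpBase::Create(id, "mul", ins, outs, attrs, place);
//   op->Run(ins, outs);        // forward pass
//   /* TraceBackward() then fills grad_pending_ops_ via InsertGradPendingOps */
//   op->SortGradPendingOps();  // deterministic gradient-sum order (id desc)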
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/prepared_operator.h"
#include <sstream>
namespace paddle {
namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
if (var.IsType<framework::LoDTensor>()) {
return &(var.Get<framework::LoDTensor>());
} else if (var.IsType<framework::SelectedRows>()) {
return &(var.Get<framework::SelectedRows>().value());
} else {
return nullptr;
}
}
platform::Place PreparedOp::GetExpectedPlace(const platform::Place& place,
const NameVarBaseMap& ins) {
bool found = false;
for (auto& name_pair : ins) {
for (auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var());
if (tensor && tensor->IsInitialized()) {
auto tmp_place = tensor->place();
PADDLE_ENFORCE_EQ(!found || tmp_place == place, true,
"Input variable should keep in the same place: %s, "
"but get place: %s of input %s instead",
place, tmp_place, name_pair.first);
}
}
}
return place;
}
PreparedOp::PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs)
: op_(op),
ctx_(ctx),
func_(std::move(func)),
dev_ctx_(dev_ctx),
kernel_configs_(kernel_configs) {}
PreparedOp PreparedOp::Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
// Check whether op[type] has a kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
auto& kernels = kernels_iter->second;
auto expected_kernel_key =
op.GetExpectedKernelType(framework::ExecutionContext(
op, framework::Scope(), *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
std::vector<framework::KernelConfig>* kernel_configs =
op.GetKernelConfig(expected_kernel_key);
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
}
void PreparedOp::Run() {
// TODO(zjl): remove scope in dygraph
framework::Scope scope;
op_.RuntimeInferShape(scope, dev_ctx_->GetPlace(), ctx_);
func_(framework::ExecutionContext(op_, scope, *dev_ctx_, ctx_,
kernel_configs_));
}
} // namespace imperative
} // namespace paddle
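// Flow sketch (assumption; see test_prepare_op.cc in this change for a
// runnable version): build a RuntimeContext from the VarBase inputs/outputs,
// then prepare and run the kernel --
//   framework::RuntimeContext ctx = ...;  // framework::Variable* slots
//   auto prepared = PreparedOp::Prepare(ctx, op_with_kernel, place);
//   prepared.Run();  // RuntimeInferShape + kernel call, no persistent Scope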
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
class PreparedOp {
public:
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place);
inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx_; }
void Run();
static platform::Place GetExpectedPlace(const platform::Place& place,
const NameVarBaseMap& ins);
private:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs);
private:
const framework::OperatorBase& op_;
const framework::RuntimeContext& ctx_;
framework::OperatorWithKernel::OpKernelFunc func_;
platform::DeviceContext* dev_ctx_;
std::vector<framework::KernelConfig>* kernel_configs_;
};
} // namespace imperative
} // namespace paddle
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS gradient_accumulator memcpy)
cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op)
cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split)
cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/memory/memcpy.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
void TensorAdd(const framework::Variable& src, framework::Variable* dst);
#if defined(PADDLE_WITH_CUDA)
template <typename T>
int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0);
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size(), 0);
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
framework::TensorCopySync(*dst, rlt_place, &rlt);
for (int64_t i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
#endif
template <typename T>
int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size());
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
framework::TensorCopySync(*dst, rlt_place, &rlt);
for (int64_t i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
TEST(test_add_functor, add_functor) {
#if defined(PADDLE_WITH_CUDA)
platform::CUDAPlace gpu_place(0);
#endif
platform::CPUPlace cpu_place;
int cpu_res = 1;
cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0);
EXPECT_EQ(cpu_res, 0);
cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(cpu_res, 0);
#if defined(PADDLE_WITH_CUDA)
int gpu_res = 1;
gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0);
EXPECT_EQ(gpu_res, 0);
gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(gpu_res, 0);
#endif
}
} // namespace imperative
} // namespace paddle
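// Semantics note (what the tests above verify): TensorAdd(src, &dst)
// accumulates element-wise, in place on dst. A scalar sketch:
//   for (int64_t i = 0; i < n; ++i) dst_data[i] += src_data[i];
// so var2 ends up holding t1 + t2 in every element, on CPU and on GPU.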
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-16.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/imperative/layer.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_layer, test_runtime_context) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
var_pair in_pair = var_pair("X", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {in_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap attrs;
// Construct on the stack so the context is released when the test exits.
imperative::RuntimeInferVarTypeContext ctx(ins, &outs, attrs);
ASSERT_TRUE(ctx.HasVar("vin"));
ASSERT_TRUE(ctx.HasInput("X"));
ASSERT_TRUE(ctx.HasOutput("Out"));
ASSERT_ANY_THROW(ctx.GetDataTypes("vin"));
std::vector<framework::proto::VarType::Type> NullType;
ASSERT_ANY_THROW(ctx.SetDataTypes("vin", NullType));
ASSERT_ANY_THROW(ctx.GetShape("vin"));
ASSERT_ANY_THROW(ctx.GetLoDLevel("vin"));
ASSERT_ANY_THROW(ctx.SetLoDLevel("vin", 2));
}
std::string LayerDebugString(const std::string& op_type,
const NameVarBaseMap& ins,
const NameVarBaseMap& outs);
TEST(test_layer, test_debug_string_test_debug_Test) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vin_error(
new imperative::VarBase(false, "vin_error"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
std::shared_ptr<imperative::VarBase> vout_error(
new imperative::VarBase(false, "vout_error"));
vin_error->MutableVar()->GetMutable<framework::LoDTensor>();
vout->MutableVar()->GetMutable<framework::LoDTensor>();
vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
var_pair in_pair = var_pair("X", vb_vector(1, vin));
vb_vector vb_in_error = {vin_error, nullptr};
var_pair vin_error_pair = var_pair("X", vb_in_error);
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
var_pair vout_error_pair = var_pair("Out2", vb_vector(1, vout_error));
imperative::NameVarBaseMap ins = {in_pair};
imperative::NameVarBaseMap ins_error = {vin_error_pair};
imperative::NameVarBaseMap outs = {out_pair};
imperative::NameVarBaseMap outs_error = {vout_error_pair};
ASSERT_NO_FATAL_FAILURE(LayerDebugString("test_op", ins, outs));
std::string res = LayerDebugString("test_op", ins, outs_error);
ASSERT_TRUE(res.find("UNRESOLVED_TYPE") != std::string::npos);
std::string res2 = LayerDebugString("test_op", ins_error, outs_error);
VLOG(3) << res2;
ASSERT_TRUE(res2.find("NOT_INITED") != std::string::npos);
ASSERT_TRUE(res2.find("NULL") != std::string::npos);
}
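// Marker semantics (inferred from the assertions above):
//   "NULL"            -- a null VarBase pointer in the slot
//   "NOT_INITED"      -- the underlying framework::Variable holds no value yet
//   "UNRESOLVED_TYPE" -- the Variable holds a type the debug printer does not
//                        format (here, a SelectedRows output)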
TEST(test_layer, test_clear_backward_info) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
framework::OpDesc desc;
platform::CPUPlace place;
var_pair x_pair = var_pair("X", vb_vector(1, vin));
var_pair y_pair = var_pair("Y", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap concat_att_map;
concat_att_map["axis"] = 1;
std::shared_ptr<imperative::OpBase> op(
OpBase::Create(0, "mul", ins, outs, concat_att_map, place));
std::shared_ptr<imperative::OpBase> preceding_op(
OpBase::Create(0, "mul", ins, outs, concat_att_map, place));
op->InsertGradPendingOps(preceding_op.get());
*(op->GetMutableInsMap()) = ins;
*(op->GetMutableOutsMap()) = outs;
ASSERT_GT(op->GetInsMap().size(), 0);
ASSERT_GT(op->GetOutsMap().size(), 0);
ASSERT_GT(op->GradPendingOps().size(), 0);
op->ClearBackwardTrace();
ASSERT_EQ(op->GetInsMap().size(), 0);
ASSERT_EQ(op->GetOutsMap().size(), 0);
ASSERT_EQ(op->GradPendingOps().size(), 0);
}
TEST(test_layer, test_varbase_basic) {
platform::CPUPlace place;
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
place);
std::shared_ptr<imperative::VarBase> vout(vin->NewVarBase(place, false));
ASSERT_EQ(vout->Name(), "Itmp0");
std::shared_ptr<imperative::VarBase> vin_with_grad(
new imperative::VarBase(true, "vin"));
ASSERT_ANY_THROW(vin->MutableGradVar());
ASSERT_NO_THROW(ASSERT_TRUE(dynamic_cast<framework::Variable*>(
vin_with_grad->MutableGradVar()) != 0));
ASSERT_TRUE(
dynamic_cast<framework::Variable*>(vin_with_grad->MutableGradVar()) != 0);
vin_with_grad->SetStopGradient(true);
ASSERT_TRUE(vin_with_grad->StopGradient());
ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetPersistable(true));
ASSERT_TRUE(vin_with_grad->Persistable());
ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetName("new_name"));
ASSERT_EQ(vin_with_grad->Name(), "new_name");
}
// TODO(jiabin): Add more ut here for layer
} // namespace imperative
} // namespace paddle
USE_OP(mul);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-19.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/imperative/prepared_operator.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
static framework::RuntimeContext PrepareRuntimeContext(
const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
framework::VariableValueMap inputs, outputs;
for (auto& in_pair : ins) {
auto& in_ctx = inputs[in_pair.first];
in_ctx.reserve(in_pair.second.size());
for (auto& in_var : in_pair.second) {
in_ctx.emplace_back(in_var->MutableVar());
}
}
for (auto& out_pair : outs) {
auto& out_ctx = outputs[out_pair.first];
out_ctx.reserve(out_pair.second.size());
for (auto& out_var : out_pair.second) {
out_ctx.emplace_back(out_var->MutableVar());
}
}
return framework::RuntimeContext(std::move(inputs), std::move(outputs));
}
static framework::VariableNameMap CreateVarNameMap(
const framework::OpInfo& op_info, const std::string& op_type,
const NameVarBaseMap& varbase_map, bool is_input) {
if (op_info.proto_ == nullptr) {
return {};
}
framework::VariableNameMap result;
for (auto& var :
is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) {
auto it = varbase_map.find(var.name());
if (it == varbase_map.end()) {
PADDLE_ENFORCE_EQ(
var.dispensable(), true,
"Var: %s not dispensable and there are no such var in inputs",
var.name());
result[var.name()] = {};
} else {
auto& var_vector = it->second;
std::vector<std::string> args;
args.reserve(var_vector.size());
for (auto& var_base : var_vector) {
args.emplace_back(var_base->Name());
}
result[var.name()] = std::move(args);
}
}
return result;
}
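// Shape sketch (assumption): for the "split" op below with ins = {"X": [vin]},
// CreateVarNameMap yields {"X": ["vin"]}; a dispensable slot with no VarBase
// maps to an empty name list, and a missing non-dispensable slot throws.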
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_prepare_op, test_prepare_op) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
framework::OpDesc desc;
platform::CPUPlace place;
vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
place);
var_pair x_pair = var_pair("X", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap split_attr_map;
const auto& info = framework::OpInfoMap::Instance().Get("split");
framework::VariableNameMap var_in_map =
CreateVarNameMap(info, "split", ins, true);
framework::VariableNameMap var_out_map =
CreateVarNameMap(info, "split", outs, false);
framework::OperatorWithKernel op("split", var_in_map, var_out_map,
split_attr_map);
framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp =
PreparedOp::Prepare(ctx, op, place));
}
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
TEST(test_prepare_op, test_get_tensor_from_var) {
std::shared_ptr<imperative::VarBase> vout_error(
new imperative::VarBase(false, "vout_error"));
vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
auto* ts = GetTensorFromVar(*vout_error->MutableVar());
ASSERT_TRUE(ts != nullptr);
}
} // namespace imperative
} // namespace paddle
USE_OP(split);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-16.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/imperative/tracer.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_tracer, test_trace_op) {
// Run a mul op
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(true, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(true, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true);
const auto& out_tensor = vout->Var().Get<framework::LoDTensor>();
for (int64_t i = 0; i < out_tensor.numel(); i++) {
ASSERT_EQ(out_tensor.data<float>()[i], 20.0);
}
}
TEST(test_tracer, test_track_backward_output) {
// Run a mul op
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(false, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(true, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true));
}
TEST(test_tracer, test_track_backward_input) {
// Run a mul op
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(true, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true));
}
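// Note (inferred from the two ASSERT_ANY_THROW tests above): when
// trace_backward is true, every traced input/output VarBase must have been
// constructed with a grad var, i.e. VarBase(true, name); otherwise
// TraceBackward finds a null GradVarBase and throws.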
} // namespace imperative
} // namespace paddle
USE_OP(mul);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -11,282 +11,207 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/tracer.h"
#include <memory>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
std::vector<framework::OpDesc*>* grad_op_descs,
std::unordered_map<std::string, std::string>* grad_to_var) {
PADDLE_ENFORCE(grad_op_descs->empty());
const framework::OpInfo& op_info =
framework::OpInfoMap::Instance().Get(op_desc.Type());
if (!op_info.grad_op_maker_) return;
std::vector<std::unique_ptr<framework::OpDesc>> descs =
op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
for (auto& desc : descs) {
grad_op_descs->emplace_back(desc.release());
static std::vector<std::unique_ptr<framework::OpDesc>> CreateGradOpDescs(
const framework::OpInfo& op_info, const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
std::unordered_map<std::string, std::string>* grad_to_var) {
if (op_info.grad_op_maker_) {
return op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var,
grad_sub_block);
} else {
return {};
}
}
void CreateNoBuffuerGrad(std::shared_ptr<imperative::VarBase> var,
platform::DeviceContext* dev_ctx) {
PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base");
PADDLE_ENFORCE_NOT_NULL(dev_ctx,
"Could not get valid device from forward op");
if (var->grads_ == nullptr) {
auto& var_t = var->var_->Get<framework::LoDTensor>();
var->grads_ = std::shared_ptr<imperative::VarBase>(
new VarBase(var->GradName(), framework::proto::VarType::FP32,
framework::vectorize(var_t.dims()), dev_ctx->GetPlace(),
var->IsStopGradient(), false, false));
void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place, bool trace_backward) {
platform::RecordEvent event(type);
VLOG(1) << "Trace Op: " << type;
size_t op_id = GenerateUniqueId();
auto op = OpBase::Create(op_id, type, ins, outs, std::move(attrs), place);
op->Run(ins, outs);
if (ComputeRequiredGrad(ins, outs, trace_backward)) {
TraceBackward(op, framework::OpDesc(op->Type(), op->InputNameMap(),
op->OutputNameMap(), op->Attrs()),
ins, outs);
}
}
platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
platform::Place result = place;
for (const auto& it : inputs) {
for (const std::shared_ptr<imperative::VarBase>& var : it.second) {
platform::Place tmp_place =
var->var_->Get<framework::LoDTensor>().place();
if (!platform::is_same_place(tmp_place, result)) {
PADDLE_THROW(
"Input variable should keep in the same place: %s, but get place: "
"%s of input %s instead",
result, tmp_place, it.first);
}
}
}
return result;
bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins,
const NameVarBaseMap& outs,
bool trace_backward) {
// TODO(jiabin): Implement auto prune here
return trace_backward;
}
framework::VariableNameMap CreateInputVarNameMap(
const OpBase* op, const VarBasePtrMap& varbase_map) {
framework::VariableNameMap result;
void Tracer::TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
const framework::OpDesc& fwd_op_desc,
const NameVarBaseMap& ins,
const NameVarBaseMap& outs) {
// grad_to_var is a map of framework::GradVarName(in_var_name/out_var_name) ->
// in_var_name/out_var_name
std::unordered_map<std::string, std::string> grad_to_var;
auto& info_map = framework::OpInfoMap::Instance();
auto* op_info = info_map.GetNullable(op->Type());
if (op_info == nullptr || op_info->proto_ == nullptr) {
return result;
}
// Get grad_op_desc using fwd_op_desc
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs_ =
CreateGradOpDescs(fwd_op->Info(), fwd_op_desc, {}, {}, &grad_to_var);
for (auto& in : op_info->Proto().inputs()) {
auto it = varbase_map.find(in.name());
if (it == varbase_map.end()) {
PADDLE_ENFORCE(in.dispensable());
result[in.name()] = {};
} else {
auto var_vector = it->second;
std::vector<std::string> args;
args.reserve(var_vector.size());
for (std::shared_ptr<imperative::VarBase> var_base : var_vector) {
args.emplace_back(var_base->Name());
}
result[in.name()] = args;
}
}
return result;
}
// Create grad_ops using grad_op_descs
framework::VariableNameMap CreateOutputVarNameMap(
const OpBase* op, const VarBasePtrMap& varbase_map) {
framework::VariableNameMap result;
size_t grad_op_num = grad_op_descs_.size();
auto& info_map = framework::OpInfoMap::Instance();
auto* op_info = info_map.GetNullable(op->Type());
if (op_info == nullptr || op_info->proto_ == nullptr) {
return result;
}
VLOG(3) << "Create " << grad_op_num << " grad op desc(s) to op "
<< fwd_op->Type();
for (auto& out : op_info->Proto().outputs()) {
auto it = varbase_map.find(out.name());
if (it == varbase_map.end()) {
PADDLE_ENFORCE(out.dispensable());
result[out.name()] = {};
} else {
auto var_vector = it->second;
std::vector<std::string> args;
args.reserve(var_vector.size());
for (const std::shared_ptr<imperative::VarBase>& var_base : var_vector) {
args.emplace_back(var_base->Name());
}
result[out.name()] = args;
}
if (grad_op_num == 0) {
return;
}
return result;
}
Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
VarBasePtrMap* outputs, framework::AttributeMap attrs_map,
const platform::Place expected_place,
const bool stop_gradient) {
platform::RecordEvent record_event(op->type_);
framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map;
// Construct input_vars_map and output_vars_map
std::map<std::string, std::shared_ptr<imperative::VarBase>> current_vars_map;
for (auto it : inputs) {
auto& invars = invars_map[it.first];
invars.reserve(it.second.size());
for (std::shared_ptr<imperative::VarBase> inp : it.second) {
PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(),
inp->Name());
invars.emplace_back(inp->var_.get());
if (!stop_gradient) {
current_vars_map[inp->Name()] = inp;
}
VLOG(3) << "input var name: " << inp->Name()
<< " inited: " << inp->var_->IsInitialized()
<< " stop_grad: " << inp->IsStopGradient();
// Build a map from var_name to const std::shared_ptr<VarBase>*, so that we
// can find the matching var while binding the grad op descs
std::unordered_map<std::string, const std::shared_ptr<VarBase>*> name_to_var;
for (auto& pair : ins) {
for (auto& var : pair.second) {
auto& var_ptr = name_to_var[var->Name()];
PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(), true,
"There are different variables with same name %s",
var->Name());
var_ptr = &var;
}
op->TrackPreOp(it.first, it.second);
}
for (const auto& it : *outputs) {
auto& outvars = outvars_map[it.first];
const std::vector<std::shared_ptr<imperative::VarBase>>& outputs_tmp =
it.second;
outvars.reserve(outputs_tmp.size());
for (size_t i = 0U; i < outputs_tmp.size(); ++i) {
// Add weak_ptr to track outputs
op->outputs_ref.emplace_back(outputs_tmp[i]);
std::shared_ptr<imperative::VarBase> out = outputs_tmp[i];
outvars.emplace_back(out->var_.get());
out->TrackPreOp(op, it.first, i, stop_gradient);
if (!stop_gradient) {
current_vars_map[out->Name()] = out;
}
VLOG(3) << "output var name: " << out->Name()
<< " inited: " << out->var_->IsInitialized()
<< " stop_grad: " << out->IsStopGradient();
for (auto& pair : outs) {
for (auto& var : pair.second) {
auto& var_ptr = name_to_var[var->Name()];
PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(), true,
"There are different variables with same name %s",
var->Name());
var_ptr = &var;
}
}
// Check attrs and create op
framework::VariableNameMap invars_name_map =
CreateInputVarNameMap(op, inputs);
framework::VariableNameMap outvars_name_map =
CreateOutputVarNameMap(op, *outputs);
auto& info = framework::OpInfoMap::Instance().Get(op->Type());
if (info.Checker() != nullptr) {
info.Checker()->Check(&attrs_map);
}
std::unique_ptr<framework::OperatorBase> op_base =
framework::OpRegistry::CreateOp(op->Type(), invars_name_map,
outvars_name_map, attrs_map);
if (info.infer_var_type_) {
RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map);
info.infer_var_type_(&infer_var_type_ctx);
}
// TODO(minqiyang): Support infer var type in imperative mode
// Run forward op
VLOG(3) << "tracer running " << op->Type();
framework::RuntimeContext ctx(invars_map, outvars_map);
// TODO(panyx0718): Cache p.
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(op_base.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
op->place_ = GetExpectedPlace(expected_place, inputs);
PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
prepared_op.func(
framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx,
prepared_op.ctx, prepared_op.kernel_configs));
if (!stop_gradient) {
VLOG(5) << "start construct backward op";
// construct grad op descs
op->attrs_ = attrs_map;
std::unique_ptr<framework::OpDesc> fwd_op_desc(new framework::OpDesc(
op->Type(), invars_name_map, outvars_name_map, attrs_map));
std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
new std::unordered_map<std::string, std::string>());
// NOTE(minqiyang): We don't support control flow ops in imperative mode now
// Add grad_block_ when we want to support it
CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get());
VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type();
const size_t grad_op_count = op->grad_op_descs_.size();
// Build backward ins and outs
for (size_t i = 0; i < grad_op_num; i++) {
// Step1: build grad op and add them to engine
// Use trace id to decide the order of gradient sum in sorted sum mode
size_t trace_id = fwd_op->id();
std::shared_ptr<OpBase> grad_op =
OpBase::Create(trace_id, (*(grad_op_descs_[i].get())), fwd_op->place());
// this OpBase* is only used to manage the op's lifetime
engine_->InsertOp(grad_op.get(), grad_op);
std::unordered_set<OpBase*> visited_preceding_ops;
// Step2 : prepare grad_in vars and bind them with grad_op,
// set inputs' grad_op as current grad_op
for (const auto& grad_ins : grad_op_descs_[i]->Inputs()) {
if (grad_ins.second.empty()) continue;
auto& bwd_in = (*grad_op->GetMutableInsMap())[grad_ins.first];
bwd_in.reserve(grad_ins.second.size());
for (auto& grad_in_var_name : grad_ins.second) {
auto iter = grad_to_var.find(grad_in_var_name);
if (iter != grad_to_var.end()) {
// If it is a grad var, find its corresponding forward var
auto& fwd_var_name = iter->second;
auto fwd_var_iter = name_to_var.find(fwd_var_name);
PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
"Cannot find forward variable named %s",
fwd_var_name);
PADDLE_ENFORCE_NOT_NULL(
(*(fwd_var_iter->second))->GradVarBase(),
"Grad of %s should "
"not be NULL when we Track_Backward Input of %s",
(*(fwd_var_iter->second))->Name(), grad_op->Type());
(*(fwd_var_iter->second))->GradVarBase()->AddGradOps(grad_op);
VLOG(3) << "Add Grad Op " << grad_op->Type() << " for :"
<< (*(fwd_var_iter->second))->GradVarBase()->Name();
bwd_in.emplace_back((*(fwd_var_iter->second))->GradVarBase());
} else {
// If it is a forward var, just add it
auto fwd_var_iter = name_to_var.find(grad_in_var_name);
PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
"Cannot find forward variable named %s",
grad_in_var_name);
bwd_in.emplace_back(*(fwd_var_iter->second));
}
op->grad_input_vars_.resize(grad_op_count);
op->grad_output_vars_.resize(grad_op_count);
VLOG(3) << "Set backward input " << grad_ins.first << " of "
<< grad_op->Type() << " to be "
<< (bwd_in.back() ? bwd_in.back()->Name() : "nullptr");
}
}
for (size_t i = 0; i < grad_op_count; ++i) {
framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[i][it.first];
grad_in_vars.reserve(it.second.size());
for (const std::string& grad_invar : it.second) {
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = current_vars_map.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != current_vars_map.end());
// Forward inputs or outputs.
grad_in_vars.emplace_back(fwd_var_it->second);
} else {
std::shared_ptr<imperative::VarBase> var =
current_vars_map[var_it->second];
CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext());
// Douts.
var->grads_->SetPreOp(var->PreOp());
grad_in_vars.emplace_back(var->grads_);
// Step3: prepare grad_out vars and using their grad_ops to set current
// grad_op's preceding op
for (auto& grad_outs : grad_op_descs_[i]->Outputs()) {
if (grad_outs.second.empty()) continue;
auto& bwd_out = (*grad_op->GetMutableOutsMap())[grad_outs.first];
bwd_out.reserve(grad_outs.second.size());
for (auto& grad_out_var_name : grad_outs.second) {
auto iter = grad_to_var.find(grad_out_var_name);
PADDLE_ENFORCE_EQ(iter != grad_to_var.end(), true,
"Cannot find output of input grad %s in op %s",
grad_out_var_name, fwd_op->Type());
auto fwd_var_iter = name_to_var.find(iter->second);
PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true,
"Cannot find forward variable named %s",
iter->second);
PADDLE_ENFORCE_NOT_NULL(
(*(fwd_var_iter->second))->GradVarBase(),
"Grad of %s should "
"not be NULL when we Track_Backward Output of %s",
(*(fwd_var_iter->second))->Name(), grad_op->Type());
bwd_out.emplace_back((*(fwd_var_iter->second))->GradVarBase());
VLOG(3) << "Set backward output " << grad_outs.first << " of "
<< grad_op->Type() << " to be "
<< (bwd_out.back() ? bwd_out.back()->Name() : "nullptr");
auto preceding_ops =
(*(fwd_var_iter->second))->GradVarBase()->GradOps();
if (VLOG_IS_ON(3) && !preceding_ops.empty()) {
VLOG(3) << "Add preceding Op of :"
<< (*(fwd_var_iter->second))->GradVarBase()->Name()
<< " It's preceding Op are: ";
for (const auto& op : preceding_ops) {
VLOG(3) << op->Type();
}
}
}
for (auto it : grad_op_desc->Outputs()) {
auto& grad_out_vars = op->grad_output_vars_[i][it.first];
for (const std::string& grad_outvar : it.second) {
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end(),
"Could not found the grad op output var, should this "
"operator %s's stop gradient be True",
op->Type());
std::shared_ptr<imperative::VarBase> var =
current_vars_map[var_it->second];
CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext());
var->grads_->SetPreOp(var->PreOp());
grad_out_vars.push_back(var->grads_);
VLOG(3) << "grads output var name: " << var->name_;
if (!preceding_ops.empty()) {
for (const auto& op : preceding_ops) {
PADDLE_ENFORCE_NOT_NULL(op, "No nullptr should be preceding_op");
if (visited_preceding_ops.count(op) == 0) {
visited_preceding_ops.insert(op);
grad_op->InsertGradPendingOps(op);
}
}
} else {
VLOG(5) << "Hit leaf VarBase";
VLOG(5) << "Hit leaf VarBase"
<< (*(fwd_var_iter->second))->GradVarBase()->Name();
}
}
}
// To ensure the same numeric stability as the static graph
grad_op->SortGradPendingOps();
}
}
} // namespace imperative
} // namespace paddle
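// Wiring summary (a restatement of TraceBackward above):
//   grad-op inputs : a grad slot name resolves through grad_to_var to the fwd
//                    var and binds its GradVarBase; a plain fwd name binds the
//                    fwd VarBase itself.
//   grad-op outputs: each grad slot binds the GradVarBase of the matching fwd
//                    input, and that var's existing GradOps() become this grad
//                    op's grad_pending_ops_ (deduplicated, then sorted by id).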
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -14,46 +14,48 @@
#pragma once
#include <map>
#include <set>
#include <atomic>
#include <future> // NOLINT
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "ThreadPool.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
framework::OpDesc** grad_op_desc,
std::unordered_map<std::string, std::string>* grad_to_var);
platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
class Tracer {
DISABLE_COPY_AND_ASSIGN(Tracer);
public:
explicit Tracer(framework::BlockDesc* root_block);
Tracer() : engine_(new BasicEngine()) {}
virtual ~Tracer() {}
~Tracer() = default;
void Trace(OpBase* op, const VarBasePtrMap& inputs,
VarBasePtrMap* outputs, // NOLINT
framework::AttributeMap attrs_map,
const platform::Place expected_place,
const bool stop_gradient = false);
void TraceOp(const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place, bool trace_backward);
bool ComputeRequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs,
bool trace_backward);
void TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
const framework::OpDesc& fwd_op_desc,
const NameVarBaseMap& ins, const NameVarBaseMap& outs);
Engine* GetDefaultEngine() const { return engine_.get(); }
private:
platform::Place GetPlace(const VarBasePtrMap& inputs);
static size_t GenerateUniqueId() {
static std::atomic<size_t> id{0};
return id.fetch_add(1);
}
framework::BlockDesc* root_block_;
private:
std::unique_ptr<Engine> engine_;
};
} // namespace imperative
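// Usage sketch (assumption; test_tracer.cc in this change shows the full
// ins/outs setup):
//   imperative::Tracer tracer;
//   tracer.TraceOp("mul", ins, outs, attrs, place, /*trace_backward=*/true);
//   imperative::Engine* engine = tracer.GetDefaultEngine();
//   engine->Init(loss_var.get(), backward_strategy);  // then engine->Execute()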
@@ -17,8 +17,6 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace paddle {
@@ -26,18 +24,10 @@ namespace imperative {
class VarBase;
class OpBase;
class Tracer;
typedef std::map<std::string, std::vector<std::shared_ptr<VarBase>>>
VarBasePtrMap;
typedef std::vector<std::weak_ptr<VarBase>> VarBaseWeakPtrList;
typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
typedef std::unordered_map<
const VarBase*,
std::pair<platform::Place,
std::vector<std::pair<int, std::shared_ptr<VarBase>>>>>
BackwardSumMap; // var_grad -> {place, {id -> var_grad@rename}}
typedef std::unordered_map<const VarBase*, std::pair<int, bool>> GradientRef;
// var_grad -> {ref_times, is_first_to_be_accumulate}
using NameVarBaseMap =
std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
} // namespace imperative
} // namespace paddle
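// Shape example (illustration): a NameVarBaseMap maps an op slot name to an
// ordered list of participating vars, e.g.
//   imperative::NameVarBaseMap ins = {{"X", {x}}, {"Y", {y}}};
// where x and y are std::shared_ptr<imperative::VarBase>.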
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor imperative_profiler nccl_context)
feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
analysis_predictor imperative_profiler nccl_context imperative_flag)
if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op)
@@ -20,11 +20,13 @@ limitations under the License. */
#include <pybind11/functional.h>
#include <pybind11/stl.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include "paddle/fluid/framework/block_desc.h"
#include <vector>
#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h"
@@ -44,16 +46,27 @@ class Layer : public imperative::Layer {
const std::vector<std::shared_ptr<imperative::VarBase>> &inputs)
override {
PYBIND11_OVERLOAD(std::vector<std::shared_ptr<imperative::VarBase>>, Layer,
Forward,
inputs); // NOLINT
Forward, inputs); // NOLINT
}
};
class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase {
// wrapper for a Python callable, so that the imperative module does not
// depend on Python directly
// TODO(jiabin) Add OpBase's pybind interface back to enable backward hook
class PYBIND11_HIDDEN PyCallableObject {
public:
using imperative::OpBase::OpBase; // Inherit constructors
PyCallableObject(std::shared_ptr<py::object> py_obj_ptr)
: py_obj_ptr_(std::move(py_obj_ptr)) {}
~PyCallableObject() {
py::gil_scoped_acquire guard;  // call_guard<> built inline is a no-op tag
py_obj_ptr_.reset();
}
void operator()() {
py::gil_scoped_acquire guard;
py_obj_ptr_->operator()(this);
}
PyOpBase(const std::string &name) : OpBase(name) {}
private:
std::shared_ptr<py::object> py_obj_ptr_;
};
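// Intended-use sketch (assumption; per the TODO above, the OpBase binding
// returns later): wrap a Python hook for a C++ std::function<void()> slot --
//   auto obj = std::make_shared<py::object>(py_callable);
//   op->RegisterBackwardHooks(PyCallableObject(std::move(obj)));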
// Function like obj.attr_name in Python.
@@ -125,33 +138,43 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
}
} else {
PADDLE_THROW(
"unsupported type %s, must be Variable, List[Variable] or "
"unsupported type %s, must be Variable, list[Variable] or "
"tuple[Variable]",
py::str(handle));
}
PADDLE_ENFORCE(PyErr_Occurred() == nullptr,
py::str(py::handle(PyErr_Occurred())));
return result;
}
using PyVarBaseMap = std::unordered_map<std::string, py::handle>;
using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>;
static imperative::VarBasePtrMap ConvertToVarBasePtrMap(
const PyVarBaseMap &map) {
imperative::VarBasePtrMap result;
static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
const PyNameVarBaseMap &map) {
imperative::NameVarBaseMap result;
for (auto &pair : map) {
auto var_vec = GetVarBaseListFromPyHandle(pair.second);
if (!var_vec.empty()) {
result.emplace(pair.first, std::move(var_vec));
}
}
PADDLE_ENFORCE_EQ(PyErr_Occurred() == nullptr, true,
py::str(py::handle(PyErr_Occurred())));
return result;
}
static std::string GetTypeName(const imperative::VarBase &var) {
if (var.Type() == framework::proto::VarType::RAW) {
return "RAW";
} else if (!var.Var().IsInitialized()) {
return "nullptr";
} else {
return framework::ToTypeName(var.Var().Type());
}
}
// Bind Methods
void BindImperative(pybind11::module *m_ptr) {
void BindImperative(py::module *m_ptr) {
auto &m = *m_ptr;
py::class_<imperative::detail::BackwardStrategy> backward_strategy(
@@ -200,69 +223,88 @@ void BindImperative(pybind11::module *m_ptr) {
m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); });
py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
m, "VarBase", R"DOC()DOC")
m, "VarBase",
R"DOC()DOC")
.def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
.def(
py::init<const std::string &, paddle::framework::proto::VarType::Type,
const std::vector<int64_t>, const paddle::platform::CPUPlace,
bool, bool>())
.def(
py::init<const std::string &, paddle::framework::proto::VarType::Type,
const std::vector<int64_t>,
const paddle::platform::CUDAPlace, bool, bool>())
.def("__init__",
[](imperative::VarBase &self, const std::string &name,
framework::proto::VarType::Type type,
framework::proto::VarType::Type dtype,
const std::vector<int> &dims, bool stop_gradient,
bool persistable) {
new (&self) imperative::VarBase(name);
self.SetPersistable(persistable);
self.SetType(type);
self.SetDataType(dtype);
self.SetStopGradient(stop_gradient);
if (type == framework::proto::VarType::LOD_TENSOR) {
auto *tensor =
self.MutableVar()->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dims));
}
})
.def("_run_backward",
[](imperative::VarBase &self,
const imperative::detail::BackwardStrategy &bckst) {
self.RunBackward(bckst);
})
.def("_grad_name", &imperative::VarBase::GradName)
.def("_grad_value", &imperative::VarBase::GradValue)
const imperative::detail::BackwardStrategy &bckst,
const imperative::Tracer &tracer) {
// TODO(jiabin): when we impl more backward execution we can select
// them
imperative::Engine *engine = tracer.GetDefaultEngine();
VLOG(3) << "Start backward";
engine->Init(&self, bckst);
engine->Execute();
VLOG(3) << "Finish backward";
},
py::call_guard<py::gil_scoped_release>())
.def("_grad_name", &imperative::VarBase::GradVarName)
.def("_grad_value",
[](imperative::VarBase &self) {
return self.MutableGradVar()->Get<framework::LoDTensor>();
},
py::return_value_policy::reference)
.def("_clear_gradient", &imperative::VarBase::ClearGradient)
.def("_grad_ivar",
[](const imperative::VarBase &self) { return self.grads_; },
py::return_value_policy::reference)
[](const imperative::VarBase &self) {
auto &grad_var = self.GradVarBase();
if (grad_var && grad_var->Var().IsInitialized()) {
return grad_var;
} else {
return std::shared_ptr<imperative::VarBase>(nullptr);
}
},
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CPUPlace &place,
bool blocking) {
return self.NewVarBase(place, blocking).release();
},
py::return_value_policy::take_ownership)
bool blocking) { return self.NewVarBase(place, blocking); },
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CUDAPlace &place,
bool blocking) {
return self.NewVarBase(place, blocking).release();
},
py::return_value_policy::take_ownership)
.def("value",
[](const imperative::VarBase &self) { return self.var_.get(); },
bool blocking) { return self.NewVarBase(place, blocking); },
py::return_value_policy::copy)
.def("value", [](imperative::VarBase &self) { return self.MutableVar(); },
py::return_value_policy::reference)
.def_property("name", &imperative::VarBase::Name,
&imperative::VarBase::SetName)
.def_property_readonly("shape", &imperative::VarBase::Shape)
.def_property_readonly(
"shape",
[](imperative::VarBase &self) {
if (self.Var().IsType<framework::LoDTensor>()) {
return framework::vectorize2int(
self.Var().Get<framework::LoDTensor>().dims());
} else {
VLOG(2) << "It is meaningless to get shape of variable type "
<< GetTypeName(self);
return std::vector<int>();
}
})
.def_property_readonly("type", &imperative::VarBase::Type)
.def_property_readonly("dtype", &imperative::VarBase::DataType)
.def_property("persistable", &imperative::VarBase::IsPersistable,
.def_property("persistable", &imperative::VarBase::Persistable,
&imperative::VarBase::SetPersistable)
.def_property("stop_gradient", &imperative::VarBase::IsStopGradient,
.def_property("stop_gradient", &imperative::VarBase::StopGradient,
&imperative::VarBase::SetStopGradient);
py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
.def(py::init<const std::string &>())
.def("register_backward_hooks",
[](imperative::OpBase &self, const py::object &callable) {
self.RegisterBackwardHooks(callable);
})
.def_property("_trace_id",
[](const imperative::OpBase &self) {
py::gil_scoped_release release;
return self.trace_id_;
},
[](imperative::OpBase &self, int trace_id) {
py::gil_scoped_release release;
self.trace_id_ = trace_id;
},
py::return_value_policy::reference)
.def_property_readonly("type", &imperative::OpBase::Type);
py::class_<imperative::Layer, Layer /* <--- trampoline*/> layer(m, "Layer");
layer.def(py::init<>())
.def("forward",
......@@ -271,42 +313,35 @@ void BindImperative(pybind11::module *m_ptr) {
return self.Forward(inputs);
});
  // NOTE(zjl): Tracer takes a PyNameVarBaseMap as its parameter rather than a
  // NameVarBaseMap. We call the Python C-API to convert PyNameVarBaseMap to
  // NameVarBaseMap instead of doing the conversion in Python code. This speeds
  // up Tracer.trace() by about 6% on the ptb model and makes the time spent in
  // Python nearly zero.
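  // A minimal Python-side sketch of the bindings below (the op type, attrs and
  // variable names are illustrative); the dict values are VarBase handles or
  // lists of them, which ConvertToNameVarBaseMap unpacks:
  //   tracer.trace("mul", {"X": [x], "Y": [y]}, {"Out": [out]},
  //                {"x_num_col_dims": 1}, place, trace_backward)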
py::class_<imperative::Tracer>(m, "Tracer", "")
.def("__init__",
[](imperative::Tracer &self, framework::BlockDesc *root_block) {
new (&self) imperative::Tracer(root_block);
})
[](imperative::Tracer &self) { new (&self) imperative::Tracer(); })
.def("trace",
[](imperative::Tracer &self, imperative::OpBase *op,
const PyVarBaseMap &inputs, const PyVarBaseMap &outputs,
framework::AttributeMap attrs_map,
const platform::CPUPlace expected_place,
const bool stop_gradient = false) {
auto ins = ConvertToVarBasePtrMap(inputs);
auto outs = ConvertToVarBasePtrMap(outputs);
[](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
framework::AttributeMap attrs, const platform::CUDAPlace &place,
bool trace_backward) {
auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs_map = ConvertToNameVarBaseMap(outs);
{
py::gil_scoped_release release;
self.Trace(op, std::move(ins), &outs, attrs_map, expected_place,
stop_gradient);
self.TraceOp(type, std::move(ins_map), std::move(outs_map),
std::move(attrs), place, trace_backward);
}
})
.def("trace", [](imperative::Tracer &self, imperative::OpBase *op,
const PyVarBaseMap &inputs, const PyVarBaseMap &outputs,
framework::AttributeMap attrs_map,
const platform::CUDAPlace expected_place,
const bool stop_gradient = false) {
auto ins = ConvertToVarBasePtrMap(inputs);
auto outs = ConvertToVarBasePtrMap(outputs);
{
py::gil_scoped_release release;
self.Trace(op, std::move(ins), &outs, attrs_map, expected_place,
stop_gradient);
}
});
.def("trace",
[](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
framework::AttributeMap attrs, const platform::CPUPlace &place,
bool trace_backward) {
auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs_map = ConvertToNameVarBaseMap(outs);
{
py::gil_scoped_release release;
self.TraceOp(type, std::move(ins_map), std::move(outs_map),
std::move(attrs), place, trace_backward);
}
});
// define parallel context
py::class_<imperative::ParallelStrategy> parallel_strategy(
......
......@@ -14,10 +14,6 @@ limitations under the License. */
#pragma once
#include <Python.h>
#include <string>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
......
......@@ -18,6 +18,7 @@ from paddle.fluid import core
from paddle.fluid import framework
from .tracer import Tracer
import logging
import objgraph
__all__ = [
'no_grad',
......@@ -123,7 +124,7 @@ def guard(place=None):
"""
train = framework.Program()
startup = framework.Program()
tracer = Tracer(train.current_block().desc)
tracer = Tracer()
if place is None:
if core.is_compiled_with_cuda():
......@@ -138,19 +139,22 @@ def guard(place=None):
yield
def _print_debug_msg():
def _print_debug_msg(limit=5, is_test=False):
if not core._is_dygraph_debug_enabled():
logging.warn(
'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
)
return
unique_name_size = len(framework.unique_name.generator.ids)
tracer_var_size = len(framework._dygraph_tracer()._vars)
alive_cpp_var_size = len(core.VarBase._alive_vars())
logging.warn(
'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
.format(unique_name_size, tracer_var_size, alive_cpp_var_size))
if not is_test:
logging.warn(
'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
.format(unique_name_size, tracer_var_size, alive_cpp_var_size))
objgraph.show_growth(limit=limit)
else:
return unique_name_size, tracer_var_size, alive_cpp_var_size
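# A minimal usage sketch (illustrative; assumes FLAGS_dygraph_debug=1 is set):
#     with guard():
#         ...
#         _print_debug_msg(limit=5)               # logs counters + objgraph growth
#         sizes = _print_debug_msg(is_test=True)  # returns the three counters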
def to_variable(value, block=None, name=None):
......
......@@ -20,7 +20,7 @@ from . import layers
from . import parallel_helper
from .. import framework
from ..layers import collective
from . import to_variable
from . import to_variable, no_grad
__all__ = ["prepare_context"]
......@@ -197,6 +197,7 @@ class DataParallel(layers.Layer):
for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
nn.reshape(x=g_var, shape=g_shape, inplace=True)
@no_grad
def apply_collective_grads(self):
"""
AllReduce the Parameters' gradient.
......
......@@ -23,21 +23,15 @@ from paddle.fluid import framework
__all__ = ['Tracer']
def release_op(op):
del framework._dygraph_tracer()._ops[op._trace_id]
class Tracer(core.Tracer):
"""
Python wrapper of dygraph tracer
"""
def __init__(self, block):
super(Tracer, self).__init__(block)
def __init__(self):
super(Tracer, self).__init__()
self._ops = defaultdict()
self._vars = defaultdict()
self._trace_id = 0
self._train_mode = True
def trace_var(self, name, var):
......@@ -47,23 +41,10 @@ class Tracer(core.Tracer):
return list((item for name, item in six.iteritems(self._vars)
if isinstance(item, framework.Parameter)))
def _clear_ops(self):
self._ops = defaultdict()
self._trace_id = 0
def trace_op(self, op, inputs, outputs, stop_gradient=False):
# record op's trace id
op.iop._trace_id = self._trace_id
self.trace(op.iop, inputs, outputs, op.attrs,
framework._current_expected_place(), stop_gradient)
if not stop_gradient and self._train_mode:
self._trace_id += 1
self._ops[op.iop._trace_id] = op
# register backward hooks and variables if needed
op.iop.register_backward_hooks(release_op)
def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False):
self.trace(type, inputs, outputs, attrs,
framework._current_expected_place(), self._train_mode and
not stop_gradient)
def train_mode(self):
self._train_mode = True
......
......@@ -458,9 +458,10 @@ class Variable(object):
self._ivar = kwargs.get("ivar", None)
if not self._ivar:
self._ivar = core.VarBase(
name, dtype if dtype else core.VarDesc.VarType.FP32,
list(shape) if shape else [],
_current_expected_place(), stop_gradient, True
name, type
if type else core.VarDesc.VarType.LOD_TENSOR, dtype
if dtype else core.VarDesc.VarType.FP32,
list(shape) if shape else [], stop_gradient, True
if persistable else False)
if persistable:
_dygraph_tracer().trace_var(name, self)
......@@ -582,13 +583,16 @@ class Variable(object):
return np.array(new_ivar.value().get_tensor())
def backward(self, backward_strategy=None):
from .dygraph import BackwardStrategy
if backward_strategy is None:
backward_strategy = BackwardStrategy()
backward_strategy.sort_sum_gradient = False
if in_dygraph_mode():
from .dygraph import BackwardStrategy
if backward_strategy is None:
backward_strategy = BackwardStrategy()
backward_strategy.sort_sum_gradient = False
self._ivar._run_backward(backward_strategy)
_dygraph_tracer()._clear_ops()
self._ivar._run_backward(backward_strategy, _dygraph_tracer())
else:
            raise ValueError(
                "Variable.backward() is only available in DyGraph mode")
def gradient(self):
new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
......@@ -616,9 +620,13 @@ class Variable(object):
"""
if in_dygraph_mode():
# TODO(panyx0718): add more dygraph debug info.
return 'name %s, dtype: %s shape: %s %s' % (
self.name, self.dtype, self.shape,
str(self._ivar.value().get_tensor()))
tensor = self._ivar.value().get_tensor()
if tensor._is_initialized():
return 'name %s, dtype: %s shape: %s %s' % (
self.name, self.dtype, self.shape, str(tensor))
else:
return 'name %s, shape: %s, not inited' % (self.name,
self.shape)
assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool)
......@@ -713,7 +721,7 @@ class Variable(object):
@property
def type(self):
if in_dygraph_mode():
return self._ivar.dtype
return self._ivar.type
else:
return self.desc.type()
......@@ -1085,9 +1093,7 @@ class Operator(object):
if type is None:
raise ValueError(
"`type` to initialized an Operator can not be None.")
self.iop = core.OpBase(type)
self.previous_ops = []
self._type = type
self.attrs = attrs if attrs else {}
else:
self.block = block
......@@ -1233,7 +1239,7 @@ class Operator(object):
@property
def type(self):
if in_dygraph_mode():
return self.iop.type
return self._type
else:
return self.desc.type()
......@@ -1787,10 +1793,12 @@ class Block(object):
else:
attrs['is_test'] = False
type = kwargs.get("type", None)
op = Operator(
block=self,
desc=None,
type=kwargs.get("type", None),
type=type,
inputs=None,
outputs=None,
attrs=attrs)
......@@ -1799,9 +1807,11 @@ class Block(object):
#
# TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in dygraph mode.
_dygraph_tracer().trace_op(op,
_dygraph_tracer().trace_op(type,
kwargs.get("inputs", {}),
kwargs.get("outputs", {}),
kwargs.get("outputs", {}), attrs
if attrs else {},
kwargs.get("stop_gradient", False))
else:
op_desc = self.desc.append_op()
......@@ -1862,17 +1872,15 @@ class Block(object):
def _prepend_op(self, *args, **kwargs):
if in_dygraph_mode():
type = kwargs.get("type", None)
attrs = kwargs.get("attrs", {})
op = Operator(
self,
None,
type=kwargs.get("type", None),
inputs=None,
outputs=None,
attrs=kwargs.get("attrs", {}))
self, None, type=type, inputs=None, outputs=None, attrs=attrs)
_dygraph_tracer().trace_op(op,
_dygraph_tracer().trace_op(type,
kwargs.get("inputs", {}),
kwargs.get("outputs", {}),
kwargs.get("outputs", {}), attrs
if attrs else {},
kwargs.get("stop_gradient", False))
else:
op_desc = self.desc._prepend_op()
......
......@@ -615,9 +615,6 @@ class Optimizer(object):
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
if framework.in_dygraph_mode():
framework._dygraph_tracer()._clear_ops()
return optimize_ops, params_grads
......
......@@ -177,7 +177,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
list(REMOVE_ITEM TEST_OPS test_imperative_debug_string)
# Some ops need to check results when gc is enabled
# Currently, only ops that register NoNeedBufferVarsInference need to do this test
set(TEST_OPS_WITH_GC
......@@ -240,6 +240,7 @@ py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_
py_test_modules(test_install_check MODULES test_install_check ENVS
FLAGS_cudnn_deterministic=1 SERIAL)
set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1)
if(WITH_DISTRIBUTE)
py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS})
py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS})
......
......@@ -27,17 +27,40 @@ import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, LayerNorm
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.layer_helper import LayerHelper
import math
from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
momentum_rate = 0.9
l2_decay = 1.2e-4
def optimizer_setting(params):
ls = params["learning_strategy"]
if "total_images" not in params:
total_images = 6149
else:
total_images = params["total_images"]
batch_size = ls["batch_size"]
step = int(math.ceil(float(total_images) / batch_size))
bd = [step * e for e in ls["epochs"]]
lr = params["lr"]
num_epochs = params["num_epochs"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay))
return optimizer
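# A sketch of the `params` layout optimizer_setting expects (keys inferred from
# the code above; the values here are illustrative):
#     params = {
#         "lr": 0.1,
#         "num_epochs": 120,
#         "total_images": 6149,
#         "learning_strategy": {"batch_size": 32, "epochs": [30, 60, 90]},
#     }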
class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self,
name_scope,
num_channels,
num_filters,
filter_size,
stride=1,
......@@ -46,26 +69,21 @@ class ConvBNLayer(fluid.dygraph.Layer):
super(ConvBNLayer, self).__init__(name_scope)
self._conv = Conv2D(
self.full_name(),
"conv2d",
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=None)
bias_attr=False,
param_attr=fluid.ParamAttr(name="weights"))
self._batch_norm = BatchNorm(
self.full_name(), num_filters, act=act, momentum=0.1)
self._layer_norm = fluid.dygraph.nn.LayerNorm(
self.full_name(), begin_norm_axis=1)
self._layer_norm = LayerNorm(self.full_name(), begin_norm_axis=1)
def forward(self, inputs):
y = self._conv(inputs)
        # FIXME(zcd): when comparing the results of multi-card and single-card
        # runs, batch_norm should be replaced with layer_norm.
y = self._layer_norm(y)
# y = self._batch_norm(y)
return y
......@@ -76,17 +94,19 @@ class SqueezeExcitation(fluid.dygraph.Layer):
super(SqueezeExcitation, self).__init__(name_scope)
self._pool = Pool2D(
self.full_name(), pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(num_channels * 1.0)
self._squeeze = FC(
self.full_name(),
size=num_channels // reduction_ratio,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
initializer=fluid.initializer.Uniform(-stdv, stdv)),
act='relu')
stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0)
self._excitation = FC(
self.full_name(),
size=num_channels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.05)),
initializer=fluid.initializer.Uniform(-stdv, stdv)),
act='sigmoid')
def forward(self, input):
......@@ -110,39 +130,37 @@ class BottleneckBlock(fluid.dygraph.Layer):
self.conv0 = ConvBNLayer(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=1)
filter_size=1,
act="relu")
self.conv1 = ConvBNLayer(
self.full_name(),
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality)
groups=cardinality,
act="relu")
self.conv2 = ConvBNLayer(
self.full_name(),
num_channels=num_filters,
num_filters=num_filters * 4,
num_filters=num_filters * 2,
filter_size=1,
act='relu')
act=None)
self.scale = SqueezeExcitation(
self.full_name(),
num_channels=num_filters * 4,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
if not shortcut:
self.short = ConvBNLayer(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters * 4,
num_filters=num_filters * 2,
filter_size=1,
stride=stride)
self.shortcut = shortcut
self._num_channels_out = num_filters * 4
self._num_channels_out = num_filters * 2
def forward(self, inputs):
y = self.conv0(inputs)
......@@ -155,10 +173,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
else:
short = self.short(inputs)
y = fluid.layers.elementwise_add(x=short, y=scale)
layer_helper = LayerHelper(self.full_name(), act='relu')
y = layer_helper.append_activation(y)
y = fluid.layers.elementwise_add(x=short, y=scale, act='relu')
return y
......@@ -178,7 +193,6 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer(
self.full_name(),
num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
......@@ -196,8 +210,7 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer(
self.full_name(),
num_channels=3,
num_filters=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
......@@ -214,24 +227,21 @@ class SeResNeXt(fluid.dygraph.Layer):
num_filters = [128, 256, 512, 1024]
self.conv0 = ConvBNLayer(
self.full_name(),
num_channels=3,
num_filters=3,
filter_size=7,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
self.conv1 = ConvBNLayer(
self.full_name(),
num_channels=64,
num_filters=3,
filter_size=7,
stride=2,
num_filters=64,
filter_size=3,
stride=1,
act='relu')
self.conv2 = ConvBNLayer(
self.full_name(),
num_channels=64,
num_filters=3,
filter_size=7,
stride=2,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
self.pool = Pool2D(
self.full_name(),
......@@ -261,16 +271,14 @@ class SeResNeXt(fluid.dygraph.Layer):
self.pool2d_avg = Pool2D(
self.full_name(), pool_size=7, pool_type='avg', global_pooling=True)
import math
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.fc = FC(self.full_name(),
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
self.out = FC(self.full_name(),
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs, label):
def forward(self, inputs):
if self.layers == 50 or self.layers == 101:
y = self.conv0(inputs)
y = self.pool(y)
......@@ -283,13 +291,8 @@ class SeResNeXt(fluid.dygraph.Layer):
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
        # FIXME(zcd): dropout should be removed when comparing the results of
        # multi-card and single-card runs.
# y = fluid.layers.dropout(y, dropout_prob=0.2, seed=1)
cost = self.fc(y)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
return avg_loss
y = self.out(y)
return y
class TestSeResNeXt(TestParallelDyGraphRunnerBase):
......@@ -312,8 +315,11 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase):
label = to_variable(y_data)
label.stop_gradient = True
loss = model(img, label)
return loss
out = model(img)
softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss)
return avg_loss
if __name__ == "__main__":
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import numpy as np
class MLP(fluid.Layer):
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._fc1 = fluid.dygraph.FC(
self.full_name(),
3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
self._fc2 = fluid.dygraph.FC(
self.full_name(),
4,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
def forward(self, inputs):
x = self._fc1(inputs)
x = self._fc2(x)
x = fluid.layers.reduce_sum(x)
return x
class TestDygraphDebugString(unittest.TestCase):
def test_dygraph_debug_string(self):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
unique_name = 0
trace_var = 0
alive_var = 0
with fluid.dygraph.guard():
mlp = MLP("mlp")
for i in range(10):
var_inp = fluid.dygraph.base.to_variable(np_inp)
out = mlp(var_inp)
out.backward()
mlp.clear_gradients()
unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg(
is_test=True)
if i > 0:
self.assertGreaterEqual(unique_name, unique_name_tmp)
self.assertGreaterEqual(trace_var, trace_var_tmp)
self.assertGreaterEqual(alive_var, alive_var_tmp)
else:
unique_name = unique_name_tmp
trace_var = trace_var_tmp
alive_var = alive_var_tmp
try:
fluid.dygraph.base._print_debug_msg()
except Exception as e:
raise RuntimeError(
"No Exception is accepted in _print_debug_msg, but we got: {}".
format(e))
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
import numpy as np
from test_imperative_base import new_program_scope
class MLP(fluid.Layer):
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._fc1 = fluid.dygraph.FC(
self.full_name(),
3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
self._fc2 = fluid.dygraph.FC(
self.full_name(),
4,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.1)))
def forward(self, inputs):
x = self._fc1(inputs)
x = self._fc2(x)
x = fluid.layers.reduce_sum(x)
return x
class TestDygraphFramework(unittest.TestCase):
def test_dygraph_backward(self):
with new_program_scope():
mlp = MLP("mlp")
var_inp = fluid.layers.data(
"input", shape=[2, 2], dtype="float32", append_batch_size=False)
out = mlp(var_inp)
try:
out.backward()
raise AssertionError(
"backward should not be usable in static graph mode")
except ValueError as e:
            self.assertTrue(e is not None)
def test_dygraph_to_string(self):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
with fluid.dygraph.guard():
var_inp = fluid.dygraph.base.to_variable(np_inp)
var_inp.to_string(throw_on_error=True)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from paddle.fluid import Embedding, LayerNorm, FC, Layer
from paddle.fluid.dygraph import to_variable, guard
from test_imperative_base import new_program_scope
from paddle.fluid import core
import numpy as np
import six
np.set_printoptions(suppress=True)
# Copy from models
class TrainTaskConfig(object):
# support both CPU and GPU now.
use_gpu = True
# the epoch number to train.
pass_num = 30
# the number of sequences contained in a mini-batch.
# deprecated, set batch_size in args.
batch_size = 32
# the hyper parameters for Adam optimizer.
    # This static learning_rate will be multiplied by the learning rate derived
    # from the LearningRateScheduler to get the final learning rate.
learning_rate = 2.0
beta1 = 0.9
beta2 = 0.997
eps = 1e-9
# the parameters for learning rate scheduling.
warmup_steps = 8000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps = 0.1
# the directory for saving trained models.
model_dir = "trained_models"
# the directory for saving checkpoints.
ckpt_dir = "trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path = None
# the parameter to initialize the learning rate scheduler.
    # It should be provided if checkpoints are used, since the checkpoint
    # doesn't currently include the training step counter.
start_step = 0
# the frequency to save trained models.
save_freq = 10000
class InferTaskConfig(object):
use_gpu = True
# the number of examples in one run for sequence generation.
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
output_bos = False
output_eos = False
output_unk = True
# the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model"
class ModelHyperParams(object):
    # The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
    # size of target word dictionary.
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
# index for <eos> token
eos_idx = 1
# index for <unk> token
unk_idx = 2
# max length of sequences deciding the size of position encoding table.
max_length = 4
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 2048
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
    # number of heads used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rates of different modules.
prepostprocess_dropout = 0.1
attention_dropout = 0.1
relu_dropout = 0.1
# to process before each sub-layer
preprocess_cmd = "n" # layer normalization
# to process after each sub-layer
postprocess_cmd = "da" # dropout + residual connection
# random seed used in dropout for CE.
dropout_seed = None
# the flag indicating whether to share embedding and softmax weights.
    # Vocabularies in source and target should be the same for weight sharing.
weight_sharing = True
def merge_cfg_from_list(cfg_list, g_cfgs):
"""
Set the above global configurations using the cfg_list.
"""
assert len(cfg_list) % 2 == 0
for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
for g_cfg in g_cfgs:
if hasattr(g_cfg, key):
try:
value = eval(value)
except Exception: # for file path
pass
setattr(g_cfg, key, value)
break
def position_encoding_init(n_position, d_pos_vec):
"""
Generate the initial values for the sinusoid position encoding table.
"""
channels = d_pos_vec
position = np.arange(n_position)
num_timescales = channels // 2
log_timescale_increment = (np.log(float(1e4) / float(1)) /
(num_timescales - 1))
inv_timescales = np.exp(np.arange(
num_timescales)) * -log_timescale_increment
scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
0)
signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
position_enc = signal
return position_enc.astype("float32")
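# A minimal sanity-check sketch (the helper name below is hypothetical and not
# part of the original test): row i of the table is the sinusoid encoding for
# position i.
def _sketch_check_position_encoding():
    table = position_encoding_init(n_position=8, d_pos_vec=6)
    assert table.shape == (8, 6)
    assert table.dtype == np.float32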
def create_data(is_static=False):
if is_static:
return [
src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np,
trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np,
lbl_weight_np
]
else:
enc_inputs = [
to_variable(
src_word_np, name='src_word'), to_variable(
src_pos_np, name='src_pos'), to_variable(
src_slf_attn_bias_np, name='src_slf_attn_bias')
]
dec_inputs = [
to_variable(
trg_word_np, name='trg_word'), to_variable(
trg_pos_np, name='trg_pos'), to_variable(
trg_slf_attn_bias_np, name='trg_slf_attn_bias'),
to_variable(
trg_src_attn_bias_np, name='trg_src_attn_bias')
]
label = to_variable(lbl_word_np, name='lbl_word')
weight = to_variable(lbl_weight_np, name='lbl_weight')
return enc_inputs, dec_inputs, label, weight
def create_feed_dict_list(data, init=False):
if init:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names
else:
data_input_names = encoder_data_input_fields + \
decoder_data_input_fields[:-1] + label_data_input_fields
feed_dict_list = dict()
for i in range(len(data_input_names)):
feed_dict_list[data_input_names[i]] = data[i]
return feed_dict_list
def make_all_inputs(input_fields):
"""
Define the input data layers for the transformer model.
"""
inputs = []
for input_field in input_fields:
input_var = fluid.layers.data(
name=input_field,
shape=input_descs[input_field][0],
dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False)
inputs.append(input_var)
return inputs
# The placeholder for batch_size at compile time. It must currently be -1 to be
# consistent with some ops' infer-shape output at compile time, such as the
# sequence_expand op used in the beam-search decoder.
batch_size = -1
# The placeholder for sequence length at compile time.
seq_len = ModelHyperParams.max_length
# The data shapes and data types of all inputs are listed here. The shapes act
# as placeholders and are set so as to pass infer-shape at compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size, max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size, max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# The actual data shape of trg_word is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size, max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(batch_size * seq_len, 1), "int64"],
    # This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(batch_size * seq_len, 1), "float32"],
# This input is used in beam-search decoder.
"init_score": [(batch_size, 1), "float32", 2],
    # This input is used in the beam-search decoder for the first gather
    # (cell state update).
"init_idx": [(batch_size, ), "int32"],
}
# Names of the word embedding tables, which might be reused for weight sharing.
word_emb_param_names = (
"src_word_emb_table",
"trg_word_emb_table", )
# Names of the position encoding tables, which will be initialized externally.
pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table", )
# separated inputs for different usages.
encoder_data_input_fields = (
"src_word",
"src_pos",
"src_slf_attn_bias", )
decoder_data_input_fields = (
"trg_word",
"trg_pos",
"trg_slf_attn_bias",
"trg_src_attn_bias",
"enc_output", )
label_data_input_fields = (
"lbl_word",
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"init_idx",
"trg_src_attn_bias", )
# whether to use py_reader
use_py_reader = False
# whether to run in sync mode
sync = False
# how many batches to use
batch_num = 5
np.random.seed(90)
src_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape(
[TrainTaskConfig.batch_size, seq_len, 1]).astype('int64')
src_pos_np = np.random.randint(
1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
trg_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape(
[TrainTaskConfig.batch_size, seq_len, 1]).astype('int64')
trg_pos_np = np.random.randint(
1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
trg_src_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
ModelHyperParams.n_head, seq_len,
seq_len).astype('float32')
lbl_word_np = np.random.randint(
1,
ModelHyperParams.src_vocab_size - 1,
size=(TrainTaskConfig.batch_size * seq_len, 1),
dtype='int64')
lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len,
1).astype('float32')
pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
class PrePostProcessLayer(Layer):
def __init__(self, name_scope, process_cmd, shape_len=None):
super(PrePostProcessLayer, self).__init__(name_scope)
for cmd in process_cmd:
if cmd == "n":
self._layer_norm = LayerNorm(
name_scope=self.full_name(),
begin_norm_axis=shape_len - 1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out = self._layer_norm(out)
elif cmd == "d": # add dropout
if dropout_rate:
out = fluid.layers.dropout(
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
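    # For example, with process_cmd == "da" (the postprocess default), the
    # loop above applies dropout to `out` first and then adds the residual
    # `prev_out`: out = dropout(out) + prev_out.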
class PositionwiseFeedForwardLayer(Layer):
def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
self._i2h = FC(name_scope=self.full_name(),
size=d_inner_hid,
num_flatten_dims=2,
act="relu")
self._h2o = FC(name_scope=self.full_name(),
size=d_hid,
num_flatten_dims=2)
self._dropout_rate = dropout_rate
def forward(self, x):
hidden = self._i2h(x)
if self._dropout_rate:
hidden = fluid.layers.dropout(
hidden,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = self._h2o(hidden)
return out
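    # The block above is the standard transformer feed-forward network:
    #     FFN(x) = dropout(relu(x W1 + b1)) W2 + b2
    # with W1 mapping d_hid -> d_inner_hid and W2 mapping back to d_hid.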
class MultiHeadAttentionLayer(Layer):
def __init__(self,
name_scope,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
gather_idx=None,
static_kv=False):
super(MultiHeadAttentionLayer, self).__init__(name_scope)
self._n_head = n_head
self._d_key = d_key
self._d_value = d_value
self._d_model = d_model
self._dropout_rate = dropout_rate
self._q_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._k_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._v_fc = FC(name_scope=self.full_name(),
size=d_value * n_head,
bias_attr=False,
num_flatten_dims=2)
self._proj_fc = FC(name_scope=self.full_name(),
size=self._d_model,
bias_attr=False,
num_flatten_dims=2)
def forward(self, queries, keys, values, attn_bias):
        # compute q, k, v
keys = queries if keys is None else keys
values = keys if values is None else values
q = self._q_fc(queries)
k = self._k_fc(keys)
v = self._v_fc(values)
# split head
reshaped_q = fluid.layers.reshape(
x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
reshaped_k = fluid.layers.reshape(
x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
reshaped_v = fluid.layers.reshape(
x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scale dot product attention
product = fluid.layers.matmul(
x=transpose_q,
y=transpose_k,
transpose_y=True,
alpha=self._d_model**-0.5)
if attn_bias:
product += attn_bias
weights = fluid.layers.softmax(product)
if self._dropout_rate:
weights_droped = fluid.layers.dropout(
weights,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = fluid.layers.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
final_out = fluid.layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=False)
# fc to output
proj_out = self._proj_fc(final_out)
return proj_out
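    # The core computation above is scaled dot-product attention, applied per
    # head after the reshape/transpose split:
    #     Attention(Q, K, V) = softmax(Q K^T * d_model**-0.5 + attn_bias) V
    # (note this implementation scales by d_model rather than d_key).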
class EncoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderSubLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(), d_key, d_value, d_model, n_head,
attention_dropout)
self._postprocess_layer = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._postprocess_layer2 = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
def forward(self, enc_input, attn_bias):
pre_process_multihead = self._preprocess_layer(
None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
attn_output = self._multihead_attention_layer(pre_process_multihead,
None, None, attn_bias)
attn_output = self._postprocess_layer(enc_input, attn_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
pre_process2_output = self._preprocess_layer2(
None, attn_output, self._preprocess_cmd,
self._prepostprocess_dropout)
ffd_output = self._positionwise_feed_forward(pre_process2_output)
return self._postprocess_layer2(attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
class EncoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout
self._n_layer = n_layer
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
for i in range(n_layer):
self._encoder_sublayers.append(
self.add_sublayer(
'esl_%d' % i,
EncoderSubLayer(
self.full_name(), n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
def forward(self, enc_input, attn_bias):
for i in range(self._n_layer):
enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
enc_input = enc_output
return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
self._prepostprocess_dropout)
class PrepareEncoderDecoderLayer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
self._src_max_len = src_max_len
self._src_emb_dim = src_emb_dim
self._src_vocab_size = src_vocab_size
self._dropout_rate = dropout_rate
self._input_emb = Embedding(
name_scope=self.full_name(),
size=[src_vocab_size, src_emb_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
if pos_enc_param_name is pos_enc_param_names[0]:
pos_inp = pos_inp1
else:
pos_inp = pos_inp2
self._pos_emb = Embedding(
name_scope=self.full_name(),
size=[self._src_max_len, src_emb_dim],
param_attr=fluid.ParamAttr(
name=pos_enc_param_name,
initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
trainable=False))
        # used in dygraph mode to fit batches of different lengths:
# self._pos_emb._w = to_variable(
# position_encoding_init(self._src_max_len, self._src_emb_dim))
def forward(self, src_word, src_pos):
src_word_emb = self._input_emb(src_word)
src_word_emb = fluid.layers.scale(
x=src_word_emb, scale=self._src_emb_dim**0.5)
        # TODO: change this to fit dynamic-length input
src_pos_emb = self._pos_emb(src_pos)
src_pos_emb.stop_gradient = True
enc_input = src_word_emb + src_pos_emb
return fluid.layers.dropout(
enc_input,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if self._dropout_rate else enc_input
class WrapEncoderLayer(Layer):
    def __init__(self, name_scope, src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
postprocess_cmd, weight_sharing):
"""
        The wrapper assembles all the layers needed for the encoder.
"""
        super(WrapEncoderLayer, self).__init__(name_scope)
self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
src_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer(
self.full_name(), n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)
def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs
enc_input = self._prepare_encoder_layer(src_word, src_pos)
enc_output = self._encoder(enc_input, src_slf_attn_bias)
return enc_output
class DecoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None,
gather_idx=None):
super(DecoderSubLayer, self).__init__(name_scope)
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
self._prepostprcess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx)
self._post_process_layer = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer2 = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx,
static_kv=True)
self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
pre_process_rlt = self._pre_process_layer(
None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
None, slf_attn_bias)
slf_attn_output_pp = self._post_process_layer(
dec_input, slf_attn_output, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
self._preprocess_cmd,
self._prepostprcess_dropout)
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2(
slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprcess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd,
self._prepostprcess_dropout)
ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprcess_dropout)
return dec_output
class DecoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=None,
gather_idx=None):
super(DecoderLayer, self).__init__(name_scope)
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._decoder_sub_layers = list()
self._n_layer = n_layer
self._preprocess_cmd = preprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
for i in range(n_layer):
self._decoder_sub_layers.append(
self.add_sublayer(
'dsl_%d' % i,
DecoderSubLayer(
self.full_name(),
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None if caches is None else caches[i],
gather_idx=gather_idx)))
def forward(self, dec_input, enc_output, dec_slf_attn_bias,
dec_enc_attn_bias):
for i in range(self._n_layer):
tmp_dec_output = self._decoder_sub_layers[i](
dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
dec_input = tmp_dec_output
dec_output = self._pre_process_layer(None, tmp_dec_output,
self._preprocess_cmd,
self._prepostprocess_dropout)
return dec_output
class WrapDecoderLayer(Layer):
def __init__(self,
name_scope,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
caches=None,
gather_idx=None):
"""
        The wrapper assembles all the layers needed for the decoder.
"""
super(WrapDecoderLayer, self).__init__(name_scope)
self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
trg_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer(
self.full_name(),
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=caches,
gather_idx=gather_idx)
self._weight_sharing = weight_sharing
if not weight_sharing:
self._fc = FC(self.full_name(),
size=trg_vocab_size,
bias_attr=False)
def forward(self, dec_inputs=None, enc_output=None):
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
dec_output = self._decoder_layer(dec_input, enc_output,
trg_slf_attn_bias, trg_src_attn_bias)
dec_output_reshape = fluid.layers.reshape(
dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
if self._weight_sharing:
predict = fluid.layers.matmul(
x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb._w,
transpose_y=True)
else:
predict = self._fc(dec_output_reshape)
if dec_inputs is None:
# Return probs for independent decoder program.
predict_out = fluid.layers.softmax(predict)
return predict_out
return predict
class TransFormer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
label_smooth_eps,
use_py_reader=False,
is_test=False):
super(TransFormer, self).__init__(name_scope)
self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size
if weight_sharing:
assert src_vocab_size == trg_vocab_size, (
"Vocabularies in source and target should be same for weight sharing."
)
self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(), src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
self._wrap_decoder_layer = WrapDecoderLayer(
self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
if weight_sharing:
self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
def forward(self, enc_inputs, dec_inputs, label, weights):
enc_output = self._wrap_encoder_layer(enc_inputs)
predict = self._wrap_decoder_layer(dec_inputs, enc_output)
if self._label_smooth_eps:
label_out = fluid.layers.label_smooth(
label=fluid.layers.one_hot(
input=label, depth=self._trg_vocab_size),
epsilon=self._label_smooth_eps)
cost = fluid.layers.softmax_with_cross_entropy(
logits=predict,
label=label_out,
soft_label=True if self._label_smooth_eps else False)
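        # When label smoothing is enabled, label_smooth mixes the one-hot
        # target with a uniform distribution (the standard formulation):
        #     smoothed = (1 - eps) * one_hot + eps / trg_vocab_size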
weighted_cost = cost * weights
sum_cost = fluid.layers.reduce_sum(weighted_cost)
token_num = fluid.layers.reduce_sum(weights)
token_num.stop_gradient = True
avg_cost = sum_cost / token_num
return sum_cost, avg_cost, predict, token_num
class TestDygraphTransformer(unittest.TestCase):
def test_transformer_float32(self):
seed = 90
with guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
if sync:
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
with fluid.default_main_program()._lr_schedule_guard():
learning_rate = lr_decay * TrainTaskConfig.learning_rate
optimizer = fluid.optimizer.Adam(
learning_rate=learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
dy_param_init = dict()
dy_param_updated = dict()
for i in range(batch_num):
enc_inputs, dec_inputs, label, weights = create_data()
dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
if i == 0:
for param in transformer.parameters():
dy_param_init[param.name] = param.numpy()
dy_avg_cost.backward()
optimizer.minimize(dy_avg_cost)
transformer.clear_gradients()
if i == batch_num - 1:
for param in transformer.parameters():
dy_param_updated[param.name] = param.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
transformer = TransFormer(
'transformer',
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=use_py_reader,
is_test=False)
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
all_inputs = make_all_inputs(data_input_names)
enc_inputs_len = len(encoder_data_input_fields)
dec_inputs_len = len(decoder_data_input_fields[:-1])
enc_inputs = all_inputs[0:enc_inputs_len]
dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
dec_inputs_len]
label = all_inputs[-2]
weights = all_inputs[-1]
static_param_updated = dict()
static_param_init = dict()
static_param_name_list = list()
static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
enc_inputs, dec_inputs, label, weights)
optimizer.minimize(static_avg_cost)
for param in transformer.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init[static_param_name_list[i]] = out[i]
static_sum_cost_value = None
static_avg_cost_value = None
static_predict_value = None
static_token_num_value = None
for i in range(batch_num):
feed_dict = create_feed_dict_list(create_data(True))
fetch_list = [
static_sum_cost, static_avg_cost, static_predict,
static_token_num
]
fetch_list.extend(static_param_name_list)
out = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=fetch_list)
static_sum_cost_value = out[0]
static_avg_cost_value = out[1]
static_predict_value = out[2]
static_token_num_value = out[3]
if i == batch_num - 1:
for k in range(4, len(out)):
static_param_updated[static_param_name_list[k -
4]] = out[k]
self.assertTrue(
np.array_equal(static_avg_cost_value, dy_avg_cost.numpy()))
self.assertTrue(
np.array_equal(static_sum_cost_value, dy_sum_cost.numpy()))
self.assertTrue(
np.array_equal(static_predict_value, dy_predict.numpy()))
self.assertTrue(
np.array_equal(static_token_num_value, dy_token_num.numpy()))
for key, value in six.iteritems(static_param_init):
self.assertTrue(np.array_equal(value, dy_param_init[key]))
for key, value in six.iteritems(static_param_updated):
self.assertTrue(np.array_equal(value, dy_param_updated[key]))
if __name__ == '__main__':
unittest.main()
......@@ -20,12 +20,144 @@ from paddle.fluid import Embedding, LayerNorm, FC, Layer
from paddle.fluid.dygraph import to_variable, guard
from test_imperative_base import new_program_scope
from paddle.fluid import core
from test_imperative_transformer import TransFormer, TrainTaskConfig, ModelHyperParams
import numpy as np
import six
np.set_printoptions(suppress=True)
# Copy from models
class TrainTaskConfig(object):
# support both CPU and GPU now.
use_gpu = True
# the epoch number to train.
pass_num = 30
# the number of sequences contained in a mini-batch.
# deprecated, set batch_size in args.
batch_size = 32
# the hyper parameters for Adam optimizer.
    # This static learning_rate will be multiplied by the learning rate derived
    # from the LearningRateScheduler to get the final learning rate.
learning_rate = 2.0
beta1 = 0.9
beta2 = 0.997
eps = 1e-9
# the parameters for learning rate scheduling.
warmup_steps = 8000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
label_smooth_eps = 0.1
# the directory for saving trained models.
model_dir = "trained_models"
# the directory for saving checkpoints.
ckpt_dir = "trained_ckpts"
# the directory for loading checkpoint.
# If provided, continue training from the checkpoint.
ckpt_path = None
# the parameter to initialize the learning rate scheduler.
# It should be provided if checkpoints are used, since the checkpoint
# doesn't include the training step counter currently.
start_step = 0
# the frequency to save trained models.
save_freq = 10000
class InferTaskConfig(object):
use_gpu = True
# the number of examples in one run for sequence generation.
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
output_bos = False
output_eos = False
output_unk = True
# the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model"
class ModelHyperParams(object):
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
# size of target word dictionary.
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
# index for <eos> token
eos_idx = 1
# index for <unk> token
unk_idx = 2
# max length of sequences; it decides the size of the position encoding table.
max_length = 4
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
# networks, encoder and decoder.
d_model = 512
# size of the hidden layer in position-wise feed-forward networks.
d_inner_hid = 2048
# the dimension that keys are projected to for dot-product attention.
d_key = 64
# the dimension that values are projected to for dot-product attention.
d_value = 64
# number of heads used in multi-head attention.
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rates of different modules.
prepostprocess_dropout = 0.1
attention_dropout = 0.1
relu_dropout = 0.1
# to process before each sub-layer
preprocess_cmd = "n" # layer normalization
# to process after each sub-layer
postprocess_cmd = "da" # dropout + residual connection
# random seed used in dropout for CE (continuous evaluation).
dropout_seed = None
# the flag indicating whether to share embedding and softmax weights.
# Vocabularies in source and target should be the same for weight sharing.
weight_sharing = True
def merge_cfg_from_list(cfg_list, g_cfgs):
"""
Set the above global configurations using the cfg_list.
"""
assert len(cfg_list) % 2 == 0
for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
for g_cfg in g_cfgs:
if hasattr(g_cfg, key):
try:
value = eval(value)
except Exception: # for file path
pass
setattr(g_cfg, key, value)
break
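# Usage sketch (the values here are hypothetical, not from this test):
#   merge_cfg_from_list(["batch_size", "64", "model_dir", "my_models"],
#                       [TrainTaskConfig, ModelHyperParams])
# "64" is eval'ed to the int 64, while "my_models" raises NameError inside
# eval and is therefore kept as a plain string, which is the intent for paths.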
def position_encoding_init(n_position, d_pos_vec):
"""
Generate the initial values for the sinusoid position encoding table.
"""
channels = d_pos_vec
position = np.arange(n_position)
num_timescales = channels // 2
log_timescale_increment = (np.log(float(1e4) / float(1)) /
(num_timescales - 1))
inv_timescales = np.exp(np.arange(
num_timescales)) * -log_timescale_increment
scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
0)
signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
position_enc = signal
return position_enc.astype("float32")
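# A quick self-check of the table built above (the helper name is ours, not
# part of the original file): the first d/2 channels hold sin(p * w_i) and the
# last d/2 hold cos(p * w_i), with timescales w_i spaced geometrically from 1
# down to 1e-4, so position 0 maps to zeros followed by ones.
def _check_position_encoding():
    pe = position_encoding_init(4, 8)
    assert pe.shape == (4, 8)
    np.testing.assert_allclose(pe[0, :4], 0., atol=1e-6)  # sin(0) == 0
    np.testing.assert_allclose(pe[0, 4:], 1., atol=1e-6)  # cos(0) == 1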
def create_data(is_static=False):
if is_static:
return [
......@@ -208,6 +340,598 @@ lbl_word_np = np.random.randint(
lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len,
1).astype('float32')
pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams.d_model)
class PrePostProcessLayer(Layer):
def __init__(self, name_scope, process_cmd, shape_len=None):
super(PrePostProcessLayer, self).__init__(name_scope)
for cmd in process_cmd:
if cmd == "n":
self._layer_norm = LayerNorm(
name_scope=self.full_name(),
begin_norm_axis=shape_len - 1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = (out + prev_out) if prev_out else out
elif cmd == "n": # add layer normalization
out = self._layer_norm(out)
elif cmd == "d": # add dropout
if dropout_rate:
out = fluid.layers.dropout(
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
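# Illustration of the cmd strings (not part of the original file): a
# pre-process call with "n" simply layer-normalizes its input, while a
# post-process call with "da" applies dropout to the sub-layer output and then
# adds the residual, i.e.
#   out = prev_out + dropout(sublayer_out)
# which is the pre-norm Transformer arrangement used throughout this test.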
class PositionwiseFeedForwardLayer(Layer):
def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
self._i2h = FC(name_scope=self.full_name(),
size=d_inner_hid,
num_flatten_dims=2,
act="relu")
self._h2o = FC(name_scope=self.full_name(),
size=d_hid,
num_flatten_dims=2)
self._dropout_rate = dropout_rate
def forward(self, x):
hidden = self._i2h(x)
if self._dropout_rate:
hidden = fluid.layers.dropout(
hidden,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = self._h2o(hidden)
return out
class MultiHeadAttentionLayer(Layer):
def __init__(self,
name_scope,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
gather_idx=None,
static_kv=False):
super(MultiHeadAttentionLayer, self).__init__(name_scope)
self._n_head = n_head
self._d_key = d_key
self._d_value = d_value
self._d_model = d_model
self._dropout_rate = dropout_rate
self._q_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._k_fc = FC(name_scope=self.full_name(),
size=d_key * n_head,
bias_attr=False,
num_flatten_dims=2)
self._v_fc = FC(name_scope=self.full_name(),
size=d_value * n_head,
bias_attr=False,
num_flatten_dims=2)
self._proj_fc = FC(name_scope=self.full_name(),
size=self._d_model,
bias_attr=False,
num_flatten_dims=2)
def forward(self, queries, keys, values, attn_bias):
# compute q, k, v
keys = queries if keys is None else keys
values = keys if values is None else values
q = self._q_fc(queries)
k = self._k_fc(keys)
v = self._v_fc(values)
# split head
reshaped_q = fluid.layers.reshape(
x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
reshaped_k = fluid.layers.reshape(
x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
reshaped_v = fluid.layers.reshape(
x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
# scaled dot-product attention
product = fluid.layers.matmul(
x=transpose_q,
y=transpose_k,
transpose_y=True,
alpha=self._d_model**-0.5)
if attn_bias:
product += attn_bias
weights = fluid.layers.softmax(product)
if self._dropout_rate:
weights_droped = fluid.layers.dropout(
weights,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = fluid.layers.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
final_out = fluid.layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=False)
# fc to output
proj_out = self._proj_fc(final_out)
return proj_out
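# Per head, the forward pass above computes
#   attention(Q, K, V) = softmax(Q K^T * d_model**-0.5 + attn_bias) V
# Note the scale is d_model**-0.5 here; the reference Transformer scales by
# d_key**-0.5, and with the hyper-parameters above (d_model=512, d_key=64)
# the two differ, which matters when comparing against other implementations.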
class EncoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderSubLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(), d_key, d_value, d_model, n_head,
attention_dropout)
self._postprocess_layer = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._postprocess_layer2 = PrePostProcessLayer(
self.full_name(), self._postprocess_cmd, None)
def forward(self, enc_input, attn_bias):
pre_process_multihead = self._preprocess_layer(
None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
attn_output = self._multihead_attention_layer(pre_process_multihead,
None, None, attn_bias)
attn_output = self._postprocess_layer(enc_input, attn_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
pre_process2_output = self._preprocess_layer2(
None, attn_output, self._preprocess_cmd,
self._prepostprocess_dropout)
ffd_output = self._positionwise_feed_forward(pre_process2_output)
return self._postprocess_layer2(attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
class EncoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(EncoderLayer, self).__init__(name_scope)
self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout
self._n_layer = n_layer
self._preprocess_layer = PrePostProcessLayer(self.full_name(),
self._preprocess_cmd, 3)
for i in range(n_layer):
self._encoder_sublayers.append(
self.add_sublayer(
'esl_%d' % i,
EncoderSubLayer(
self.full_name(), n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
def forward(self, enc_input, attn_bias):
for i in range(self._n_layer):
enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
enc_input = enc_output
return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
self._prepostprocess_dropout)
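# With preprocess_cmd="n" and postprocess_cmd="da" each sub-layer sees a
# normalized input and adds its residual afterwards (pre-norm), so the stack
# output itself is un-normalized; the trailing _preprocess_layer call above
# supplies the final LayerNorm.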
class PrepareEncoderDecoderLayer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
src_emb_dim,
src_max_len,
dropout_rate,
word_emb_param_name=None,
pos_enc_param_name=None):
super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
self._src_max_len = src_max_len
self._src_emb_dim = src_emb_dim
self._src_vocab_size = src_vocab_size
self._dropout_rate = dropout_rate
self._input_emb = Embedding(
name_scope=self.full_name(),
size=[src_vocab_size, src_emb_dim],
padding_idx=0,
param_attr=fluid.ParamAttr(
name=word_emb_param_name,
initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
if pos_enc_param_name == pos_enc_param_names[0]:
pos_inp = pos_inp1
else:
pos_inp = pos_inp2
self._pos_emb = Embedding(
name_scope=self.full_name(),
size=[self._src_max_len, src_emb_dim],
param_attr=fluid.ParamAttr(
name=pos_enc_param_name,
initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
trainable=False))
# use in dygraph_mode to fit different length batch
# self._pos_emb._w = to_variable(
# position_encoding_init(self._src_max_len, self._src_emb_dim))
def forward(self, src_word, src_pos):
src_word_emb = self._input_emb(src_word)
src_word_emb = fluid.layers.scale(
x=src_word_emb, scale=self._src_emb_dim**0.5)
# TODO: change this to fit dynamic-length input
src_pos_emb = self._pos_emb(src_pos)
src_pos_emb.stop_gradient = True
enc_input = src_word_emb + src_pos_emb
return fluid.layers.dropout(
enc_input,
dropout_prob=self._dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if self._dropout_rate else enc_input
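# Data flow of the forward pass above, in one line:
#   enc_input = dropout(word_emb(src_word) * sqrt(d_model) + pos_table[src_pos])
# The sqrt(d_model) scale undoes the Normal(0, d_model**-0.5) initialization,
# so the word embeddings match the unit-scale (and frozen) position table.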
class WrapEncoderLayer(Layer):
def __init__(self, name_scope, src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
postprocess_cmd, weight_sharing):
"""
The wrapper assembles all the layers needed for the encoder.
"""
super(WrapEncoderLayer, self).__init__(name_scope)
self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
src_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[0],
pos_enc_param_name=pos_enc_param_names[0])
self._encoder = EncoderLayer(
self.full_name(), n_layer, n_head, d_key, d_value, d_model,
d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)
def forward(self, enc_inputs):
src_word, src_pos, src_slf_attn_bias = enc_inputs
enc_input = self._prepare_encoder_layer(src_word, src_pos)
enc_output = self._encoder(enc_input, src_slf_attn_bias)
return enc_output
class DecoderSubLayer(Layer):
def __init__(self,
name_scope,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None,
gather_idx=None):
super(DecoderSubLayer, self).__init__(name_scope)
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx)
self._post_process_layer = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._multihead_attention_layer2 = MultiHeadAttentionLayer(
self.full_name(),
d_key,
d_value,
d_model,
n_head,
attention_dropout,
cache=cache,
gather_idx=gather_idx,
static_kv=True)
self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
self.full_name(), d_inner_hid, d_model, relu_dropout)
self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
postprocess_cmd, None)
def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
pre_process_rlt = self._pre_process_layer(
None, dec_input, self._preprocess_cmd, self._prepostprocess_dropout)
slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
None, slf_attn_bias)
slf_attn_output_pp = self._post_process_layer(
dec_input, slf_attn_output, self._postprocess_cmd,
self._prepostprocess_dropout)
pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
self._preprocess_cmd,
self._prepostprocess_dropout)
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
enc_attn_output = self._post_process_layer2(
slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
self._prepostprocess_dropout)
pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
self._preprocess_cmd,
self._prepostprocess_dropout)
ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
self._postprocess_cmd,
self._prepostprocess_dropout)
return dec_output
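# A decoder sub-layer chains three blocks through the same pre/post
# processing: masked self-attention over dec_input, then cross-attention with
# queries from the decoder and keys/values both set to enc_output, then the
# position-wise feed-forward network.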
class DecoderLayer(Layer):
def __init__(self,
name_scope,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=None,
gather_idx=None):
super(DecoderLayer, self).__init__(name_scope)
self._pre_process_layer = PrePostProcessLayer(self.full_name(),
preprocess_cmd, 3)
self._decoder_sub_layers = list()
self._n_layer = n_layer
self._preprocess_cmd = preprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
for i in range(n_layer):
self._decoder_sub_layers.append(
self.add_sublayer(
'dsl_%d' % i,
DecoderSubLayer(
self.full_name(),
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
cache=None if caches is None else caches[i],
gather_idx=gather_idx)))
def forward(self, dec_input, enc_output, dec_slf_attn_bias,
dec_enc_attn_bias):
for i in range(self._n_layer):
tmp_dec_output = self._decoder_sub_layers[i](
dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
dec_input = tmp_dec_output
dec_output = self._pre_process_layer(None, tmp_dec_output,
self._preprocess_cmd,
self._prepostprocess_dropout)
return dec_output
class WrapDecoderLayer(Layer):
def __init__(self,
name_scope,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
caches=None,
gather_idx=None):
"""
The wrapper assembles all the layers needed for the decoder.
"""
super(WrapDecoderLayer, self).__init__(name_scope)
self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
self.full_name(),
trg_vocab_size,
d_model,
max_length,
prepostprocess_dropout,
word_emb_param_name=word_emb_param_names[1],
pos_enc_param_name=pos_enc_param_names[1])
self._decoder_layer = DecoderLayer(
self.full_name(),
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
caches=caches,
gather_idx=gather_idx)
self._weight_sharing = weight_sharing
if not weight_sharing:
self._fc = FC(self.full_name(),
size=trg_vocab_size,
bias_attr=False)
def forward(self, dec_inputs=None, enc_output=None):
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
dec_output = self._decoder_layer(dec_input, enc_output,
trg_slf_attn_bias, trg_src_attn_bias)
dec_output_reshape = fluid.layers.reshape(
dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
if self._weight_sharing:
predict = fluid.layers.matmul(
x=dec_output_reshape,
y=self._prepare_decoder_layer._input_emb._w,
transpose_y=True)
else:
predict = self._fc(dec_output_reshape)
if dec_inputs is None:
# Return probs for independent decoder program.
predict_out = fluid.layers.softmax(predict)
return predict_out
return predict
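# With weight_sharing the softmax projection reuses the transposed input
# embedding matrix instead of a separate FC, halving the output-layer
# parameters. Note the `if dec_inputs is None` branch above is dead in this
# test: dec_inputs is tuple-unpacked at the top of forward, so passing None
# would raise before reaching it.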
class TransFormer(Layer):
def __init__(self,
name_scope,
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
weight_sharing,
label_smooth_eps,
use_py_reader=False,
is_test=False):
super(TransFormer, self).__init__(name_scope)
self._label_smooth_eps = label_smooth_eps
self._trg_vocab_size = trg_vocab_size
if weight_sharing:
assert src_vocab_size == trg_vocab_size, (
"Vocabularies in source and target should be same for weight sharing."
)
self._wrap_encoder_layer = WrapEncoderLayer(
self.full_name(), src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
self._wrap_decoder_layer = WrapDecoderLayer(
self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
weight_sharing)
if weight_sharing:
self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
def forward(self, enc_inputs, dec_inputs, label, weights):
enc_output = self._wrap_encoder_layer(enc_inputs)
predict = self._wrap_decoder_layer(dec_inputs, enc_output)
if self._label_smooth_eps:
label_out = fluid.layers.label_smooth(
label=fluid.layers.one_hot(
input=label, depth=self._trg_vocab_size),
epsilon=self._label_smooth_eps)
cost = fluid.layers.softmax_with_cross_entropy(
logits=predict,
label=label_out,
soft_label=True if self._label_smooth_eps else False)
weighted_cost = cost * weights
sum_cost = fluid.layers.reduce_sum(weighted_cost)
token_num = fluid.layers.reduce_sum(weights)
token_num.stop_gradient = True
avg_cost = sum_cost / token_num
return sum_cost, avg_cost, predict, token_num
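# Loss sketch (a plain-numpy mirror of the smoothing step above; the helper
# and its arguments are our names): label smoothing mixes the one-hot target
# with a uniform distribution over the trg_vocab_size classes, and the
# per-token cross-entropy is then weighted, summed, and normalized by the
# gradient-stopped token count to give avg_cost.
def _smooth_labels(onehot, eps):
    # y_smooth = (1 - eps) * onehot + eps / V, matching
    # fluid.layers.label_smooth with no prior distribution
    return (1. - eps) * onehot + eps / onehot.shape[-1]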
class TestDygraphTransformerSortGradient(unittest.TestCase):
def test_transformer_sort_gradient_float32(self):
......
......@@ -16,3 +16,4 @@ funcsigs
pyyaml
decorator
prettytable
objgraph