diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b3b140936600546267c10d0d7ea5feb98e672a10..98f1147400bbb4e5a490c8c3600521bee95f614b 100755 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -820,11 +820,11 @@ paddle.fluid.dygraph.TreeConv.state_dict (ArgSpec(args=['self', 'destination', ' paddle.fluid.dygraph.TreeConv.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) paddle.fluid.dygraph.TreeConv.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer ('paddle.fluid.dygraph.tracer.Tracer', ('document', '28d72409112111274c33e1f07229d5da')) -paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self', 'block'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer.eval_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None 2. 
trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None -paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'op', 'inputs', 'outputs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None +paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'type', 'inputs', 'outputs', 'attrs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer.trace_var (ArgSpec(args=['self', 'name', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 73c629fd227aee0bf90c4049a2f66f717e939984..1a1deef963542b1d16978d314d126585af5c07b8 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,10 +1,11 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags) -if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator 
device_context blas pybind profiler imperative_flag) -cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler) -cc_library(engine SRCS engine.cc) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits) +cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows var_type_traits layer) +cc_library(tracer SRCS tracer.cc DEPS layer engine) +cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) cc_library(nccl_context SRCS nccl_context.cc DEPS device_context) -cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) -endif() + +add_subdirectory(tests) diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/imperative/backward_strategy.h index 9ff07d6d797ca88d07c57514545064f26426eaf9..0f04d6db8e63d5d069745ed1895df774e69d60d0 100644 --- a/paddle/fluid/imperative/backward_strategy.h +++ b/paddle/fluid/imperative/backward_strategy.h @@ -16,17 +16,12 @@ // Created by Jiabin on 2019-04-25. 
// #pragma once -#ifndef PADDLE_BACKWARDSTRATEGY_H -#define PADDLE_BACKWARDSTRATEGY_H - -#endif // PADDLE_BACKWARDSTRATEGY_H namespace paddle { namespace imperative { namespace detail { -class BackwardStrategy { - public: +struct BackwardStrategy { /* DyGraph now support two kinds of backward strategy, one is sorted sum * gradient, another is sum gradient once they are created */ // TODO(jiabin): add more Strategy when we support diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc index de7ab0e5918281579728ef48d1517be2cd530af7..158ba7a2a1061cf43da3538a3b830d35c2f5b982 100644 --- a/paddle/fluid/imperative/engine.cc +++ b/paddle/fluid/imperative/engine.cc @@ -14,40 +14,219 @@ #include "paddle/fluid/imperative/engine.h" -#include // NOLINT +#include +#include +#include +#include +#include +#include #include - -#include "glog/logging.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace imperative { -static std::once_flag init_engine; -static Engine* engine; +void Engine::RunOp(paddle::imperative::OpBase* op, + const paddle::imperative::NameVarBaseMap& ins, + const paddle::imperative::NameVarBaseMap& outs, + const paddle::platform::Place& place) { + platform::RecordEvent event(op->Type()); + + op->Run(ins, outs); +} -class DummyEngine : public Engine { - public: - void Enqueue(Runnable* runnable) override { - queued_runnables_.push_back(runnable); +void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) { + backward_strategy_ = strategy; + const std::vector ops = var->GradVarBase()->GradOps(); + var->ClearGradOps(); + + if (ops.empty()) { + VLOG(3) << "Skip auto grad since there is no grad op for var: " + << var->Name(); + return; + } else { + bool valid = false; + for (const auto& op : ops) { 
+ if (op) { + valid = true; + } + } + if (!valid) { + VLOG(3) << "Skip auto grad since all grad op of start VarBase is nullptr"; + return; + } } + init_ops_ = ops; + platform::RecordEvent record_event("Imperative Backward"); + VLOG(3) << "start backward"; + + PADDLE_ENFORCE_EQ(var->HasGradVar(), true, + "Grad variable not exist for variable %s", var->Name()); - size_t Size() const override { return queued_runnables_.size(); } + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); +} - void Sync() override { - for (Runnable* l : queued_runnables_) { - LOG(INFO) << "running " << reinterpret_cast(l); +bool BasicEngine::CheckBackwardInputs(OpBase* op) { + for (auto& pair : op->GetInsMap()) { + for (auto& var : pair.second) { + if (var && !var->StopGradient()) { + return true; + } } - queued_runnables_.clear(); + } + return false; +} + +void BasicEngine::PrepareGradAccumulators(OpBase* op) { + for (const auto& pair : op->GetOutsMap()) { + for (const auto& var : pair.second) { + if (!var) continue; + + auto& accumulator = accumulators_[var.get()]; + if (!accumulator) { + if (backward_strategy_.sorted_sum_gradient_) { + accumulator.reset(new SortedGradientAccumulator(var.get())); + } else { + accumulator.reset(new EagerGradientAccumulator(var.get())); + } + } + + accumulator->IncreaseRefCnt(); + + VLOG(3) << "Prepare to acccumulate variable grad " << var->Name() + << "with reference count " << accumulator->RefCnt(); + } + } +} + +void BasicEngine::PrepareDeps() { + PADDLE_ENFORCE_EQ(op_deps_.empty(), true, "Op deps must be initialized here"); + PADDLE_ENFORCE_EQ(accumulators_.empty(), true, + "Accumulators must be initialized here"); + + std::queue q; + std::unordered_set visited; + for (const 
auto& init_op : init_ops_) { + q.push(init_op); + visited.insert(init_op); } - private: - std::vector queued_runnables_; -}; + while (!q.empty()) { + auto* cur_op = q.front(); + q.pop(); + VLOG(3) << "Checking grads of op " << cur_op->Type(); -Engine* GetEngine() { - std::call_once(init_engine, []() { engine = new DummyEngine(); }); - return engine; + if (!CheckBackwardInputs(cur_op)) { + // TODO(zjl): clear ops that do not need grad before running autograd + VLOG(3) << "Stop checking preceding ops of " << cur_op->Type() + << " because all of its backward inputs is stop_gradient=True"; + continue; + } + + PrepareGradAccumulators(cur_op); + + auto& preceding_ops = cur_op->GradPendingOps(); + for (auto* preceding_op : preceding_ops) { + PADDLE_ENFORCE_NOT_NULL(preceding_op); + ++op_deps_[preceding_op]; + if (visited.count(preceding_op) == 0) { + visited.insert(preceding_op); + q.push(preceding_op); + } + } + } } +void BasicEngine::SumGradient(OpBase* op, std::shared_ptr src, + VarBase* dst) { + auto iter = accumulators_.find(dst); + PADDLE_ENFORCE_EQ(iter != accumulators_.end(), true, + "Cannot find gradient of variable %s", dst->Name()); + iter->second->Add(std::move(src), op->id()); +} +void BasicEngine::Execute() { + PrepareDeps(); + // Start execute Computation graph + std::queue q; + for (const auto& init_op : init_ops_) { + q.push(init_op); + } + while (!q.empty()) { + OpBase* cur_op = q.front(); + q.pop(); + + // Step 1: Run Backward + auto& bwd_ins = cur_op->GetInsMap(); + auto& bwd_outs = cur_op->GetOutsMap(); + + NameVarBaseMap tmp_outs; + // A var may be coresponding to several grad var in one op + std::unordered_map>> var_map; + size_t counter = 0; + for (auto& bwd_out : bwd_outs) { + auto& tmp_var_list = tmp_outs[bwd_out.first]; + tmp_var_list.reserve(bwd_out.second.size()); + for (auto& var : bwd_out.second) { + auto tmp_var = std::make_shared( + false, "Gtmp@" + std::to_string(counter++)); // Do not need grad + tmp_var_list.emplace_back(tmp_var); + if 
(var) { + var_map[var.get()].emplace_back(std::move(tmp_var)); + var->ClearGradOps(); + } + } + } + + VLOG(3) << "Start to execute grad op " << cur_op->Type(); + RunOp(cur_op, bwd_ins, tmp_outs, cur_op->place()); + // Step 2: Sum Gradient + { + platform::RecordEvent record_event("merge_grads"); + for (auto& var_pair : var_map) { + auto* dst_var = var_pair.first; + if (dst_var == nullptr) continue; + for (auto& src_var : var_pair.second) { + VLOG(3) << "Sum gradient of variable " << dst_var->Name() + << " after op " << cur_op->Type(); + SumGradient(cur_op, std::move(src_var), dst_var); + } + } + } + + // Step 3: Collect ready ops + for (auto* preceding_op : cur_op->GradPendingOps()) { + PADDLE_ENFORCE_NOT_NULL(preceding_op); + auto iter = op_deps_.find(preceding_op); + if (iter == op_deps_.end()) { + continue; + } + + VLOG(3) << "Found preceding op of " << cur_op->Type(); + // An Op is ready to go while its deps comes to zero + + if (--(iter->second) == 0) { + q.push(preceding_op); + VLOG(3) << "Push preceding op " << preceding_op->Type() + << " into queue"; + } + } + + // Step 4: Delete op to collect unused variables + VLOG(3) << "Remove op after op " << cur_op->Type() << " runs"; + RemoveOp(cur_op); + } + VLOG(3) << "Clean properties of BasicEngine"; + CleanEngine(); +} } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h index a1dfa5bda38d0c419aa4ccbea77b32eb7e0d5b23..efc005c7f3030ab399a2345a2d4b1cec59b98137 100644 --- a/paddle/fluid/imperative/engine.h +++ b/paddle/fluid/imperative/engine.h @@ -16,24 +16,80 @@ #include #include +#include +#include +#include +#include +#include "paddle/fluid/imperative/backward_strategy.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" +#include "paddle/fluid/imperative/layer.h" namespace paddle { namespace imperative { -struct Runnable {}; - +// It seems there is no need for Engine to be an +// singleton, we can have multi-engine to run +// 
mutil-graoh. For future use we may expose a interface +// to Python to support class Engine { public: - virtual ~Engine() {} + virtual ~Engine() = default; + virtual void Execute() = 0; + virtual void Init(VarBase* var, const detail::BackwardStrategy& strategy) = 0; + virtual void RunOp(imperative::OpBase* op, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, const platform::Place& place); - virtual void Enqueue(Runnable* runnable) = 0; + virtual void RemoveOp(OpBase* op) { + PADDLE_ENFORCE_NOT_NULL(op, "Cannot remove null op"); + auto iter = grad_ops_.find(op); + PADDLE_ENFORCE_EQ(iter != grad_ops_.end(), true, "Op is not inside tracer"); + grad_ops_.erase(iter); + } - virtual size_t Size() const = 0; + void InsertOp(OpBase* op, std::shared_ptr op_shared) { + grad_ops_[op] = std::move(op_shared); + } + void Clear() { grad_ops_.clear(); } - virtual void Sync() = 0; + private: + std::unordered_map> + grad_ops_; // opBase for remove - grad_op }; -Engine* GetEngine(); +class BasicEngine : public Engine { + public: + BasicEngine() = default; + + void Init(VarBase* var, const detail::BackwardStrategy& strategy) override; + + ~BasicEngine() override = default; + + void Execute() override; + + private: + void PrepareDeps(); + + bool CheckBackwardInputs(OpBase* op); + + void PrepareGradAccumulators(OpBase* op); + + void SumGradient(OpBase* op, std::shared_ptr src, VarBase* dst); + + // TODO(jiabin): maybe we can optimize the performance of engine by cache the + // result + void CleanEngine() { + init_ops_.clear(); + op_deps_.clear(); + accumulators_.clear(); + Clear(); + } + + std::vector init_ops_; + detail::BackwardStrategy backward_strategy_; + std::unordered_map op_deps_; + std::unordered_map> + accumulators_; +}; } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc new file mode 100644 index 
0000000000000000000000000000000000000000..7f424dcfbc04603547c8b8ca24b43b59939063a1 --- /dev/null +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/gradient_accumulator.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace imperative { + +template +class TensorAddFunctor : public boost::static_visitor<> { + public: + TensorAddFunctor(int64_t numel, const T* x, T* y) + : numel_(numel), x_(x), y_(y) {} + + void operator()(const platform::CPUPlace& place) { + platform::CPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } + +#ifdef PADDLE_WITH_CUDA + void operator()(const platform::CUDAPlace& place) { + platform::CUDADeviceContext* ctx = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } +#else + void operator()(const 
platform::CUDAPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } +#endif + + // there is NO blas in CUDAPinnedPlace + void operator()(const platform::CUDAPinnedPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } + + private: + int64_t numel_; + const T* x_; + T* y_; +}; + +void TensorAdd(const framework::Variable& src, framework::Variable* dst) { + auto* dst_tensor = dst->GetMutable(); + auto& src_tensor = src.Get(); + + auto numel = src_tensor.numel(); + + // FIXME(minqiyang): loss_grad op will pass a zero grad of label + // ugly fix for it + if (numel == 0) { + return; + } + + PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true, + "dst_numel %d vs. src_numel %d", dst_tensor->numel(), + numel); + + auto data_type = src_tensor.type(); + auto place = src_tensor.place(); + +#define PADDLE_TENSOR_ADD_MACRO(cpp_type) \ + if (data_type == framework::DataTypeTrait::DataType()) { \ + TensorAddFunctor func( \ + numel, src_tensor.data(), \ + dst_tensor->mutable_data(place)); \ + boost::apply_visitor(func, place); \ + return; \ + } + + PADDLE_TENSOR_ADD_MACRO(float); + PADDLE_TENSOR_ADD_MACRO(double); + +#undef PADDLE_TENSOR_ADD_MACRO + + PADDLE_THROW("Not supported data type %s for AddTo", + framework::DataTypeToString(data_type)); +} + +void EagerGradientAccumulator::Add(std::shared_ptr var, + size_t trace_id) { + auto* dst_var = var_->MutableVar(); + if (cur_cnt_ == 0) { + *dst_var = std::move(*(var->MutableVar())); + } else { + TensorAdd(var->Var(), dst_var); + } + ++cur_cnt_; +} + +void SortedGradientAccumulator::Add(std::shared_ptr var, + size_t trace_id) { + auto* dst_var = var_->MutableVar(); + if (ref_cnt_ == 1) { + *dst_var = std::move(*(var->MutableVar())); + } else { + if (tmp_grad_vars_.empty()) { + tmp_grad_vars_.reserve(ref_cnt_); + } + + tmp_grad_vars_.emplace_back(std::move(var), trace_id); + + if (tmp_grad_vars_.size() != ref_cnt_) { + return; + } + + std::sort(tmp_grad_vars_.begin(), 
tmp_grad_vars_.end(), + [](const std::pair, size_t>& p1, + const std::pair, size_t>& p2) { + return p1.second > p2.second; + }); + + *dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar())); + for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) { + TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var); + } + + tmp_grad_vars_.clear(); + } +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h new file mode 100644 index 0000000000000000000000000000000000000000..d4980496b266f08273108c5f98be7d4520678b29 --- /dev/null +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/imperative/layer.h" + +namespace paddle { +namespace imperative { + +class GradientAccumulator { + public: + explicit GradientAccumulator(VarBase* var) : var_(var) {} + + virtual void Add(std::shared_ptr var, size_t trace_id) = 0; + + virtual ~GradientAccumulator() = default; + + inline void IncreaseRefCnt() { ++ref_cnt_; } + + inline size_t RefCnt() const { return ref_cnt_; } + + protected: + VarBase* var_; + size_t ref_cnt_{0}; +}; + +class EagerGradientAccumulator : public GradientAccumulator { + public: + using GradientAccumulator::GradientAccumulator; + + void Add(std::shared_ptr var, size_t trace_id) override; + + private: + size_t cur_cnt_{0}; +}; + +class SortedGradientAccumulator : public GradientAccumulator { + public: + using GradientAccumulator::GradientAccumulator; + + void Add(std::shared_ptr var, size_t trace_id) override; + + private: + std::vector, size_t>> tmp_grad_vars_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 1f7374e4126966e644883fc88eec37389ef16b08..bed49f0d12c2da1fa74db93f12f4e88873481e18 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,27 +13,21 @@ // limitations under the License. 
#include "paddle/fluid/imperative/layer.h" - #include -#include -#include -#include -#include -#include +#include #include - -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/imperative/prepared_operator.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/string/printf.h" namespace paddle { namespace imperative { +using framework::Variable; void ThreadSafeNameSet::Insert(const std::string& name) { std::lock_guard guard(mtx_); set_.insert(name); @@ -42,7 +36,7 @@ void ThreadSafeNameSet::Insert(const std::string& name) { void ThreadSafeNameSet::Remove(const std::string& name) { std::lock_guard guard(mtx_); auto iter = set_.find(name); - PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name); + PADDLE_ENFORCE_EQ(iter != set_.end(), true, "%s does not exist", name); set_.erase(iter); } @@ -55,222 +49,161 @@ ThreadSafeNameSet VarBase::name_set_; std::vector VarBase::AliveVarNames() { return name_set_.Names(); } -using framework::Variable; - -namespace detail { - -template -class TensorAddToFunctor : public boost::static_visitor<> { - public: - TensorAddToFunctor(int64_t numel, const T* x, T* y) - : numel_(numel), x_(x), y_(y) {} - - void operator()(const platform::CPUPlace& place) { - platform::CPUDeviceContext* ctx = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto blas = operators::math::GetBlas(*ctx); - blas.AXPY(numel_, 1., x_, y_); +static framework::VariableNameMap CreateVarNameMap( + const framework::OpInfo& op_info, const std::string& op_type, + const NameVarBaseMap& varbase_map, bool is_input) { + if 
(op_info.proto_ == nullptr) { + return {}; } -#ifdef PADDLE_WITH_CUDA - void operator()(const platform::CUDAPlace& place) { - platform::CUDADeviceContext* ctx = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto blas = operators::math::GetBlas(*ctx); - blas.AXPY(numel_, 1., x_, y_); - } -#else - void operator()(const platform::CUDAPlace& place) { - PADDLE_THROW("Do NOT support gradient merge in place %s", place); + framework::VariableNameMap result; + + for (auto& var : + is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) { + auto it = varbase_map.find(var.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE_EQ( + var.dispensable(), true, + "Var: %s not dispensable and there are no such var in inputs", + var.name()); + result[var.name()] = {}; + } else { + auto& var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (auto& var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[var.name()] = std::move(args); + } } -#endif + return result; +} - // there is NO blas in CUDAPinnedPlace - void operator()(const platform::CUDAPinnedPlace& place) { - PADDLE_THROW("Do NOT support gradient merge in place %s", place); +static framework::RuntimeContext PrepareRuntimeContext( + const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + framework::VariableValueMap inputs, outputs; + for (auto& in_pair : ins) { + auto& in_ctx = inputs[in_pair.first]; + in_ctx.reserve(in_pair.second.size()); + for (auto& in_var : in_pair.second) { + in_ctx.emplace_back(in_var->MutableVar()); + } } - private: - int64_t numel_; - const T* x_; - T* y_; -}; - -} // namespace detail - -void AddTo(std::shared_ptr src, std::shared_ptr dst, - platform::Place place, GradientRef* grad_ref) { - PADDLE_ENFORCE(grad_ref->find(dst.get()) != grad_ref->end(), - "gradient %s are not found in grad_ref", dst->Name()); - if ((*grad_ref)[dst.get()].second) { - PADDLE_ENFORCE(src->IsInitialize(), "Using uninitialized 
VarBase"); - dst->var_ = std::move(src->var_); - (*grad_ref)[dst.get()].second = false; - if (!dst->IsInitialize()) { - dst->SetInitialize(true); - } - return; - } else { - framework::Tensor* dst_tensor = - dst->var_->GetMutable(); - framework::Tensor* src_tensor = - src->var_->GetMutable(); - - // FIXME(minqiyang): loss_grad op will pass a zero grad of label - // ugly fix for it - if (src_tensor->numel() == 0) { - return; + for (auto& out_pair : outs) { + auto& out_ctx = outputs[out_pair.first]; + out_ctx.reserve(out_pair.second.size()); + for (auto& out_var : out_pair.second) { + out_ctx.emplace_back(out_var->MutableVar()); } + } + return framework::RuntimeContext(std::move(inputs), std::move(outputs)); +} + +static std::string DebugString( + const std::string& name, + const std::vector>& vars) { + std::stringstream ss; + ss << name << "{"; - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), - "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(), - src_tensor->numel()); + for (size_t i = 0; i < vars.size(); ++i) { + if (i > 0) ss << ", "; - detail::TensorAddToFunctor func( - src_tensor->numel(), src_tensor->data(), - dst_tensor->mutable_data(place)); - boost::apply_visitor(func, place); + if (vars[i] == nullptr) { + ss << "NULL"; + continue; + } + ss << vars[i]->Name() << "["; + auto& var = vars[i]->Var(); + if (!var.IsInitialized()) { + ss << "NOT_INITED_VAR"; + } else if (var.IsType()) { + auto& tensor = var.Get(); + ss << "LoDTensor<"; + if (tensor.IsInitialized()) { + ss << framework::DataTypeToString(tensor.type()) << ", "; + ss << tensor.place() << ", "; + ss << "(" << tensor.dims() << ")"; + } else { + ss << "NOT_INITED"; + } + ss << ">"; + } else { + ss << "UNRESOLVED_TYPE"; + } + ss << "]"; } -} -void ZeroGrads(const std::shared_ptr vb, - const platform::Place& place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - auto grad_t = vb->var_->GetMutable(); - 
operators::math::set_constant(*dev_ctx, grad_t, 0.0); + ss << "}"; + return ss.str(); } -void AddGradBySort(BackwardSumMap* bck_map, - std::shared_ptr target, - GradientRef* grad_ref) { - PADDLE_ENFORCE(bck_map->find(target.get()) != bck_map->end(), - "Can't find %s in backward grad map", target->Name()); - std::pair>>>& - current = bck_map->at(target.get()); - std::sort(current.second.begin(), current.second.end(), - [](const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; - }); - for (auto& var_pair : current.second) { - VLOG(10) << "add origin_grad: " << target->Name(); - VLOG(10) << "added grad: " << var_pair.second->Name() - << " trace id is: " << var_pair.first; - AddTo(var_pair.second, target, current.first, grad_ref); - var_pair.second.reset(); +std::string LayerDebugString(const std::string& op_type, + const NameVarBaseMap& ins, + const NameVarBaseMap& outs) { + std::stringstream ss; + ss << "Op(" << op_type << "): "; + + ss << "Inputs: "; + + size_t i = 0; + for (auto& pair : ins) { + if (i > 0) ss << ", "; + ss << DebugString(pair.first, pair.second); + ++i; } -} -class Autograd { - public: - Autograd() {} + ss << ", Outputs: "; + i = 0; + for (auto& pair : outs) { + if (i > 0) ss << ", "; + ss << DebugString(pair.first, pair.second); + ++i; + } + return ss.str(); +} - void RunBackward(VarBase* var, const detail::BackwardStrategy& bck_stratedy) { - if (var->IsStopGradient()) { +void VarBase::AddGradOps(const std::weak_ptr& op) { + if (op.lock() == nullptr) { + return; + } + for (const auto& cur_op : grad_ops_) { + if (cur_op.lock() == op.lock()) { return; } - VLOG(2) << "start autograd"; - BackwardSumMap bck_map; - std::deque ready; - ready.push_back(var->PreOp()); - - std::map dep_counts = - ComputeDepCounts(var->PreOp(), bck_stratedy, &grad_ref); - - while (!ready.empty()) { - OpBase* ready_op = ready.front(); - ready.pop_front(); - std::vector grads_outputs = - ready_op->ApplyGrad(&bck_map, &grad_ref, bck_stratedy); - - for (const 
auto& map : grads_outputs) { - for (auto it = map.rbegin(); it != map.rend(); ++it) { - const std::vector>& grad_outs = it->second; - for (size_t i = 0; i < grad_outs.size(); ++i) { - if (!grad_outs[i] || grad_outs[i]->IsStopGradient()) continue; - OpBase* pre_op = grad_outs[i]->PreOp(); - if (!pre_op) continue; - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); - } - } - } - } - - ready_op->InvokeBackwardHooks(); - } } + grad_ops_.emplace_back(op); +} - private: - std::map ComputeDepCounts( - OpBase* op, const detail::BackwardStrategy& bck_stratedy, - GradientRef* grad_ref) { - if (bck_stratedy.sorted_sum_gradient_) { - PADDLE_ENFORCE_NOT_NULL(grad_ref, - "grad_ref should not be null when " - "using sorted grad backward strategy"); - } - std::map ret; - - std::deque queue; - queue.push_back(op); - std::unordered_set visited; - visited.insert(op); - while (!queue.empty()) { - OpBase* candidate = queue.front(); - queue.pop_front(); - for (const auto& map : candidate->grad_output_vars_) { - for (const auto& it : map) { - for (const auto& vb : it.second) { - if (bck_stratedy.sorted_sum_gradient_) { - ++(*grad_ref)[vb.get()].first; - } - // init the state of the grad_ - (*grad_ref)[vb.get()].second = true; - } - } - } - for (auto it : candidate->pre_ops_) { - for (OpBase* pre_op : it.second) { - if (!pre_op) continue; - VLOG(2) << "op dep " << candidate->Type() << " trace id " - << candidate->trace_id_ << " <---- " << it.first << " <---- " - << pre_op->Type() << " trace id " << pre_op->trace_id_; - if (visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); - } - ret[pre_op] += 1; - } - } +void VarBase::ClearGradient() { + if (grad_var_) { + auto* grad_t = grad_var_->var_.GetMutable(); + if (grad_t->IsInitialized()) { + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(grad_t->place()); + 
operators::math::set_constant(*dev_ctx, grad_t, 0.0); } - return ret; } +} - GradientRef grad_ref; -}; - -std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, +std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, const bool blocking) const { - PADDLE_ENFORCE(var_->IsInitialized(), - "Variable must be initialized when getting numpy tensor"); - - // TODO(minqiyang): change this after move unique_name generator to CXX - const framework::LoDTensor& self_tensor = var_->Get(); - std::unique_ptr new_var(new VarBase( - "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false)); - framework::LoDTensor* tensor = - new_var->var_->GetMutable(); - tensor->set_lod(var_->Get().lod()); - - const auto& src_tensor = var_->Get(); - framework::TensorCopy(src_tensor, dst_place, tensor); + PADDLE_ENFORCE_EQ(var_.IsInitialized() && var_.IsType(), + true, + "Variable must be initialized and type of LoDTensor when " + "getting numpy tensor"); + + auto& src_tensor = var_.Get(); + + // TODO(Jiabin): change this after move unique_name generator to CXX + auto new_var = std::make_shared( + false, "Itmp" + std::to_string(copied_counter_++)); + + auto* dst_tensor = new_var->var_.GetMutable(); + dst_tensor->set_lod(src_tensor.lod()); + + framework::TensorCopy(src_tensor, dst_place, dst_tensor); if (blocking) { platform::DeviceContextPool::Instance().Get(dst_place)->Wait(); auto src_place = src_tensor.place(); @@ -285,184 +218,66 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, return new_var; } +// create OpBase from optype +OpBase::OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs, + const platform::Place& place) + : id_(id), place_(place) { + const auto& info = framework::OpInfoMap::Instance().Get(type); + + // Step 1: Run forward + if (info.Checker() != nullptr) { + info.Checker()->Check(&attrs); + } -framework::LoDTensor& VarBase::GradValue() { 
- VLOG(3) << "get var grad " << Name(); - PADDLE_ENFORCE_NOT_NULL(grads_, - "Could not get grad value from no grad variable"); - return *(grads_->var_->GetMutable()); + auto input_name_map = CreateVarNameMap(info, type, ins, true); + auto output_name_map = CreateVarNameMap(info, type, outs, false); + op_ = framework::OpRegistry::CreateOp(type, std::move(input_name_map), + std::move(output_name_map), + std::move(attrs)); + VLOG(3) << "Construct Op: " << type << std::endl; } -std::vector OpBase::ApplyGrad( - BackwardSumMap* bck_map, GradientRef* grad_ref, - const detail::BackwardStrategy& bck_stratedy) { - PADDLE_ENFORCE(!grad_op_descs_.empty(), "%s has no backward implementation", - Type()); - VLOG(3) << "apply op grad: " << Type(); - std::vector tmp_grad_outputs; - const size_t grad_op_count = grad_op_descs_.size(); - - tmp_grad_outputs.resize(grad_op_count); - for (size_t k = 0; k < grad_op_count; ++k) { - framework::OpDesc* grad_op_desc = grad_op_descs_[k]; - platform::RecordEvent record_event(grad_op_desc->Type()); - auto& grad_output_variable_map = grad_output_vars_[k]; - VLOG(3) << "apply grad op " << grad_op_desc->Type(); - - // Allocate tmp grad output variable - for (const auto& it : grad_output_variable_map) { - auto& outputs = tmp_grad_outputs[k][it.first]; - outputs.reserve(it.second.size()); - for (const std::shared_ptr& origin_grad_var_base : - it.second) { - // Allocate a new variable - std::shared_ptr tmp_grad_var_base(new VarBase( - string::Sprintf("%s@IGrad", origin_grad_var_base->Name()), - origin_grad_var_base->DataType(), origin_grad_var_base->Dims(), - place_, true, false)); - outputs.emplace_back(std::move(tmp_grad_var_base)); - } - } - - // No need to do compile time infer shape here. 
- // grad_op_desc_->InferShape(*block_); - // grad_op_desc->InferVarType(block_); - - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc); - - auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type()); - if (info.infer_var_type_) { - RuntimeInferVarTypeContext infer_var_type_ctx( - &grad_input_vars_[k], &tmp_grad_outputs[k], &(opbase->Attrs())); - info.infer_var_type_(&infer_var_type_ctx); - } - - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - - // Run grad op - framework::VariableValueMap grad_invars_map; - framework::VariableValueMap grad_outvars_map; - - for (const auto& it : grad_input_vars_[k]) { - auto& grad_invars = grad_invars_map[it.first]; - grad_invars.reserve(it.second.size()); - for (const std::shared_ptr& grad_inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr", - grad_op_desc->Type(), grad_inp->Name()); - if (!grad_inp->IsInitialize()) { - grad_inp->InitBuffer(); - ZeroGrads(grad_inp, place_); - } - const std::shared_ptr& const_grad_inp = grad_inp; - grad_invars.emplace_back(const_grad_inp->var_.get()); - } - } - - for (const auto& it : tmp_grad_outputs[k]) { - auto& grad_outvars = grad_outvars_map[it.first]; - grad_outvars.reserve(it.second.size()); - for (const std::shared_ptr& grad_out : it.second) { - PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr", - grad_op_desc->Type(), grad_out->Name()); - - grad_outvars.emplace_back(grad_out->var_.get()); - } - } +// create OpBase from opdesc +OpBase::OpBase(size_t id, const framework::OpDesc& op_desc, + const platform::Place& place) + : id_(id), op_(framework::OpRegistry::CreateOp(op_desc)), place_(place) { + VLOG(3) << "Construct Op: " << op_desc.Type() << std::endl; +} - framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map); - framework::Scope scope; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); 
- p.op.RuntimeInferShape(scope, place_, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, - p.kernel_configs)); +void OpBase::Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + auto* op_kernel = dynamic_cast(op_.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + auto& info = op_->Info(); + if (info.infer_var_type_) { + RuntimeInferVarTypeContext infer_var_type_ctx(ins, &outs, op_->Attrs()); + info.infer_var_type_(&infer_var_type_ctx); } - platform::RecordEvent record_event("merge_grads"); - // Add tmp grad outputs to original grad vars - for (size_t k = 0; k < grad_output_vars_.size(); ++k) { - for (const auto& it : grad_output_vars_[k]) { - auto& outputs = tmp_grad_outputs[k][it.first]; - const auto& origin_outputs = it.second; - PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); - - for (size_t i = 0; i < outputs.size(); ++i) { - // track outputs used by sum - if (bck_stratedy.sorted_sum_gradient_) { - if (bck_map->find(origin_outputs[i].get()) != bck_map->end()) { - VLOG(10) << "add sub grad to " << origin_outputs[i]->Name(); - bck_map->at(origin_outputs[i].get()) - .second.emplace_back( - std::pair>( - this->trace_id_, std::move(outputs[i]))); - } else { - VLOG(10) << "insert new map for " << origin_outputs[i]->Name(); - std::pair>>> - tmp(place_, - {std::make_pair(this->trace_id_, std::move(outputs[i]))}); - bck_map->insert(std::make_pair(origin_outputs[i].get(), tmp)); - } - - PADDLE_ENFORCE( - grad_ref->find(origin_outputs[i].get()) != grad_ref->end(), - "Can't find %s in grad_reference count map", - origin_outputs[i]->Name()); - PADDLE_ENFORCE(grad_ref->at(origin_outputs[i].get()).first >= 1, - "Backward error when calculate grad reference"); - if (grad_ref->at(origin_outputs[i].get()).first > 1) { - VLOG(10) << "remove ref for " << origin_outputs[i]->Name(); - grad_ref->at(origin_outputs[i].get()).first--; - } else { - VLOG(10) << "Add grad for: " << origin_outputs[i]->Name(); - 
AddGradBySort(bck_map, origin_outputs[i], grad_ref); - grad_ref->at(origin_outputs[i].get()).first--; - } - } else { - VLOG(10) << "AddTo Called with orig_grad is: " - << origin_outputs[i]->name_ << " Grad to be added is " - << outputs[i]->name_; - AddTo(outputs[i], origin_outputs[i], place_, grad_ref); - outputs[i].reset(); - } - } + // Initialize output var type + for (auto& var_pair : outs) { + for (auto& var : var_pair.second) { + InitializeVariable(var->MutableVar(), var->Type()); } } - return grad_output_vars_; -} + VLOG(3) << "Running Op " << Type(); + VLOG(5) << LayerDebugString(Type(), ins, outs); + auto runtime_ctx = PrepareRuntimeContext(ins, outs); + auto runtime_place = PreparedOp::GetExpectedPlace(place(), ins); -void OpBase::InvokeBackwardHooks() { - VLOG(3) << "call backward hooks, hooks num: " << backward_hooks_.size(); - - // call backward hooks - for (py::object& callable : backward_hooks_) { - callable(this); - } -} + auto prepared_op = + PreparedOp::Prepare(runtime_ctx, *op_kernel, runtime_place); -void OpBase::RegisterBackwardHooks(const py::object& callable) { - VLOG(3) << "Register backward hooks " << trace_id_; + prepared_op.Run(); - // TODO(minqiyang): check the callable format - backward_hooks_.push_back(callable); + VLOG(4) << LayerDebugString(Type(), ins, outs); } -void VarBase::RunBackward(const detail::BackwardStrategy& bck_stratedy) { - if (!pre_op_) return; - platform::RecordEvent record_event("Imperative Backward"); - VLOG(3) << "start backward"; - grads_->InitBuffer(); - auto grads_t = grads_->var_->GetMutable(); - operators::math::set_constant( - *(platform::DeviceContextPool::Instance().Get( - var_->GetMutable()->place())), - grads_t, 1.0); - - Autograd().RunBackward(this, bck_stratedy); +void OpBase::ClearBackwardTrace() { + grad_pending_ops_.clear(); + ins_.clear(); + outs_.clear(); } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 
ba4be301c28415af5d26c97ef598723542892248..37741fef2401eeda7a665f9113e50bdd0ee923e7 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,8 +13,10 @@ // limitations under the License. #pragma once - +#include +#include #include +#include #include // NOLINT #include // NOLINT #include // NOLINT @@ -22,95 +24,19 @@ #include // NOLINT #include // NOLINT #include -#include // NOLINT - -// clang-format off -#include "paddle/fluid/framework/python_headers.h" -// clang-format on - +#include #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type_inference.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/imperative/backward_strategy.h" -#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace imperative { -class VarBase; - -namespace py = ::pybind11; - -class PreparedOp { - public: - PreparedOp(const framework::OperatorBase& op, - const framework::RuntimeContext& ctx, - framework::OperatorWithKernel::OpKernelFunc func, - platform::DeviceContext* dev_ctx, - std::vector* kernel_configs) - : op(op), - ctx(ctx), - func(func), - dev_ctx(dev_ctx), - kernel_configs(kernel_configs) {} - - static PreparedOp Prepare(const framework::RuntimeContext& ctx, - const 
framework::OperatorWithKernel& op, - const platform::Place& place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - - // check if op[type] has kernel registered. - auto& all_op_kernels = op.AllOpKernels(); - auto kernels_iter = all_op_kernels.find(op.Type()); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", - op.Type()); - } - - framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; - - auto expected_kernel_key = - op.GetExpectedKernelType(framework::ExecutionContext( - op, framework::Scope(), *dev_ctx, ctx, nullptr)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; - - auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = framework::LibraryType::kPlain; - expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } -#endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", op.Type(), - KernelTypeToString(expected_kernel_key)); - } - std::vector* kernel_configs = - op.GetKernelConfig(expected_kernel_key); - - return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs); - } - - inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } - - const framework::OperatorBase& op; - const framework::RuntimeContext& ctx; - framework::OperatorWithKernel::OpKernelFunc func; - platform::DeviceContext* dev_ctx; - std::vector* kernel_configs; -}; - class OpBase; class ThreadSafeNameSet { @@ -126,290 +52,117 @@ class ThreadSafeNameSet { mutable std::mutex 
mtx_; }; -/* The wrapper for Variable which holds a Variable and a VarBase of its - * gradient. This object should be managed totally by Python intepreter. - * - * Nearly all interface should be implemented in C++. - */ class VarBase { + DISABLE_COPY_AND_ASSIGN(VarBase); + public: static std::vector AliveVarNames(); - - // Internal interface, create VarBase from exist variable - VarBase(const std::string& name, std::unique_ptr var, - VarBase* grad, bool stop_gradient) - : VarBase(name, var->Get().type(), - var->Get().dims(), - var->Get().place(), nullptr, grad, - stop_gradient, false, true) { - var_ = std::move(var); - } - - // Python interface - VarBase(const std::string& name, const framework::proto::VarType::Type dtype, - const std::vector& shape, const platform::Place& place, - bool stop_gradient, bool persistable) - : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient, - persistable) {} - - // Internal interface, create VarBase from with ddim - VarBase(const std::string& name, const framework::proto::VarType::Type dtype, - const framework::DDim& shape, const platform::Place& place, - bool stop_gradient, bool persistable) - : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient, - persistable, true) {} - - // Grad used constructor - VarBase(const std::string& name, const framework::proto::VarType::Type dtype, - const std::vector& shape, const platform::Place& place, - bool stop_gradient, bool persistable, bool need_initialize) - : VarBase(name, dtype, framework::make_ddim(shape), place, nullptr, - nullptr, stop_gradient, persistable, need_initialize) {} - - private: - // TODO(minqiyang): need support SelectedRows - VarBase(const std::string& name, framework::proto::VarType::Type dtype, - const framework::DDim& shape, const platform::Place& place, - std::unique_ptr var, VarBase* grad, - bool stop_gradient, bool persistable, bool need_initialize) + explicit VarBase(bool has_grad, const std::string& name) : name_(name), - 
type_(framework::proto::VarType::LOD_TENSOR), - place_(place), - var_(std::move(var)), - grads_(grad), - dtype_(dtype), - stop_gradient_(stop_gradient), - persistable_(persistable), - pre_op_(nullptr), - pre_op_out_name_(), - pre_op_out_idx_(-1) { - if (!var_) { - var_.reset(new framework::Variable()); - } - - auto tensor = var_->GetMutable(); - tensor->Resize(shape); - if (need_initialize) { - tensor->mutable_data(place, dtype); - is_initialized_ = true; - VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype - << " place: " << place; - } else { - is_initialized_ = false; - VLOG(8) << "not initialized varbase: " << name_; - } - VLOG(8) << "create varbase: " << name_ << " type: " << dtype - << " place: " << place << "Stop gradient: " << stop_gradient_; - + grad_var_(has_grad ? new VarBase(false, GradVarName()) : nullptr) { if (IsDebugEnabled()) { + VLOG(10) << "Construct VarBase: " << name; name_set_.Insert(name_); } } - public: - virtual ~VarBase() { - pre_op_ = nullptr; - pre_op_out_idx_ = -1; - VLOG(8) << "destruct varbase: " << name_; + explicit VarBase(const std::string& name) : VarBase(true, name) {} + + ~VarBase() { + VLOG(10) << "Destruct VarBase: " << name_; if (IsDebugEnabled()) { name_set_.Remove(name_); } } - inline void SetName(const std::string& name) { name_ = name; } - inline std::string Name() const { return name_; } - inline bool IsInitialize() const { return is_initialized_; } - inline void SetInitialize(bool inited) { is_initialized_ = inited; } - inline std::vector Shape() const { - if (var_->IsInitialized()) { - return framework::vectorize(var_->Get().dims()); - } else { - return {}; - } - } + const framework::Variable& Var() const { return var_; } - inline framework::DDim Dims() const { - return var_->Get().dims(); - } + framework::Variable* MutableVar() { return &var_; } - // data type. e.g.. 
FP32 - inline void SetDataType(framework::proto::VarType::Type type) { - auto tensor = var_->GetMutable(); - tensor->mutable_data(tensor->place(), type); + bool HasGradVar() const { return grad_var_ != nullptr; } + + const std::shared_ptr& GradVarBase() const { return grad_var_; } + + const framework::Variable& GradVar() const { + PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_); + return grad_var_->var_; } - inline framework::proto::VarType::Type DataType() const { return dtype_; } - // tensor type. e.g.. LoDTensor - inline void SetType(framework::proto::VarType::Type type) { type_ = type; } - inline framework::proto::VarType::Type Type() const { return type_; } + framework::Variable* MutableGradVar() { + PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_); + return &(grad_var_->var_); + } - inline void SetStopGradient(bool stop_gradient) { + void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; - if (grads_) { - grads_->stop_gradient_ = stop_gradient; + if (grad_var_) { + grad_var_->stop_gradient_ = stop_gradient; } } - inline bool IsStopGradient() const { return stop_gradient_; } - inline void SetPersistable(bool persistable) { persistable_ = persistable; } - inline bool IsPersistable() const { return persistable_; } - inline void SetPreOp(OpBase* op) { pre_op_ = op; } - inline platform::Place GetPlace() { return place_; } - inline OpBase* PreOp() const { return pre_op_; } - inline int PreOpOutIdx() const { return pre_op_out_idx_; } + bool StopGradient() const { return stop_gradient_; } - void RunBackward(const detail::BackwardStrategy& bck_stratedy); + void SetPersistable(bool persistable) { persistable_ = persistable; } - inline void ResetPreOp(OpBase* op) { - if (op == pre_op_) { - // clear pre_op info when op equals to var's pre_op - pre_op_ = nullptr; - pre_op_out_idx_ = -1; - } - } - - void InitBuffer() { - if (!is_initialized_) { - var_->GetMutable()->mutable_data(place_, dtype_); - 
is_initialized_ = true; - VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype_ - << " place: " << place_; - } else { - VLOG(8) << "var: " << name_ << " has already been initialized "; - } - } + bool Persistable() const { return persistable_; } - void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name, - int pre_op_out_idx, bool pre_op_stop_gradient) { - pre_op_ = pre_op; - pre_op_out_name_ = pre_op_out_name; - pre_op_out_idx_ = pre_op_out_idx; - if (pre_op_stop_gradient) { - stop_gradient_ = pre_op_stop_gradient; - } - } + void AddGradOps(const std::weak_ptr& op); - void ClearGradient() { - VLOG(1) << "clear gradient of " << Name(); - if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { - auto grads_t = grads_->var_->GetMutable(); - operators::math::set_constant( - *(platform::DeviceContextPool::Instance().Get( - grads_->var_->Get().place())), - grads_t, 0.0); + std::vector GradOps() { + std::vector rlt; + // TODO(jiabin): use better data structure to remove nullptr when we find it + for (const auto& wk_ptr : grad_ops_) { + OpBase* tmp_op = wk_ptr.lock().get(); + if (tmp_op) rlt.emplace_back(tmp_op); } + return rlt; } + void ClearGradOps() { grad_ops_.clear(); } - framework::LoDTensor& GradValue(); - - std::unique_ptr NewVarBase(const platform::Place& dst_place, - const bool blocking) const; + const std::string& Name() const { return name_; } - inline std::string GradName() const { - return string::Sprintf("%s@IGrad", Name()); + void SetName(const std::string& name) { + name_ = name; + if (grad_var_) { + grad_var_->SetName(GradVarName()); + } } - std::string name_; - framework::proto::VarType::Type type_; - platform::Place place_; - - std::unique_ptr var_; - std::shared_ptr grads_; + std::string GradVarName() { return framework::GradVarName(name_); } - private: - framework::proto::VarType::Type dtype_; - bool stop_gradient_; - bool persistable_; - bool is_initialized_; - OpBase* pre_op_; - std::string pre_op_out_name_; - int 
pre_op_out_idx_; - - // A private flag to check memory leak - static ThreadSafeNameSet name_set_; -}; + void SetType(framework::proto::VarType::Type type) { type_ = type; } -/* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its - * gradient. This object should be managed totally by Python intepreter. - */ -class PYBIND11_HIDDEN OpBase { - public: - OpBase(const std::string& type) - : type_(type), - trace_id_(-1), - place_(platform::CPUPlace()), - backward_hooks_() {} - - virtual ~OpBase() { - for (const auto& it : outputs_ref) { - auto vb = it.lock(); - if (vb) { - VLOG(3) << "Op reset by" << vb->name_; - vb->ResetPreOp(this); - } - } - // TODO(minqiyang): remove op_desc from block_desc in tracer - // release resource - for (framework::OpDesc* desc : grad_op_descs_) { - delete desc; - } - } + framework::proto::VarType::Type Type() const { return type_; } - std::vector ApplyGrad( - BackwardSumMap* bck_map, GradientRef* grad_ref, - const detail::BackwardStrategy& bck_stratedy); - - inline std::string Type() const { return type_; } - inline std::string GradOpType(size_t index) const { - PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]); - return grad_op_descs_[index]->Type(); - } - - void RegisterBackwardHooks(const py::object& callable); - - void InvokeBackwardHooks(); - - void TrackPreOp( - const std::string& inp_name, - const std::vector>& inputs) { - auto& pre_ops_list = pre_ops_[inp_name]; - pre_ops_list.reserve(inputs.size()); - auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name]; - for (std::shared_ptr inp_var : inputs) { - if (inp_var->PreOp() && !inp_var->IsStopGradient()) { - VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " - << inp_name; - pre_ops_list.emplace_back(inp_var->PreOp()); - pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx()); - } else { - VLOG(3) << "no pre op in slot " << inp_name - << " input var stop_gradient: " << inp_var->IsStopGradient(); - pre_ops_list.emplace_back(nullptr); - // 
pre_ops_out_idx_list.push_back(-1); - } + void SetDataType(framework::proto::VarType::Type data_type) { + data_type_ = data_type; + if (grad_var_) { + grad_var_->SetDataType(data_type_); } } - std::string type_; - int trace_id_; + framework::proto::VarType::Type DataType() const { return data_type_; } - // Note: each fwd op corresponds to a vector of bwd ops. - std::vector grad_op_descs_; + void ClearGradient(); - platform::Place place_; + std::shared_ptr NewVarBase(const platform::Place& dst_place, + const bool blocking) const; - OpBasePtrMap pre_ops_; - std::map> pre_ops_out_idx_; + private: + framework::Variable var_; + std::string name_; + std::shared_ptr grad_var_; + mutable size_t copied_counter_ = 0; - VarBaseWeakPtrList outputs_ref; - // Inputs to a vector of bwd ops. - std::vector grad_input_vars_; - // Outputs to a vector of bwd ops. - std::vector grad_output_vars_; + // grad_op indicates which grad_op will this var be used as input + std::vector> grad_ops_; - std::vector backward_hooks_; + bool stop_gradient_{false}; + bool persistable_{false}; - framework::AttributeMap attrs_; + framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR}; + framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32}; + static ThreadSafeNameSet name_set_; }; class Layer { @@ -418,18 +171,16 @@ class Layer { virtual std::vector> Forward( const std::vector>& inputs) { - std::vector> vars; - return vars; + return {}; } }; // infer var type context for imperative mode -class PYBIND11_HIDDEN RuntimeInferVarTypeContext - : public framework::InferVarTypeContext { +class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { public: - RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs, - imperative::VarBasePtrMap* outputs, - const framework::AttributeMap* attrs_map) + RuntimeInferVarTypeContext(const NameVarBaseMap& inputs, + const NameVarBaseMap* outputs, + const framework::AttributeMap& attrs_map) : 
InferVarTypeContext(nullptr, nullptr), inputs_(inputs), outputs_(outputs), @@ -437,19 +188,19 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext input_names_(), output_names_(), var_set_() { - input_names_.reserve(inputs_->size()); - for (auto& it : *inputs_) { - for (std::shared_ptr var : it.second) { + input_names_.reserve(inputs_.size()); + for (auto& it : inputs_) { + for (auto& var : it.second) { input_names_[it.first].emplace_back(var->Name()); - var_set_[var->Name()] = var; + var_set_[var->Name()] = var.get(); } } output_names_.reserve(outputs_->size()); for (auto& it : *outputs_) { - for (std::shared_ptr var : it.second) { + for (auto& var : it.second) { output_names_[it.first].emplace_back(var->Name()); - var_set_[var->Name()] = var; + var_set_[var->Name()] = var.get(); } } } @@ -457,8 +208,10 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext virtual ~RuntimeInferVarTypeContext() {} framework::Attribute GetAttr(const std::string& name) const override { - PADDLE_ENFORCE_NOT_NULL(attrs_); - return attrs_->at(name); + auto iter = attrs_.find(name); + PADDLE_ENFORCE_EQ(iter != attrs_.end(), true, "Cannot find attribute %s", + name); + return iter->second; } bool HasVar(const std::string& name) const override { @@ -466,8 +219,7 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext } bool HasInput(const std::string& name) const override { - PADDLE_ENFORCE_NOT_NULL(inputs_); - return inputs_->count(name) > 0; + return inputs_.count(name) > 0; } bool HasOutput(const std::string& name) const override { @@ -477,17 +229,26 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext const std::vector& Input( const std::string& name) const override { - return input_names_.at(name); + auto iter = input_names_.find(name); + PADDLE_ENFORCE_EQ(iter != input_names_.end(), true, "Cannot find input %s", + name); + return iter->second; } const std::vector& Output( const std::string& name) const override { - return output_names_.at(name); + auto iter = output_names_.find(name); + 
PADDLE_ENFORCE_EQ(iter != output_names_.end(), true, + "Cannot find output %s", name); + return iter->second; } framework::proto::VarType::Type GetType( const std::string& name) const override { - return var_set_.at(name)->Type(); + auto iter = var_set_.find(name); + PADDLE_ENFORCE_EQ(iter != var_set_.end(), true, + "Cannot find var %s in GetType", name); + return iter->second->Type(); } void SetType(const std::string& name, @@ -501,7 +262,10 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext framework::proto::VarType::Type GetDataType( const std::string& name) const override { - return var_set_.at(name)->DataType(); + auto iter = var_set_.find(name); + PADDLE_ENFORCE_EQ(iter != var_set_.end(), true, + "Cannot find var %s in GetDataType", name); + return iter->second->DataType(); } void SetDataType(const std::string& name, @@ -538,13 +302,97 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext } private: - const imperative::VarBasePtrMap* inputs_; - imperative::VarBasePtrMap* outputs_; - const framework::AttributeMap* attrs_; + const NameVarBaseMap& inputs_; + const NameVarBaseMap* outputs_; + const framework::AttributeMap& attrs_; std::unordered_map> input_names_; std::unordered_map> output_names_; - std::unordered_map> - var_set_; + std::unordered_map var_set_; +}; + +// TODO(zjl): to support py_func layer +class OpBase : public std::enable_shared_from_this { + DISABLE_COPY_AND_ASSIGN(OpBase); + + public: + ~OpBase() { VLOG(3) << "Destruct Op: " << Type() << std::endl; } + + // Developer should not rely on this method to create OpBase. + // OpBase should be created in Tracer and managed by Tracer totally. + template + static std::shared_ptr Create(Args&&... 
args) { + return std::shared_ptr(new OpBase(std::forward(args)...)); + } + + size_t id() const { return id_; } + + const std::string& Type() const { return op_->Type(); } + + void Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs); + + const framework::VariableNameMap& InputNameMap() const { + return op_->Inputs(); + } + + const framework::VariableNameMap& OutputNameMap() const { + return op_->Outputs(); + } + + const framework::AttributeMap& Attrs() const { return op_->Attrs(); } + const framework::OpInfo& Info() const { return op_->Info(); } + + void ClearBackwardTrace(); + + const std::vector& GradPendingOps() const { + return grad_pending_ops_; + } + + void InsertGradPendingOps(OpBase* op) { grad_pending_ops_.emplace_back(op); } + + void SortGradPendingOps() { + std::sort(grad_pending_ops_.begin(), grad_pending_ops_.end(), + [](OpBase* op1, OpBase* op2) { return op1->id() > op2->id(); }); + } + NameVarBaseMap* GetMutableOutsMap() { return &outs_; } + NameVarBaseMap* GetMutableInsMap() { return &ins_; } + const NameVarBaseMap& GetInsMap() { return ins_; } + const NameVarBaseMap& GetOutsMap() { return outs_; } + const platform::Place& place() const { return place_; } + + // TODO(jiabin) prepare for backward hook + void RegisterBackwardHooks(const std::function& func) { + backward_hooks_.emplace_back(func); + } + + void InvokeBackwardHooks() { + for (const auto& func : backward_hooks_) { + func(); + VLOG(5) << "Invoke Backward Hook for: " << Type() << std::endl; + } + } + + private: + OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs, + const platform::Place& place); + + OpBase(size_t id, const framework::OpDesc& op_desc, + const platform::Place& place); + + size_t id_; + + std::unique_ptr op_; + + std::vector> backward_hooks_; + platform::Place place_; + + // Not need to be std::weak_ptr, because op is binded to a certain Tracer, + // and would not be used by a Tracer that 
does not create itself. + std::vector grad_pending_ops_; + + // This part is only used for backward + NameVarBaseMap ins_; + NameVarBaseMap outs_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5b48fa2e20955b57d7bba957d8340506d422b73 --- /dev/null +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/imperative/prepared_operator.h" +#include + +namespace paddle { +namespace imperative { + +const framework::Tensor* GetTensorFromVar(const framework::Variable& var) { + if (var.IsType()) { + return &(var.Get()); + } else if (var.IsType()) { + return &(var.Get().value()); + } else { + return nullptr; + } +} + +platform::Place PreparedOp::GetExpectedPlace(const platform::Place& place, + const NameVarBaseMap& ins) { + bool found = false; + for (auto& name_pair : ins) { + for (auto& var_base : name_pair.second) { + const auto* tensor = GetTensorFromVar(var_base->Var()); + if (tensor && tensor->IsInitialized()) { + auto tmp_place = tensor->place(); + PADDLE_ENFORCE_EQ(!found || tmp_place == place, true, + "Input variable should keep in the same place: %s, " + "but get place: %s of input %s instead", + place, tmp_place, name_pair.first); + } + } + } + return place; +} + +PreparedOp::PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx, + std::vector* kernel_configs) + : op_(op), + ctx_(ctx), + func_(std::move(func)), + dev_ctx_(dev_ctx), + kernel_configs_(kernel_configs) {} + +PreparedOp PreparedOp::Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + const platform::Place& place) { + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + + // check if op[type] has kernel registered. 
+ auto& all_op_kernels = op.AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op.Type()); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + op.Type()); + } + + auto& kernels = kernels_iter->second; + + auto expected_kernel_key = + op.GetExpectedKernelType(framework::ExecutionContext( + op, framework::Scope(), *dev_ctx, ctx, nullptr)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); + // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", op.Type(), + KernelTypeToString(expected_kernel_key)); + } + std::vector* kernel_configs = + op.GetKernelConfig(expected_kernel_key); + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs); +} + +void PreparedOp::Run() { + // TODO(zjl): remove scope in dygraph + framework::Scope scope; + op_.RuntimeInferShape(scope, dev_ctx_->GetPlace(), ctx_); + func_(framework::ExecutionContext(op_, scope, *dev_ctx_, ctx_, + kernel_configs_)); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..4616a85674683695875ff932c11dd3adba384170 --- /dev/null +++ b/paddle/fluid/imperative/prepared_operator.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/type_defs.h" + +namespace paddle { +namespace imperative { + +const framework::Tensor* GetTensorFromVar(const framework::Variable& var); + +class PreparedOp { + public: + static PreparedOp Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + const platform::Place& place); + + inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx_; } + + void Run(); + + static platform::Place GetExpectedPlace(const platform::Place& place, + const NameVarBaseMap& ins); + + private: + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx, + std::vector* kernel_configs); + + private: + const framework::OperatorBase& op_; + const framework::RuntimeContext& ctx_; + framework::OperatorWithKernel::OpKernelFunc func_; + platform::DeviceContext* dev_ctx_; + std::vector* kernel_configs_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..25a038997fa85b0f181e3bd43dbd79a9cd9a9b25 --- /dev/null +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -0,0 +1,5 @@ +cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) 
# Unit tests for the imperative (dygraph) runtime. Each cc_test links only
# the targets it exercises so the tests stay small and fast.
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS gradient_accumulator memcpy)
cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op)
cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split)
cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op)
+ +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; +namespace paddle { +namespace imperative { + +void TensorAdd(const framework::Variable& src, framework::Variable* dst); + +#if defined(PADDLE_WITH_CUDA) +template +int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) { + framework::Variable var1; + framework::Variable var2; + std::vector src_data(10, t1); + std::vector dst_data(10, t2); + std::vector result; + platform::CPUPlace src_place; + for (unsigned int i = 0; i < 10; i++) { + result.emplace_back(src_data[i] + dst_data[i]); + } + std::vector dims = {2, 5}; + auto* src = var1.GetMutable(); + auto* dst = var2.GetMutable(); + src->Resize(framework::make_ddim(dims)); + dst->Resize(framework::make_ddim(dims)); + auto* src_mutable = src->mutable_data(place); + auto* dst_mutable = dst->mutable_data(place); + paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), + sizeof(T) * src_data.size(), 0); + paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), + sizeof(T) * dst_data.size(), 0); + imperative::TensorAdd(var1, &var2); + framework::LoDTensor rlt; + platform::CPUPlace rlt_place; + framework::TensorCopySync(*dst, rlt_place, &rlt); + + for (unsigned int i = 0; i < rlt.numel(); i++) { + if (rlt.data()[i] != result[i]) return 1; + } + return 0; +} +#endif + +template +int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) { + framework::Variable var1; + framework::Variable var2; + std::vector src_data(10, t1); + std::vector dst_data(10, t2); + std::vector result; + platform::CPUPlace src_place; + for (unsigned int i = 0; i < 10; i++) { + result.emplace_back(src_data[i] + dst_data[i]); + } + std::vector dims = {2, 5}; + auto* src = var1.GetMutable(); + auto* dst 
= var2.GetMutable(); + src->Resize(framework::make_ddim(dims)); + dst->Resize(framework::make_ddim(dims)); + auto* src_mutable = src->mutable_data(place); + auto* dst_mutable = dst->mutable_data(place); + paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), + sizeof(T) * src_data.size()); + paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), + sizeof(T) * dst_data.size()); + imperative::TensorAdd(var1, &var2); + framework::LoDTensor rlt; + platform::CPUPlace rlt_place; + framework::TensorCopySync(*dst, rlt_place, &rlt); + + for (unsigned int i = 0; i < rlt.numel(); i++) { + if (rlt.data()[i] != result[i]) return 1; + } + return 0; +} + +TEST(test_add_functor, add_functor) { +#if defined(PADDLE_WITH_CUDA) + platform::CUDAPlace gpu_place(0); +#endif + platform::CPUPlace cpu_place; + + int cpu_res = 1; + cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0); + EXPECT_EQ(cpu_res, 0); + cpu_res = TensorCPUAddTest(cpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(cpu_res, 0); +#if defined(PADDLE_WITH_CUDA) + int gpu_res = 1; + gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0); + EXPECT_EQ(gpu_res, 0); + gpu_res = TensorGPUAddTest(gpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(gpu_res, 0); +#endif +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc new file mode 100644 index 0000000000000000000000000000000000000000..a25ec66903a114437c5563a496cc17be3f942b39 --- /dev/null +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Created by Jiabin on 2019-08-16. +// + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/imperative/layer.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +namespace paddle { +namespace imperative { + +using vb_vector = std::vector>; + +using var_pair = std::pair; + +TEST(test_layer, test_runtime_context) { + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + std::shared_ptr vout( + new imperative::VarBase(false, "vout")); + var_pair in_pair = var_pair("X", vb_vector(1, vin)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {in_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap attrs; + auto* ctx = new imperative::RuntimeInferVarTypeContext(ins, &outs, attrs); + ASSERT_TRUE(ctx->HasVar("vin")); + ASSERT_TRUE(ctx->HasInput("X")); + ASSERT_TRUE(ctx->HasOutput("Out")); + + ASSERT_ANY_THROW(ctx->GetDataTypes("vin")); + std::vector NullType; + ASSERT_ANY_THROW(ctx->SetDataTypes("vin", NullType)); + ASSERT_ANY_THROW(ctx->GetShape("vin")); + ASSERT_ANY_THROW(ctx->GetLoDLevel("vin")); + ASSERT_ANY_THROW(ctx->SetLoDLevel("vin", 2)); +} + +std::string LayerDebugString(const std::string& op_type, + const NameVarBaseMap& ins, + const NameVarBaseMap& outs); + +TEST(test_layer, test_debug_string_test_debug_Test) { + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + std::shared_ptr vin_error( + new imperative::VarBase(false, 
"vin_error")); + std::shared_ptr vout( + new imperative::VarBase(false, "vout")); + std::shared_ptr vout_error( + new imperative::VarBase(false, "vout_error")); + vin_error->MutableVar()->GetMutable(); + vout->MutableVar()->GetMutable(); + vout_error->MutableVar()->GetMutable(); + var_pair in_pair = var_pair("X", vb_vector(1, vin)); + vb_vector vb_in_error = {vin_error, nullptr}; + var_pair vin_error_pair = var_pair("X", vb_in_error); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + var_pair vout_error_pair = var_pair("Out2", vb_vector(1, vout_error)); + imperative::NameVarBaseMap ins = {in_pair}; + imperative::NameVarBaseMap ins_error = {vin_error_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + imperative::NameVarBaseMap outs_error = {vout_error_pair}; + ASSERT_NO_FATAL_FAILURE(LayerDebugString("test_op", ins, outs)); + std::string res = LayerDebugString("test_op", ins, outs_error); + ASSERT_TRUE(res.find("UNRESOLVED_TYPE") != std::string::npos); + std::string res2 = LayerDebugString("test_op", ins_error, outs_error); + VLOG(3) << res2; + ASSERT_TRUE(res2.find("NOT_INITED") != std::string::npos); + ASSERT_TRUE(res2.find("NULL") != std::string::npos); +} + +TEST(test_layer, test_clear_backward_info) { + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + std::shared_ptr vout( + new imperative::VarBase(false, "vout")); + framework::OpDesc desc; + platform::CPUPlace place; + var_pair x_pair = var_pair("X", vb_vector(1, vin)); + var_pair y_pair = var_pair("Y", vb_vector(1, vin)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap concat_att_map; + concat_att_map["axis"] = 1; + std::shared_ptr op( + OpBase::Create(0, "mul", ins, outs, concat_att_map, place)); + std::shared_ptr preceding_op( + OpBase::Create(0, "mul", ins, outs, concat_att_map, place)); + op->InsertGradPendingOps(preceding_op.get()); + 
*(op->GetMutableInsMap()) = ins; + *(op->GetMutableOutsMap()) = outs; + ASSERT_GT(op->GetInsMap().size(), 0); + ASSERT_GT(op->GetOutsMap().size(), 0); + ASSERT_GT(op->GradPendingOps().size(), 0); + + op->ClearBackwardTrace(); + + ASSERT_EQ(op->GetInsMap().size(), 0); + ASSERT_EQ(op->GetOutsMap().size(), 0); + ASSERT_EQ(op->GradPendingOps().size(), 0); +} + +TEST(test_layer, test_varbase_basic) { + platform::CPUPlace place; + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + vin->MutableVar()->GetMutable()->mutable_data( + place); + std::shared_ptr vout(vin->NewVarBase(place, false)); + ASSERT_EQ(vout->Name(), "Itmp0"); + + std::shared_ptr vin_with_grad( + new imperative::VarBase(true, "vin")); + ASSERT_ANY_THROW(vin->MutableGradVar()); + ASSERT_NO_THROW(ASSERT_TRUE(dynamic_cast( + vin_with_grad->MutableGradVar()) != 0)); + ASSERT_TRUE( + dynamic_cast(vin_with_grad->MutableGradVar()) != 0); + vin_with_grad->SetStopGradient(true); + ASSERT_TRUE(vin_with_grad->StopGradient()); + ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetPersistable(true)); + ASSERT_TRUE(vin_with_grad->StopGradient()); + ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetName("new_name")); + ASSERT_EQ(vin_with_grad->Name(), "new_name"); +} +// TODO(jiabin): Add more ut here for layer + +} // namespace imperative +} // namespace paddle + +USE_OP(mul); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c35248262ae43696990ae2b874e58ad81fb3c26 --- /dev/null +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Created by Jiabin on 2019-08-19. +// + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/imperative/prepared_operator.h" +#include "paddle/fluid/imperative/type_defs.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +namespace paddle { +namespace imperative { + +static framework::RuntimeContext PrepareRuntimeContext( + const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + framework::VariableValueMap inputs, outputs; + for (auto& in_pair : ins) { + auto& in_ctx = inputs[in_pair.first]; + in_ctx.reserve(in_pair.second.size()); + for (auto& in_var : in_pair.second) { + in_ctx.emplace_back(in_var->MutableVar()); + } + } + + for (auto& out_pair : outs) { + auto& out_ctx = outputs[out_pair.first]; + out_ctx.reserve(out_pair.second.size()); + for (auto& out_var : out_pair.second) { + out_ctx.emplace_back(out_var->MutableVar()); + } + } + return framework::RuntimeContext(std::move(inputs), std::move(outputs)); +} + +static framework::VariableNameMap CreateVarNameMap( + const framework::OpInfo& op_info, const std::string& op_type, + const NameVarBaseMap& varbase_map, bool is_input) { + if (op_info.proto_ == nullptr) { + return {}; + } + + framework::VariableNameMap result; + + for (auto& var : + is_input ? 
op_info.Proto().inputs() : op_info.Proto().outputs()) { + auto it = varbase_map.find(var.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE_EQ( + var.dispensable(), true, + "Var: %s not dispensable and there are no such var in inputs", + var.name()); + result[var.name()] = {}; + } else { + auto& var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (auto& var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[var.name()] = std::move(args); + } + } + return result; +} + +using vb_vector = std::vector>; + +using var_pair = std::pair; + +TEST(test_prepare_op, test_prepare_op) { + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + std::shared_ptr vout( + new imperative::VarBase(false, "vout")); + framework::OpDesc desc; + platform::CPUPlace place; + vin->MutableVar()->GetMutable()->mutable_data( + place); + var_pair x_pair = var_pair("X", vb_vector(1, vin)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap split_attr_map; + const auto& info = framework::OpInfoMap::Instance().Get("split"); + framework::VariableNameMap var_in_map = + CreateVarNameMap(info, "split", ins, true); + framework::VariableNameMap var_out_map = + CreateVarNameMap(info, "split", outs, false); + framework::OperatorWithKernel op("split", var_in_map, var_out_map, + split_attr_map); + framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs); + ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp = + PreparedOp::Prepare(ctx, op, place)); +} + +const framework::Tensor* GetTensorFromVar(const framework::Variable& var); + +TEST(test_prepare_op, test_get_tensor_from_var) { + std::shared_ptr vout_error( + new imperative::VarBase(false, "vout_error")); + vout_error->MutableVar()->GetMutable(); + auto* ts = GetTensorFromVar(*vout_error->MutableVar()); + ASSERT_TRUE(ts != nullptr); +} + +} // namespace 
imperative +} // namespace paddle + +USE_OP(split); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a544e5f502b1e635a5185a9bb9c86b181b8535d --- /dev/null +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Created by Jiabin on 2019-08-16. 
+// + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/imperative/tracer.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +namespace paddle { +namespace imperative { + +using vb_vector = std::vector>; + +using var_pair = std::pair; + +TEST(test_tracer, test_trace_op) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in( + new imperative::VarBase(true, "x_in")); + std::shared_ptr y_in( + new imperative::VarBase(true, "y_in")); + std::shared_ptr vout( + new imperative::VarBase(true, "vout")); + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + var_pair x_pair = var_pair("X", vb_vector(1, x_in)); + var_pair y_pair = var_pair("Y", vb_vector(1, y_in)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + const auto& out_tensor = vout->Var().Get(); + for (size_t i = 0; i < vout->Var().Get().numel(); i++) { + ASSERT_EQ(out_tensor.data()[i], 20.0); + } +} + +TEST(test_tracer, test_track_backward_output) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in( + new 
imperative::VarBase(true, "x_in")); + std::shared_ptr y_in( + new imperative::VarBase(false, "y_in")); + std::shared_ptr vout( + new imperative::VarBase(true, "vout")); + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + var_pair x_pair = var_pair("X", vb_vector(1, x_in)); + var_pair y_pair = var_pair("Y", vb_vector(1, y_in)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true)); +} + +TEST(test_tracer, test_track_backward_input) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in( + new imperative::VarBase(true, "x_in")); + std::shared_ptr y_in( + new imperative::VarBase(true, "y_in")); + std::shared_ptr vout( + new imperative::VarBase(false, "vout")); + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + 
y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + var_pair x_pair = var_pair("X", vb_vector(1, x_in)); + var_pair y_pair = var_pair("Y", vb_vector(1, y_in)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true)); +} +} // namespace imperative +} // namespace paddle + +USE_OP(mul); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 682bea7d09bc8e01a281886d82e8d95ab363d864..e3cf2b32d797f812c237ab183fc2818382ecb1d0 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,282 +11,207 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- #include "paddle/fluid/imperative/tracer.h" - -#include -#include -#include #include #include - -#include "paddle/fluid/framework/var_type_inference.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace imperative { -void CreateGradOp(const framework::OpDesc& op_desc, - const std::unordered_set& no_grad_set, - const std::vector& grad_sub_block, - std::vector* grad_op_descs, - std::unordered_map* grad_to_var) { - PADDLE_ENFORCE(grad_op_descs->empty()); - const framework::OpInfo& op_info = - framework::OpInfoMap::Instance().Get(op_desc.Type()); - if (!op_info.grad_op_maker_) return; - - std::vector> descs = - op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - for (auto& desc : descs) { - grad_op_descs->emplace_back(desc.release()); +static std::vector> CreateGradOpDescs( + const framework::OpInfo& op_info, const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + std::unordered_map* grad_to_var) { + if (op_info.grad_op_maker_) { + return op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, + grad_sub_block); + } else { + return {}; } } -void CreateNoBuffuerGrad(std::shared_ptr var, - platform::DeviceContext* dev_ctx) { - PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base"); - PADDLE_ENFORCE_NOT_NULL(dev_ctx, - "Could not get valid device from forward op"); - - if (var->grads_ == nullptr) { - auto& var_t = var->var_->Get(); - var->grads_ = std::shared_ptr( - new VarBase(var->GradName(), framework::proto::VarType::FP32, - framework::vectorize(var_t.dims()), dev_ctx->GetPlace(), - var->IsStopGradient(), false, false)); +void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs, + const platform::Place& place, bool trace_backward) { 
+ platform::RecordEvent event(type); + VLOG(1) << "Trace Op: " << type; + size_t op_id = GenerateUniqueId(); + auto op = OpBase::Create(op_id, type, ins, outs, std::move(attrs), place); + op->Run(ins, outs); + + if (ComputeRequiredGrad(ins, outs, trace_backward)) { + TraceBackward(op, framework::OpDesc(op->Type(), op->InputNameMap(), + op->OutputNameMap(), op->Attrs()), + ins, outs); } } -platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { - platform::Place result = place; - for (const auto& it : inputs) { - for (const std::shared_ptr& var : it.second) { - platform::Place tmp_place = - var->var_->Get().place(); - if (!platform::is_same_place(tmp_place, result)) { - PADDLE_THROW( - "Input variable should keep in the same place: %s, but get place: " - "%s of input %s instead", - result, tmp_place, it.first); - } - } - } - - return result; +bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins, + const NameVarBaseMap outs, + bool trace_backward) { + // TODO(jiabin): Implement auto prune here + return trace_backward; } -framework::VariableNameMap CreateInputVarNameMap( - const OpBase* op, const VarBasePtrMap& varbase_map) { - framework::VariableNameMap result; +void Tracer::TraceBackward(const std::shared_ptr& fwd_op, + const framework::OpDesc& fwd_op_desc, + const NameVarBaseMap& ins, + const NameVarBaseMap& outs) { + // grad_to_var is a map of framework::GradVarName(in_var_name/out_var_name) -> + // in_var_name/out_var_name + std::unordered_map grad_to_var; - auto& info_map = framework::OpInfoMap::Instance(); - auto* op_info = info_map.GetNullable(op->Type()); - if (op_info == nullptr || op_info->proto_ == nullptr) { - return result; - } + // Get grad_op_desc using fwd_op_desc + std::vector> grad_op_descs_ = + CreateGradOpDescs(fwd_op->Info(), fwd_op_desc, {}, {}, &grad_to_var); - for (auto& in : op_info->Proto().inputs()) { - auto it = varbase_map.find(in.name()); - if (it == varbase_map.end()) { - PADDLE_ENFORCE(in.dispensable()); - 
result[in.name()] = {}; - } else { - auto var_vector = it->second; - std::vector args; - args.reserve(var_vector.size()); - for (std::shared_ptr var_base : var_vector) { - args.emplace_back(var_base->Name()); - } - result[in.name()] = args; - } - } - return result; -} + // Create grad_ops using grad_op_descs -framework::VariableNameMap CreateOutputVarNameMap( - const OpBase* op, const VarBasePtrMap& varbase_map) { - framework::VariableNameMap result; + size_t grad_op_num = grad_op_descs_.size(); - auto& info_map = framework::OpInfoMap::Instance(); - auto* op_info = info_map.GetNullable(op->Type()); - if (op_info == nullptr || op_info->proto_ == nullptr) { - return result; - } + VLOG(3) << "Create " << grad_op_num << " grad op desc(s) to op " + << fwd_op->Type(); - for (auto& out : op_info->Proto().outputs()) { - auto it = varbase_map.find(out.name()); - if (it == varbase_map.end()) { - PADDLE_ENFORCE(out.dispensable()); - result[out.name()] = {}; - } else { - auto var_vector = it->second; - std::vector args; - args.reserve(var_vector.size()); - for (const std::shared_ptr& var_base : var_vector) { - args.emplace_back(var_base->Name()); - } - result[out.name()] = args; - } + if (grad_op_num == 0) { + return; } - return result; -} - -Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} - -void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, - VarBasePtrMap* outputs, framework::AttributeMap attrs_map, - const platform::Place expected_place, - const bool stop_gradient) { - platform::RecordEvent record_event(op->type_); - framework::VariableValueMap invars_map; - framework::VariableValueMap outvars_map; - - // Construct input_vars_map and output_vars_map - std::map> current_vars_map; - for (auto it : inputs) { - auto& invars = invars_map[it.first]; - invars.reserve(it.second.size()); - for (std::shared_ptr inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(), - inp->Name()); - - 
invars.emplace_back(inp->var_.get()); - if (!stop_gradient) { - current_vars_map[inp->Name()] = inp; - } - VLOG(3) << "input var name: " << inp->Name() - << " inited: " << inp->var_->IsInitialized() - << " stop_grad: " << inp->IsStopGradient(); + // Build a map to record var_name -> std::shared_ptr*, + // so that we can find suitable var in grad op descs + std::unordered_map*> name_to_var; + for (auto& pair : ins) { + for (auto& var : pair.second) { + auto& var_ptr = name_to_var[var->Name()]; + PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(), true, + "There are different variables with same name %s", + var->Name()); + var_ptr = &var; } - op->TrackPreOp(it.first, it.second); } - for (const auto& it : *outputs) { - auto& outvars = outvars_map[it.first]; - const std::vector>& outputs_tmp = - it.second; - outvars.reserve(outputs_tmp.size()); - for (size_t i = 0U; i < outputs_tmp.size(); ++i) { - // Add weak_ptr to track outputs - op->outputs_ref.emplace_back(outputs_tmp[i]); - std::shared_ptr out = outputs_tmp[i]; - outvars.emplace_back(out->var_.get()); - out->TrackPreOp(op, it.first, i, stop_gradient); - if (!stop_gradient) { - current_vars_map[out->Name()] = out; - } - - VLOG(3) << "output var name: " << out->Name() - << " inited: " << out->var_->IsInitialized() - << " stop_grad: " << out->IsStopGradient(); + for (auto& pair : outs) { + for (auto& var : pair.second) { + auto& var_ptr = name_to_var[var->Name()]; + PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(), true, + "There are different variables with same name %s", + var->Name()); + var_ptr = &var; } } - // Check attrs and create op - framework::VariableNameMap invars_name_map = - CreateInputVarNameMap(op, inputs); - framework::VariableNameMap outvars_name_map = - CreateOutputVarNameMap(op, *outputs); - - auto& info = framework::OpInfoMap::Instance().Get(op->Type()); - if (info.Checker() != nullptr) { - info.Checker()->Check(&attrs_map); - } - - std::unique_ptr op_base = - 
framework::OpRegistry::CreateOp(op->Type(), invars_name_map, - outvars_name_map, attrs_map); - - if (info.infer_var_type_) { - RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map); - info.infer_var_type_(&infer_var_type_ctx); - } - - // TODO(minqiyang): Support infer var type in imperative mode - // Run forward op - VLOG(3) << "tracer running " << op->Type(); - framework::RuntimeContext ctx(invars_map, outvars_map); - - // TODO(panyx0718): Cache p. - framework::OperatorWithKernel* op_kernel = - dynamic_cast(op_base.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - - framework::Scope scope; - op->place_ = GetExpectedPlace(expected_place, inputs); - - PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); - prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); - prepared_op.func( - framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, - prepared_op.ctx, prepared_op.kernel_configs)); - - if (!stop_gradient) { - VLOG(5) << "start construct backward op"; - - // construct grad op descs - op->attrs_ = attrs_map; - std::unique_ptr fwd_op_desc(new framework::OpDesc( - op->Type(), invars_name_map, outvars_name_map, attrs_map)); - std::unique_ptr> grad_to_var( - new std::unordered_map()); - // NOTE(minqiyang): We don't support control flow op in imperative now - // Add grad_block_ when we want to support it - CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get()); - - VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type(); - - const size_t grad_op_count = op->grad_op_descs_.size(); + // Build backward ins and outs + + for (size_t i = 0; i < grad_op_num; i++) { + // Step1: build grad op and add them to engine + + // Use trace id to decide the order of gradient sum in sorted sum mode + size_t trace_id = fwd_op->id(); + std::shared_ptr grad_op = + OpBase::Create(trace_id, (*(grad_op_descs_[i].get())), fwd_op->place()); + + // this OpBase* is just used 
to manage op's life time + engine_->InsertOp(grad_op.get(), grad_op); + + std::unordered_set visited_preceding_ops; + // Step2 : prepare grad_in vars and bind them with grad_op, + // set inputs' grad_op as current grad_op + for (const auto& grad_ins : grad_op_descs_[i]->Inputs()) { + if (grad_ins.second.empty()) continue; + auto& bwd_in = (*grad_op->GetMutableInsMap())[grad_ins.first]; + bwd_in.reserve(grad_ins.second.size()); + + for (auto& grad_in_var_name : grad_ins.second) { + auto iter = grad_to_var.find(grad_in_var_name); + + if (iter != grad_to_var.end()) { + // If it is a grad var, find its coresponding forward var + auto& fwd_var_name = iter->second; + auto fwd_var_iter = name_to_var.find(fwd_var_name); + PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true, + "Cannot find forward variable named %s", + fwd_var_name); + PADDLE_ENFORCE_NOT_NULL( + (*(fwd_var_iter->second))->GradVarBase(), + "Grad of %s should " + "not be NULL when we Track_Backward Input of %s", + (*(fwd_var_iter->second))->Name(), grad_op->Type()); + (*(fwd_var_iter->second))->GradVarBase()->AddGradOps(grad_op); + VLOG(3) << "Add Grad Op " << grad_op->Type() << " for :" + << (*(fwd_var_iter->second))->GradVarBase()->Name(); + bwd_in.emplace_back((*(fwd_var_iter->second))->GradVarBase()); + } else { + // If it is a forward var, just add it + auto fwd_var_iter = name_to_var.find(grad_in_var_name); + PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true, + "Cannot find forward variable named %s", + grad_in_var_name); + bwd_in.emplace_back(*(fwd_var_iter->second)); + } - op->grad_input_vars_.resize(grad_op_count); - op->grad_output_vars_.resize(grad_op_count); + VLOG(3) << "Set backward input " << grad_ins.first << " of " + << grad_op->Type() << " to be " + << (bwd_in.back() ? 
bwd_in.back()->Name() : "nullptr"); + } + } - for (size_t i = 0; i < grad_op_count; ++i) { - framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; - for (auto it : grad_op_desc->Inputs()) { - auto& grad_in_vars = op->grad_input_vars_[i][it.first]; - grad_in_vars.reserve(it.second.size()); - for (const std::string& grad_invar : it.second) { - auto var_it = grad_to_var->find(grad_invar); - if (var_it == grad_to_var->end()) { - auto fwd_var_it = current_vars_map.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); - // Forward inputs or outputs. - grad_in_vars.emplace_back(fwd_var_it->second); - } else { - std::shared_ptr var = - current_vars_map[var_it->second]; - CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext()); - // Douts. - var->grads_->SetPreOp(var->PreOp()); - grad_in_vars.emplace_back(var->grads_); + // Step3: prepare grad_out vars and using their grad_ops to set current + // grad_op's preceding op + for (auto& grad_outs : grad_op_descs_[i]->Outputs()) { + if (grad_outs.second.empty()) continue; + auto& bwd_out = (*grad_op->GetMutableOutsMap())[grad_outs.first]; + bwd_out.reserve(grad_outs.second.size()); + + for (auto& grad_out_var_name : grad_outs.second) { + auto iter = grad_to_var.find(grad_out_var_name); + PADDLE_ENFORCE_EQ(iter != grad_to_var.end(), true, + "Cannot find output of input grad %s in op %s", + grad_out_var_name, fwd_op->Type()); + auto fwd_var_iter = name_to_var.find(iter->second); + PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true, + "Cannot find forward variable named %s", + iter->second); + PADDLE_ENFORCE_NOT_NULL( + (*(fwd_var_iter->second))->GradVarBase(), + "Grad of %s should " + "not be NULL when we Track_Backward Output of %s", + (*(fwd_var_iter->second))->Name(), grad_op->Type()); + bwd_out.emplace_back((*(fwd_var_iter->second))->GradVarBase()); + VLOG(3) << "Set backward output " << grad_outs.first << " of " + << grad_op->Type() << " to be " + << (bwd_out.back() ? 
bwd_out.back()->Name() : "nullptr"); + + auto preceding_ops = + (*(fwd_var_iter->second))->GradVarBase()->GradOps(); + + if (VLOG_IS_ON(3) && !preceding_ops.empty()) { + VLOG(3) << "Add preceding Op of :" + << (*(fwd_var_iter->second))->GradVarBase()->Name() + << " It's preceding Op are: "; + for (const auto& op : preceding_ops) { + VLOG(3) << op->Type(); } } - } - - for (auto it : grad_op_desc->Outputs()) { - auto& grad_out_vars = op->grad_output_vars_[i][it.first]; - for (const std::string& grad_outvar : it.second) { - auto var_it = grad_to_var->find(grad_outvar); - PADDLE_ENFORCE(var_it != grad_to_var->end(), - "Could not found the grad op output var, should this " - "operator %s's stop gradient be True", - op->Type()); - std::shared_ptr var = - current_vars_map[var_it->second]; - CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext()); - var->grads_->SetPreOp(var->PreOp()); - grad_out_vars.push_back(var->grads_); - VLOG(3) << "grads output var name: " << var->name_; + if (!preceding_ops.empty()) { + for (const auto& op : preceding_ops) { + PADDLE_ENFORCE_NOT_NULL(op, "No nullptr should be preceding_op"); + if (visited_preceding_ops.count(op) == 0) { + visited_preceding_ops.insert(op); + grad_op->InsertGradPendingOps(op); + } + } + } else { + VLOG(5) << "Hit leaf VarBase"; + VLOG(5) << "Hit leaf VarBase" + << (*(fwd_var_iter->second))->GradVarBase()->Name(); } } } + // To ensure numeric stability as static graph + grad_op->SortGradPendingOps(); } } + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 02d902274103e1d42db7b849da633bf50a6167ad..f0a75d44731b20df36de3f93c1dce5a98ce6ae57 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,46 +14,48 @@ #pragma once -#include -#include +#include +#include // NOLINT +#include #include #include -#include #include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_registry.h" +#include "ThreadPool.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace imperative { -void CreateGradOp(const framework::OpDesc& op_desc, - const std::unordered_set& no_grad_set, - const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, - std::unordered_map* grad_to_var); - -platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); - class Tracer { + DISABLE_COPY_AND_ASSIGN(Tracer); + public: - explicit Tracer(framework::BlockDesc* root_block); + Tracer() : engine_(new BasicEngine()) {} - virtual ~Tracer() {} + ~Tracer() = default; - void Trace(OpBase* op, const VarBasePtrMap& inputs, - VarBasePtrMap* outputs, // NOLINT - framework::AttributeMap attrs_map, - const platform::Place expected_place, - const bool stop_gradient = false); + void TraceOp(const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs, + const platform::Place& place, bool trace_bacward); + + bool ComputeRequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap outs, + bool trace_backward); + + void TraceBackward(const std::shared_ptr& fwd_op, + const framework::OpDesc& fwd_op_desc, + const NameVarBaseMap& ins, const NameVarBaseMap& outs); + Engine* GetDefaultEngine() const { return engine_.get(); } private: - platform::Place GetPlace(const VarBasePtrMap& inputs); + static size_t GenerateUniqueId() { + static std::atomic id{0}; + return id.fetch_add(1); + } - framework::BlockDesc* root_block_; + 
private: + std::unique_ptr engine_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index fab8c2e6b9102f6ccaea09a5c08df9574f6b6a56..615b1b082d2e653dcf5e6cc22637ebb4a8fb495e 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -17,8 +17,6 @@ limitations under the License. */ #include #include #include -#include -#include #include namespace paddle { @@ -26,18 +24,10 @@ namespace imperative { class VarBase; class OpBase; +class Tracer; -typedef std::map>> - VarBasePtrMap; -typedef std::vector> VarBaseWeakPtrList; -typedef std::map> OpBasePtrMap; -typedef std::unordered_map< - const VarBase*, - std::pair>>>> - BackwardSumMap; // var_grad -> {place, {id -> var_grad@rename}} -typedef std::unordered_map> GradientRef; -// var_grad -> {ref_times, is_first_to_be_accumulate} +using NameVarBaseMap = + std::map>>; } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b721ebe81719bfb833af56038065f91ce5fb795f..cb3493b62a37b79300a511cdd93f2eb377bb20f8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune - feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor imperative_profiler nccl_context) + feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool + analysis_predictor imperative_profiler nccl_context imperative_flag) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aaae26cd0c51c19687ba56f6267589af6b3f87e1..999251ff57b9dc578c1fbd29fc24dfe85ca7c4a3 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -20,11 +20,13 @@ limitations 
under the License. */ #include #include #include +#include #include #include - -#include "paddle/fluid/framework/block_desc.h" +#include +#include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" @@ -44,16 +46,27 @@ class Layer : public imperative::Layer { const std::vector> &inputs) override { PYBIND11_OVERLOAD(std::vector>, Layer, - Forward, - inputs); // NOLINT + Forward, inputs); // NOLINT } }; -class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { +// warper for pyobject to avoid imperative module depend on python +// TODO(jiabin) Add OpBase's pybind interface back to enable backward hook +class PYBIND11_HIDDEN PyCallableObject { public: - using imperative::OpBase::OpBase; // Inherit constructors + PyCallableObject(std::shared_ptr py_obj_ptr) + : py_obj_ptr_(std::move(py_obj_ptr)) {} + ~PyCallableObject() { + py::call_guard(); + py_obj_ptr_.reset(); + } + void operator()() { + py::call_guard(); + py_obj_ptr_->operator()(this); + } - PyOpBase(const std::string &name) : OpBase(name) {} + private: + std::shared_ptr py_obj_ptr_; }; // Function like obj.attr_name in Python. 
@@ -125,33 +138,43 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { } } else { PADDLE_THROW( - "unsupported type %s, must be Variable, List[Variable] or " + "unsupported type %s, must be Variable, list[Variable] or " "tuple[Variable]", py::str(handle)); } - PADDLE_ENFORCE(PyErr_Occurred() == nullptr, - py::str(py::handle(PyErr_Occurred()))); - return result; } -using PyVarBaseMap = std::unordered_map; +using PyNameVarBaseMap = std::unordered_map; -static imperative::VarBasePtrMap ConvertToVarBasePtrMap( - const PyVarBaseMap &map) { - imperative::VarBasePtrMap result; +static imperative::NameVarBaseMap ConvertToNameVarBaseMap( + const PyNameVarBaseMap &map) { + imperative::NameVarBaseMap result; for (auto &pair : map) { auto var_vec = GetVarBaseListFromPyHandle(pair.second); if (!var_vec.empty()) { result.emplace(pair.first, std::move(var_vec)); } } + + PADDLE_ENFORCE_EQ(PyErr_Occurred() == nullptr, true, + py::str(py::handle(PyErr_Occurred()))); return result; } +static std::string GetTypeName(const imperative::VarBase &var) { + if (var.Type() == framework::proto::VarType::RAW) { + return "RAW"; + } else if (!var.Var().IsInitialized()) { + return "nullptr"; + } else { + return framework::ToTypeName(var.Var().Type()); + } +} + // Bind Methods -void BindImperative(pybind11::module *m_ptr) { +void BindImperative(py::module *m_ptr) { auto &m = *m_ptr; py::class_ backward_strategy( @@ -200,69 +223,88 @@ void BindImperative(pybind11::module *m_ptr) { m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); }); py::class_>( - m, "VarBase", R"DOC()DOC") + m, "VarBase", + R"DOC()DOC") .def_static("_alive_vars", &imperative::VarBase::AliveVarNames) - .def( - py::init, const paddle::platform::CPUPlace, - bool, bool>()) - .def( - py::init, - const paddle::platform::CUDAPlace, bool, bool>()) + .def("__init__", + [](imperative::VarBase &self, const std::string &name, + framework::proto::VarType::Type type, + framework::proto::VarType::Type dtype, + const 
std::vector &dims, bool stop_gradient, + bool persistable) { + new (&self) imperative::VarBase(name); + self.SetPersistable(persistable); + self.SetType(type); + self.SetDataType(dtype); + self.SetStopGradient(stop_gradient); + if (type == framework::proto::VarType::LOD_TENSOR) { + auto *tensor = + self.MutableVar()->GetMutable(); + tensor->Resize(framework::make_ddim(dims)); + } + }) .def("_run_backward", [](imperative::VarBase &self, - const imperative::detail::BackwardStrategy &bckst) { - self.RunBackward(bckst); - }) - .def("_grad_name", &imperative::VarBase::GradName) - .def("_grad_value", &imperative::VarBase::GradValue) + const imperative::detail::BackwardStrategy &bckst, + const imperative::Tracer &tracer) { + // TODO(jiabin): when we impl more backward execution we can select + // them + + imperative::Engine *engine = tracer.GetDefaultEngine(); + VLOG(3) << "Start backward"; + engine->Init(&self, bckst); + engine->Execute(); + VLOG(3) << "Finish backward"; + }, + py::call_guard()) + .def("_grad_name", &imperative::VarBase::GradVarName) + .def("_grad_value", + [](imperative::VarBase &self) { + return self.MutableGradVar()->Get(); + }, + py::return_value_policy::reference) .def("_clear_gradient", &imperative::VarBase::ClearGradient) .def("_grad_ivar", - [](const imperative::VarBase &self) { return self.grads_; }, - py::return_value_policy::reference) + [](const imperative::VarBase &self) { + auto &grad_var = self.GradVarBase(); + if (grad_var && grad_var->Var().IsInitialized()) { + return grad_var; + } else { + return std::shared_ptr(nullptr); + } + }, + py::return_value_policy::copy) .def("_copy_to", [](const imperative::VarBase &self, const platform::CPUPlace &place, - bool blocking) { - return self.NewVarBase(place, blocking).release(); - }, - py::return_value_policy::take_ownership) + bool blocking) { return self.NewVarBase(place, blocking); }, + py::return_value_policy::copy) .def("_copy_to", [](const imperative::VarBase &self, const platform::CUDAPlace 
&place, - bool blocking) { - return self.NewVarBase(place, blocking).release(); - }, - py::return_value_policy::take_ownership) - .def("value", - [](const imperative::VarBase &self) { return self.var_.get(); }, + bool blocking) { return self.NewVarBase(place, blocking); }, + py::return_value_policy::copy) + .def("value", [](imperative::VarBase &self) { return self.MutableVar(); }, py::return_value_policy::reference) .def_property("name", &imperative::VarBase::Name, &imperative::VarBase::SetName) - .def_property_readonly("shape", &imperative::VarBase::Shape) + .def_property_readonly( + "shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return framework::vectorize2int( + self.Var().Get().dims()); + } else { + VLOG(2) << "It is meaningless to get shape of variable type " + << GetTypeName(self); + return std::vector(); + } + }) + .def_property_readonly("type", &imperative::VarBase::Type) .def_property_readonly("dtype", &imperative::VarBase::DataType) - .def_property("persistable", &imperative::VarBase::IsPersistable, + .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property("stop_gradient", &imperative::VarBase::IsStopGradient, + .def_property("stop_gradient", &imperative::VarBase::StopGradient, &imperative::VarBase::SetStopGradient); - py::class_(m, "OpBase", R"DOC()DOC") - .def(py::init()) - .def("register_backward_hooks", - [](imperative::OpBase &self, const py::object &callable) { - self.RegisterBackwardHooks(callable); - }) - .def_property("_trace_id", - [](const imperative::OpBase &self) { - py::gil_scoped_release release; - return self.trace_id_; - }, - [](imperative::OpBase &self, int trace_id) { - py::gil_scoped_release release; - self.trace_id_ = trace_id; - }, - py::return_value_policy::reference) - .def_property_readonly("type", &imperative::OpBase::Type); - py::class_ layer(m, "Layer"); layer.def(py::init<>()) .def("forward", @@ -271,42 +313,35 @@ void 
BindImperative(pybind11::module *m_ptr) { return self.Forward(inputs); }); - // NOTE(zjl): Tracer use PyVarBaseMap as its parameter but not VarBasePtrMap. - // We call Python C-API to convert PyVarBaseMap to VarBasePtrMap, instead - // making conversion in Python code. This speed up Tracer.trace() about 6% - // in ptb model and make time cost in Python to be nearly zero. py::class_(m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { - new (&self) imperative::Tracer(root_block); - }) + [](imperative::Tracer &self) { new (&self) imperative::Tracer(); }) .def("trace", - [](imperative::Tracer &self, imperative::OpBase *op, - const PyVarBaseMap &inputs, const PyVarBaseMap &outputs, - framework::AttributeMap attrs_map, - const platform::CPUPlace expected_place, - const bool stop_gradient = false) { - auto ins = ConvertToVarBasePtrMap(inputs); - auto outs = ConvertToVarBasePtrMap(outputs); + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::CUDAPlace &place, + bool trace_backward) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.Trace(op, std::move(ins), &outs, attrs_map, expected_place, - stop_gradient); + self.TraceOp(type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward); } }) - .def("trace", [](imperative::Tracer &self, imperative::OpBase *op, - const PyVarBaseMap &inputs, const PyVarBaseMap &outputs, - framework::AttributeMap attrs_map, - const platform::CUDAPlace expected_place, - const bool stop_gradient = false) { - auto ins = ConvertToVarBasePtrMap(inputs); - auto outs = ConvertToVarBasePtrMap(outputs); - { - py::gil_scoped_release release; - self.Trace(op, std::move(ins), &outs, attrs_map, expected_place, - stop_gradient); - } - }); + .def("trace", + [](imperative::Tracer &self, 
const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::CPUPlace &place, + bool trace_backward) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp(type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward); + } + }); // define parallel context py::class_ parallel_strategy( diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index cfe185bbfbbf327c8a2e75ed5be69dc5c680c05d..0e3e98512d60fa111c94f70bf43524c36463cc05 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -14,10 +14,6 @@ limitations under the License. */ #pragma once #include -#include -#include -#include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/imperative/nccl_context.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 89aca5178fe98d87355cf86bc23f8a70e65f5da6..fa648eb8d80454aeaaabe75be8b27c72e04643e8 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -18,6 +18,7 @@ from paddle.fluid import core from paddle.fluid import framework from .tracer import Tracer import logging +import objgraph __all__ = [ 'no_grad', @@ -123,7 +124,7 @@ def guard(place=None): """ train = framework.Program() startup = framework.Program() - tracer = Tracer(train.current_block().desc) + tracer = Tracer() if place is None: if core.is_compiled_with_cuda(): @@ -138,19 +139,22 @@ def guard(place=None): yield -def _print_debug_msg(): +def _print_debug_msg(limit=5, is_test=False): if not core._is_dygraph_debug_enabled(): logging.warn( 'Debug mode is not enabled. 
Please set FLAGS_dygraph_debug=1 to enable debug' ) return - unique_name_size = len(framework.unique_name.generator.ids) tracer_var_size = len(framework._dygraph_tracer()._vars) alive_cpp_var_size = len(core.VarBase._alive_vars()) - logging.warn( - 'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}' - .format(unique_name_size, tracer_var_size, alive_cpp_var_size)) + if not is_test: + logging.warn( + 'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}' + .format(unique_name_size, tracer_var_size, alive_cpp_var_size)) + objgraph.show_growth(limit=limit) + else: + return unique_name_size, tracer_var_size, alive_cpp_var_size def to_variable(value, block=None, name=None): diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index e5f57ac7cc4c7414567f91be19a900e088c60633..6f68cc4e1c00e705f1f74a4254499b81160ad0cd 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -20,7 +20,7 @@ from . import layers from . import parallel_helper from .. import framework from ..layers import collective -from . import to_variable +from . import to_variable, no_grad __all__ = ["prepare_context"] @@ -197,6 +197,7 @@ class DataParallel(layers.Layer): for g_var, g_shape in zip(origin_grad_vars, grad_shapes): nn.reshape(x=g_var, shape=g_shape, inplace=True) + @no_grad def apply_collective_grads(self): """ AllReduce the Parameters' gradient. 
diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index aea95f2f53049b343f00d3c58c5533b0aa45958b..799f9423a1df7bd293aace89aefc1d62e142ae63 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -23,21 +23,15 @@ from paddle.fluid import framework __all__ = ['Tracer'] -def release_op(op): - del framework._dygraph_tracer()._ops[op._trace_id] - - class Tracer(core.Tracer): """ Python wrapper of dygraph tracer """ - def __init__(self, block): - super(Tracer, self).__init__(block) + def __init__(self): + super(Tracer, self).__init__() - self._ops = defaultdict() self._vars = defaultdict() - self._trace_id = 0 self._train_mode = True def trace_var(self, name, var): @@ -47,23 +41,10 @@ class Tracer(core.Tracer): return list((item for name, item in six.iteritems(self._vars) if isinstance(item, framework.Parameter))) - def _clear_ops(self): - self._ops = defaultdict() - self._trace_id = 0 - - def trace_op(self, op, inputs, outputs, stop_gradient=False): - # record op's trace id - op.iop._trace_id = self._trace_id - - self.trace(op.iop, inputs, outputs, op.attrs, - framework._current_expected_place(), stop_gradient) - - if not stop_gradient and self._train_mode: - self._trace_id += 1 - self._ops[op.iop._trace_id] = op - - # register backward hooks and variables if needed - op.iop.register_backward_hooks(release_op) + def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False): + self.trace(type, inputs, outputs, attrs, + framework._current_expected_place(), self._train_mode and + not stop_gradient) def train_mode(self): self._train_mode = True diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 805d53815884d8ff0f070558e14153553dc211c7..475d0b9f0b0f2722213a42821371d0bd0f94459f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -458,9 +458,10 @@ class Variable(object): self._ivar = kwargs.get("ivar", None) if not 
self._ivar: self._ivar = core.VarBase( - name, dtype if dtype else core.VarDesc.VarType.FP32, - list(shape) if shape else [], - _current_expected_place(), stop_gradient, True + name, type + if type else core.VarDesc.VarType.LOD_TENSOR, dtype + if dtype else core.VarDesc.VarType.FP32, + list(shape) if shape else [], stop_gradient, True if persistable else False) if persistable: _dygraph_tracer().trace_var(name, self) @@ -582,13 +583,16 @@ class Variable(object): return np.array(new_ivar.value().get_tensor()) def backward(self, backward_strategy=None): - from .dygraph import BackwardStrategy - if backward_strategy is None: - backward_strategy = BackwardStrategy() - backward_strategy.sort_sum_gradient = False + if in_dygraph_mode(): + from .dygraph import BackwardStrategy + if backward_strategy is None: + backward_strategy = BackwardStrategy() + backward_strategy.sort_sum_gradient = False - self._ivar._run_backward(backward_strategy) - _dygraph_tracer()._clear_ops() + self._ivar._run_backward(backward_strategy, _dygraph_tracer()) + else: + raise ValueError( + "Variable.backward() is only avaliable in DyGraph mode") def gradient(self): new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True) @@ -616,9 +620,13 @@ class Variable(object): """ if in_dygraph_mode(): # TODO(panyx0718): add more dygraph debug info. 
- return 'name %s, dtype: %s shape: %s %s' % ( - self.name, self.dtype, self.shape, - str(self._ivar.value().get_tensor())) + tensor = self._ivar.value().get_tensor() + if tensor._is_initialized(): + return 'name %s, dtype: %s shape: %s %s' % ( + self.name, self.dtype, self.shape, str(tensor)) + else: + return 'name %s, shape: %s, not inited' % (self.name, + self.shape) assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -713,7 +721,7 @@ class Variable(object): @property def type(self): if in_dygraph_mode(): - return self._ivar.dtype + return self._ivar.type else: return self.desc.type() @@ -1085,9 +1093,7 @@ class Operator(object): if type is None: raise ValueError( "`type` to initialized an Operator can not be None.") - self.iop = core.OpBase(type) - self.previous_ops = [] - + self._type = type self.attrs = attrs if attrs else {} else: self.block = block @@ -1233,7 +1239,7 @@ class Operator(object): @property def type(self): if in_dygraph_mode(): - return self.iop.type + return self._type else: return self.desc.type() @@ -1787,10 +1793,12 @@ class Block(object): else: attrs['is_test'] = False + type = kwargs.get("type", None) + op = Operator( block=self, desc=None, - type=kwargs.get("type", None), + type=type, inputs=None, outputs=None, attrs=attrs) @@ -1799,9 +1807,11 @@ class Block(object): # # TODO(minqiyang): add op stop_gradient support in static mode too. # currently, we only support stop_gradient in dygraph mode. 
- _dygraph_tracer().trace_op(op, + + _dygraph_tracer().trace_op(type, kwargs.get("inputs", {}), - kwargs.get("outputs", {}), + kwargs.get("outputs", {}), attrs + if attrs else {}, kwargs.get("stop_gradient", False)) else: op_desc = self.desc.append_op() @@ -1862,17 +1872,15 @@ class Block(object): def _prepend_op(self, *args, **kwargs): if in_dygraph_mode(): + type = kwargs.get("type", None) + attrs = kwargs.get("attrs", {}) op = Operator( - self, - None, - type=kwargs.get("type", None), - inputs=None, - outputs=None, - attrs=kwargs.get("attrs", {})) + self, None, type=type, inputs=None, outputs=None, attrs=attrs) - _dygraph_tracer().trace_op(op, + _dygraph_tracer().trace_op(type, kwargs.get("inputs", {}), - kwargs.get("outputs", {}), + kwargs.get("outputs", {}), attrs + if attrs else {}, kwargs.get("stop_gradient", False)) else: op_desc = self.desc._prepend_op() diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index cbefdc850e27ee1874c435b638818aa1a748506c..03578604ad672a3f77acc47551b8d6c5c9d5364f 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -615,9 +615,6 @@ class Optimizer(object): optimize_ops = self.apply_optimize( loss, startup_program=startup_program, params_grads=params_grads) - if framework.in_dygraph_mode(): - framework._dygraph_tracer()._clear_ops() - return optimize_ops, params_grads diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 838f0277aed6f1c90f318235379ba67b429b03c5..d91b9f7b32f6e86f264bec64fbf8d87b12152610 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -177,7 +177,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api) list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op) list(REMOVE_ITEM TEST_OPS test_basic_lstm_api) list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) - +list(REMOVE_ITEM TEST_OPS test_imperative_debug_string) # Some 
ops need to check results when gc is enabled # Currently, only ops that register NoNeedBufferVarsInference need to do this test set(TEST_OPS_WITH_GC @@ -240,6 +240,7 @@ py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_ py_test_modules(test_install_check MODULES test_install_check ENVS FLAGS_cudnn_deterministic=1 SERIAL) set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST") +py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train ENVS ${dist_ENVS}) py_test_modules(test_lookup_remote_table_op MODULES test_lookup_remote_table_op ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py index 43748eca5c6375932114f0b04880609bf0e161ca..bdf5b483812fb72c47794be72cfcbb57f3dea0c3 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py @@ -27,17 +27,40 @@ import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, LayerNorm from paddle.fluid.dygraph.base import to_variable from paddle.fluid.layer_helper import LayerHelper - +import math from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase +momentum_rate = 0.9 +l2_decay = 1.2e-4 + + +def optimizer_setting(params): + ls = params["learning_strategy"] + if "total_images" not in params: + total_images = 6149 + else: + total_images = params["total_images"] + + batch_size = ls["batch_size"] + step = int(math.ceil(float(total_images) / batch_size)) + bd = [step * e for e in ls["epochs"]] + lr = params["lr"] + num_epochs = 
params["num_epochs"] + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.cosine_decay( + learning_rate=lr, step_each_epoch=step, epochs=num_epochs), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay)) + + return optimizer + class ConvBNLayer(fluid.dygraph.Layer): def __init__(self, name_scope, - num_channels, num_filters, filter_size, stride=1, @@ -46,26 +69,21 @@ class ConvBNLayer(fluid.dygraph.Layer): super(ConvBNLayer, self).__init__(name_scope) self._conv = Conv2D( - self.full_name(), + "conv2d", num_filters=num_filters, filter_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, act=None, - bias_attr=None) + bias_attr=False, + param_attr=fluid.ParamAttr(name="weights")) - self._batch_norm = BatchNorm( - self.full_name(), num_filters, act=act, momentum=0.1) - self._layer_norm = fluid.dygraph.nn.LayerNorm( - self.full_name(), begin_norm_axis=1) + self._layer_norm = LayerNorm(self.full_name(), begin_norm_axis=1) def forward(self, inputs): y = self._conv(inputs) - # FIXME(zcd): when compare the result of multi-card and single-card, - # we should replace batch_norm with layer_norm. 
y = self._layer_norm(y) - # y = self._batch_norm(y) return y @@ -76,17 +94,19 @@ class SqueezeExcitation(fluid.dygraph.Layer): super(SqueezeExcitation, self).__init__(name_scope) self._pool = Pool2D( self.full_name(), pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(num_channels * 1.0) self._squeeze = FC( self.full_name(), size=num_channels // reduction_ratio, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05)), + initializer=fluid.initializer.Uniform(-stdv, stdv)), act='relu') + stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0) self._excitation = FC( self.full_name(), size=num_channels, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05)), + initializer=fluid.initializer.Uniform(-stdv, stdv)), act='sigmoid') def forward(self, input): @@ -110,39 +130,37 @@ class BottleneckBlock(fluid.dygraph.Layer): self.conv0 = ConvBNLayer( self.full_name(), - num_channels=num_channels, num_filters=num_filters, - filter_size=1) + filter_size=1, + act="relu") self.conv1 = ConvBNLayer( self.full_name(), - num_channels=num_filters, num_filters=num_filters, filter_size=3, stride=stride, - groups=cardinality) + groups=cardinality, + act="relu") self.conv2 = ConvBNLayer( self.full_name(), - num_channels=num_filters, - num_filters=num_filters * 4, + num_filters=num_filters * 2, filter_size=1, - act='relu') + act=None) self.scale = SqueezeExcitation( self.full_name(), - num_channels=num_filters * 4, + num_channels=num_filters * 2, reduction_ratio=reduction_ratio) if not shortcut: self.short = ConvBNLayer( self.full_name(), - num_channels=num_channels, - num_filters=num_filters * 4, + num_filters=num_filters * 2, filter_size=1, stride=stride) self.shortcut = shortcut - self._num_channels_out = num_filters * 4 + self._num_channels_out = num_filters * 2 def forward(self, inputs): y = self.conv0(inputs) @@ -155,10 +173,7 @@ class BottleneckBlock(fluid.dygraph.Layer): else: short = self.short(inputs) - y = 
fluid.layers.elementwise_add(x=short, y=scale) - - layer_helper = LayerHelper(self.full_name(), act='relu') - y = layer_helper.append_activation(y) + y = fluid.layers.elementwise_add(x=short, y=scale, act='relu') return y @@ -178,7 +193,6 @@ class SeResNeXt(fluid.dygraph.Layer): num_filters = [128, 256, 512, 1024] self.conv0 = ConvBNLayer( self.full_name(), - num_channels=3, num_filters=64, filter_size=7, stride=2, @@ -196,8 +210,7 @@ class SeResNeXt(fluid.dygraph.Layer): num_filters = [128, 256, 512, 1024] self.conv0 = ConvBNLayer( self.full_name(), - num_channels=3, - num_filters=3, + num_filters=64, filter_size=7, stride=2, act='relu') @@ -214,24 +227,21 @@ class SeResNeXt(fluid.dygraph.Layer): num_filters = [128, 256, 512, 1024] self.conv0 = ConvBNLayer( self.full_name(), - num_channels=3, - num_filters=3, - filter_size=7, + num_filters=64, + filter_size=3, stride=2, act='relu') self.conv1 = ConvBNLayer( self.full_name(), - num_channels=64, - num_filters=3, - filter_size=7, - stride=2, + num_filters=64, + filter_size=3, + stride=1, act='relu') self.conv2 = ConvBNLayer( self.full_name(), - num_channels=64, - num_filters=3, - filter_size=7, - stride=2, + num_filters=128, + filter_size=3, + stride=1, act='relu') self.pool = Pool2D( self.full_name(), @@ -261,16 +271,14 @@ class SeResNeXt(fluid.dygraph.Layer): self.pool2d_avg = Pool2D( self.full_name(), pool_size=7, pool_type='avg', global_pooling=True) - import math stdv = 1.0 / math.sqrt(2048 * 1.0) - self.fc = FC(self.full_name(), - size=class_dim, - act='softmax', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) + self.out = FC(self.full_name(), + size=class_dim, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) - def forward(self, inputs, label): + def forward(self, inputs): if self.layers == 50 or self.layers == 101: y = self.conv0(inputs) y = self.pool(y) @@ -283,13 +291,8 @@ class SeResNeXt(fluid.dygraph.Layer): for 
bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) - # FIXME(zcd): the dropout should be removed when compare the - # result of multi-card and single-card. - # y = fluid.layers.dropout(y, dropout_prob=0.2, seed=1) - cost = self.fc(y) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - return avg_loss + y = self.out(y) + return y class TestSeResNeXt(TestParallelDyGraphRunnerBase): @@ -312,8 +315,11 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase): label = to_variable(y_data) label.stop_gradient = True - loss = model(img, label) - return loss + out = model(img) + softmax_out = fluid.layers.softmax(out, use_cudnn=False) + loss = fluid.layers.cross_entropy(input=softmax_out, label=label) + avg_loss = fluid.layers.mean(x=loss) + return avg_loss if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py new file mode 100644 index 0000000000000000000000000000000000000000..1b201fc7f15f629cf03adadd0899f37b75cc8134 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py @@ -0,0 +1,75 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +import numpy as np + + +class MLP(fluid.Layer): + def __init__(self, name_scope): + super(MLP, self).__init__(name_scope) + self._fc1 = fluid.dygraph.FC( + self.full_name(), + 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + self._fc2 = fluid.dygraph.FC( + self.full_name(), + 4, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + x = self._fc1(inputs) + x = self._fc2(x) + x = fluid.layers.reduce_sum(x) + return x + + +class TestDygraphDebugString(unittest.TestCase): + def test_dygraph_debug_string(self): + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + unique_name = 0 + trace_var = 0 + alive_var = 0 + with fluid.dygraph.guard(): + mlp = MLP("mlp") + for i in range(10): + var_inp = fluid.dygraph.base.to_variable(np_inp) + out = mlp(var_inp) + out.backward() + mlp.clear_gradients() + unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg( + is_test=True) + if i > 0: + self.assertGreaterEqual(unique_name, unique_name_tmp) + self.assertGreaterEqual(trace_var, trace_var_tmp) + self.assertGreaterEqual(alive_var, alive_var_tmp) + else: + unique_name = unique_name_tmp + trace_var = trace_var_tmp + alive_var = alive_var_tmp + try: + fluid.dygraph.base._print_debug_msg() + except Exception as e: + raise RuntimeError( + "No Exception is accepted in _print_debug_msg, but we got: {}". 
+ format(e)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_framework.py b/python/paddle/fluid/tests/unittests/test_imperative_framework.py new file mode 100644 index 0000000000000000000000000000000000000000..0f83f89f7bd3876f6a9a8aedb9ca43082395f7a9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_framework.py @@ -0,0 +1,66 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +import numpy as np +from test_imperative_base import new_program_scope + + +class MLP(fluid.Layer): + def __init__(self, name_scope): + super(MLP, self).__init__(name_scope) + self._fc1 = fluid.dygraph.FC( + self.full_name(), + 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + self._fc2 = fluid.dygraph.FC( + self.full_name(), + 4, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + x = self._fc1(inputs) + x = self._fc2(x) + x = fluid.layers.reduce_sum(x) + return x + + +class TestDygraphFramework(unittest.TestCase): + def test_dygraph_backward(self): + with new_program_scope(): + mlp = MLP("mlp") + var_inp = fluid.layers.data( + "input", 
shape=[2, 2], dtype="float32", append_batch_size=False) + out = mlp(var_inp) + try: + out.backward() + raise AssertionError( + "backward should not be usable in static graph mode") + except ValueError as e: + self.assertTrue((e is not None)) + + def test_dygraph_to_string(self): + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + with fluid.dygraph.guard(): + var_inp = fluid.dygraph.base.to_variable(np_inp) + var_inp.to_string(throw_on_error=True) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py deleted file mode 100644 index 7c4721fdce4334d49fe54159c069f83a593d650c..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py +++ /dev/null @@ -1,1082 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -from paddle.fluid import Embedding, LayerNorm, FC, Layer -from paddle.fluid.dygraph import to_variable, guard -from test_imperative_base import new_program_scope -from paddle.fluid import core -import numpy as np -import six -np.set_printoptions(suppress=True) - - -# Copy from models -class TrainTaskConfig(object): - # support both CPU and GPU now. - use_gpu = True - # the epoch number to train. 
- pass_num = 30 - # the number of sequences contained in a mini-batch. - # deprecated, set batch_size in args. - batch_size = 32 - # the hyper parameters for Adam optimizer. - # This static learning_rate will be multiplied to the LearningRateScheduler - # derived learning rate the to get the final learning rate. - learning_rate = 2.0 - beta1 = 0.9 - beta2 = 0.997 - eps = 1e-9 - # the parameters for learning rate scheduling. - warmup_steps = 8000 - # the weight used to mix up the ground-truth distribution and the fixed - # uniform distribution in label smoothing when training. - # Set this as zero if label smoothing is not wanted. - label_smooth_eps = 0.1 - # the directory for saving trained models. - model_dir = "trained_models" - # the directory for saving checkpoints. - ckpt_dir = "trained_ckpts" - # the directory for loading checkpoint. - # If provided, continue training from the checkpoint. - ckpt_path = None - # the parameter to initialize the learning rate scheduler. - # It should be provided if use checkpoints, since the checkpoint doesn't - # include the training step counter currently. - start_step = 0 - # the frequency to save trained models. - save_freq = 10000 - - -class InferTaskConfig(object): - use_gpu = True - # the number of examples in one run for sequence generation. - batch_size = 10 - # the parameters for beam search. - beam_size = 5 - max_out_len = 256 - # the number of decoded sentences to output. - n_best = 1 - # the flags indicating whether to output the special tokens. - output_bos = False - output_eos = False - output_unk = True - # the directory for loading the trained model. - model_path = "trained_models/pass_1.infer.model" - - -class ModelHyperParams(object): - # These following five vocabularies related configurations will be set - # automatically according to the passed vocabulary path and special tokens. - # size of source word dictionary. 
- src_vocab_size = 10000 - # size of target word dictionay - trg_vocab_size = 10000 - # index for token - bos_idx = 0 - # index for token - eos_idx = 1 - # index for token - unk_idx = 2 - # max length of sequences deciding the size of position encoding table. - max_length = 4 - # the dimension for word embeddings, which is also the last dimension of - # the input and output of multi-head attention, position-wise feed-forward - # networks, encoder and decoder. - d_model = 512 - # size of the hidden layer in position-wise feed-forward networks. - d_inner_hid = 2048 - # the dimension that keys are projected to for dot-product attention. - d_key = 64 - # the dimension that values are projected to for dot-product attention. - d_value = 64 - # number of head used in multi-head attention. - n_head = 8 - # number of sub-layers to be stacked in the encoder and decoder. - n_layer = 6 - # dropout rates of different modules. - prepostprocess_dropout = 0.1 - attention_dropout = 0.1 - relu_dropout = 0.1 - # to process before each sub-layer - preprocess_cmd = "n" # layer normalization - # to process after each sub-layer - postprocess_cmd = "da" # dropout + residual connection - # random seed used in dropout for CE. - dropout_seed = None - # the flag indicating whether to share embedding and softmax weights. - # vocabularies in source and target should be same for weight sharing. - weight_sharing = True - - -def merge_cfg_from_list(cfg_list, g_cfgs): - """ - Set the above global configurations using the cfg_list. - """ - assert len(cfg_list) % 2 == 0 - for key, value in zip(cfg_list[0::2], cfg_list[1::2]): - for g_cfg in g_cfgs: - if hasattr(g_cfg, key): - try: - value = eval(value) - except Exception: # for file path - pass - setattr(g_cfg, key, value) - break - - -def position_encoding_init(n_position, d_pos_vec): - """ - Generate the initial values for the sinusoid position encoding table. 
- """ - channels = d_pos_vec - position = np.arange(n_position) - num_timescales = channels // 2 - log_timescale_increment = (np.log(float(1e4) / float(1)) / - (num_timescales - 1)) - inv_timescales = np.exp(np.arange( - num_timescales)) * -log_timescale_increment - scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, - 0) - signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) - signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant') - position_enc = signal - return position_enc.astype("float32") - - -def create_data(is_static=False): - if is_static: - return [ - src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np, - trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np, - lbl_weight_np - ] - else: - enc_inputs = [ - to_variable( - src_word_np, name='src_word'), to_variable( - src_pos_np, name='src_pos'), to_variable( - src_slf_attn_bias_np, name='src_slf_attn_bias') - ] - dec_inputs = [ - to_variable( - trg_word_np, name='trg_word'), to_variable( - trg_pos_np, name='trg_pos'), to_variable( - trg_slf_attn_bias_np, name='trg_slf_attn_bias'), - to_variable( - trg_src_attn_bias_np, name='trg_src_attn_bias') - ] - label = to_variable(lbl_word_np, name='lbl_word') - weight = to_variable(lbl_weight_np, name='lbl_weight') - return enc_inputs, dec_inputs, label, weight - - -def create_feed_dict_list(data, init=False): - if init: - data_input_names = encoder_data_input_fields + \ - decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names - else: - data_input_names = encoder_data_input_fields + \ - decoder_data_input_fields[:-1] + label_data_input_fields - feed_dict_list = dict() - for i in range(len(data_input_names)): - feed_dict_list[data_input_names[i]] = data[i] - return feed_dict_list - - -def make_all_inputs(input_fields): - """ - Define the input data layers for the transformer model. 
- """ - inputs = [] - for input_field in input_fields: - input_var = fluid.layers.data( - name=input_field, - shape=input_descs[input_field][0], - dtype=input_descs[input_field][1], - lod_level=input_descs[input_field][2] - if len(input_descs[input_field]) == 3 else 0, - append_batch_size=False) - inputs.append(input_var) - return inputs - - -# The placeholder for batch_size in compile time. Must be -1 currently to be -# consistent with some ops' infer-shape output in compile time, such as the -# sequence_expand op used in beamsearch decoder. -batch_size = -1 -# The placeholder for squence length in compile time. -seq_len = ModelHyperParams.max_length -# Here list the data shapes and data types of all inputs. -# The shapes here act as placeholder and are set to pass the infer-shape in -# compile time. -input_descs = { - # The actual data shape of src_word is: - # [batch_size, max_src_len_in_batch, 1] - "src_word": [(batch_size, seq_len, 1), "int64", 2], - # The actual data shape of src_pos is: - # [batch_size, max_src_len_in_batch, 1] - "src_pos": [(batch_size, seq_len, 1), "int64"], - # This input is used to remove attention weights on paddings in the - # encoder. - # The actual data shape of src_slf_attn_bias is: - # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch] - "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, - seq_len), "float32"], - # The actual data shape of trg_word is: - # [batch_size, max_trg_len_in_batch, 1] - "trg_word": [(batch_size, seq_len, 1), "int64", - 2], # lod_level is only used in fast decoder. - # The actual data shape of trg_pos is: - # [batch_size, max_trg_len_in_batch, 1] - "trg_pos": [(batch_size, seq_len, 1), "int64"], - # This input is used to remove attention weights on paddings and - # subsequent words in the decoder. 
- # The actual data shape of trg_slf_attn_bias is: - # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch] - "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, - seq_len), "float32"], - # This input is used to remove attention weights on paddings of the source - # input in the encoder-decoder attention. - # The actual data shape of trg_src_attn_bias is: - # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch] - "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, - seq_len), "float32"], - # This input is used in independent decoder program for inference. - # The actual data shape of enc_output is: - # [batch_size, max_src_len_in_batch, d_model] - "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"], - # The actual data shape of label_word is: - # [batch_size * max_trg_len_in_batch, 1] - "lbl_word": [(batch_size * seq_len, 1), "int64"], - # This input is used to mask out the loss of paddding tokens. - # The actual data shape of label_weight is: - # [batch_size * max_trg_len_in_batch, 1] - "lbl_weight": [(batch_size * seq_len, 1), "float32"], - # This input is used in beam-search decoder. - "init_score": [(batch_size, 1), "float32", 2], - # This input is used in beam-search decoder for the first gather - # (cell states updation) - "init_idx": [(batch_size, ), "int32"], -} - -# Names of word embedding table which might be reused for weight sharing. -word_emb_param_names = ( - "src_word_emb_table", - "trg_word_emb_table", ) -# Names of position encoding table which will be initialized externally. -pos_enc_param_names = ( - "src_pos_enc_table", - "trg_pos_enc_table", ) -# separated inputs for different usages. 
-encoder_data_input_fields = ( - "src_word", - "src_pos", - "src_slf_attn_bias", ) -decoder_data_input_fields = ( - "trg_word", - "trg_pos", - "trg_slf_attn_bias", - "trg_src_attn_bias", - "enc_output", ) -label_data_input_fields = ( - "lbl_word", - "lbl_weight", ) -# In fast decoder, trg_pos (only containing the current time step) is generated -# by ops and trg_slf_attn_bias is not needed. -fast_decoder_data_input_fields = ( - "trg_word", - "init_score", - "init_idx", - "trg_src_attn_bias", ) -# if we use py_reader -use_py_reader = False - -# if we run sync mode -sync = False - -# how many batches we use -batch_num = 5 - -np.random.seed = 90 -src_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape( - [TrainTaskConfig.batch_size, seq_len, 1]).astype('int64') -src_pos_np = np.random.randint( - 1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') -src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, - ModelHyperParams.n_head, seq_len, - seq_len).astype('float32') - -trg_word_np = np.arange(1, TrainTaskConfig.batch_size * seq_len + 1).reshape( - [TrainTaskConfig.batch_size, seq_len, 1]).astype('int64') -trg_pos_np = np.random.randint( - 1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') -trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, - ModelHyperParams.n_head, seq_len, - seq_len).astype('float32') -trg_src_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, - ModelHyperParams.n_head, seq_len, - seq_len).astype('float32') - -lbl_word_np = np.random.randint( - 1, - ModelHyperParams.src_vocab_size - 1, - size=(TrainTaskConfig.batch_size * seq_len, 1), - dtype='int64') -lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len, - 1).astype('float32') - -pos_inp1 = position_encoding_init(ModelHyperParams.max_length, - ModelHyperParams.d_model) -pos_inp2 = position_encoding_init(ModelHyperParams.max_length, - ModelHyperParams.d_model) - - -class 
PrePostProcessLayer(Layer): - def __init__(self, name_scope, process_cmd, shape_len=None): - super(PrePostProcessLayer, self).__init__(name_scope) - for cmd in process_cmd: - if cmd == "n": - self._layer_norm = LayerNorm( - name_scope=self.full_name(), - begin_norm_axis=shape_len - 1, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.))) - - def forward(self, prev_out, out, process_cmd, dropout_rate=0.): - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out = self._layer_norm(out) - elif cmd == "d": # add dropout - if dropout_rate: - out = fluid.layers.dropout( - out, - dropout_prob=dropout_rate, - seed=ModelHyperParams.dropout_seed, - is_test=False) - return out - - -class PositionwiseFeedForwardLayer(Layer): - def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate): - super(PositionwiseFeedForwardLayer, self).__init__(name_scope) - self._i2h = FC(name_scope=self.full_name(), - size=d_inner_hid, - num_flatten_dims=2, - act="relu") - self._h2o = FC(name_scope=self.full_name(), - size=d_hid, - num_flatten_dims=2) - self._dropout_rate = dropout_rate - - def forward(self, x): - hidden = self._i2h(x) - if self._dropout_rate: - hidden = fluid.layers.dropout( - hidden, - dropout_prob=self._dropout_rate, - seed=ModelHyperParams.dropout_seed, - is_test=False) - out = self._h2o(hidden) - return out - - -class MultiHeadAttentionLayer(Layer): - def __init__(self, - name_scope, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - gather_idx=None, - static_kv=False): - super(MultiHeadAttentionLayer, self).__init__(name_scope) - self._n_head = n_head - self._d_key = d_key - self._d_value = d_value - self._d_model = d_model - self._dropout_rate = dropout_rate - self._q_fc = FC(name_scope=self.full_name(), - size=d_key * n_head, - 
bias_attr=False, - num_flatten_dims=2) - self._k_fc = FC(name_scope=self.full_name(), - size=d_key * n_head, - bias_attr=False, - num_flatten_dims=2) - self._v_fc = FC(name_scope=self.full_name(), - size=d_value * n_head, - bias_attr=False, - num_flatten_dims=2) - self._proj_fc = FC(name_scope=self.full_name(), - size=self._d_model, - bias_attr=False, - num_flatten_dims=2) - - def forward(self, queries, keys, values, attn_bias): - # compute q ,k ,v - keys = queries if keys is None else keys - values = keys if values is None else values - - q = self._q_fc(queries) - k = self._k_fc(keys) - v = self._v_fc(values) - - # split head - reshaped_q = fluid.layers.reshape( - x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False) - transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) - reshaped_k = fluid.layers.reshape( - x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False) - transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) - reshaped_v = fluid.layers.reshape( - x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False) - transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) - - # scale dot product attention - product = fluid.layers.matmul( - x=transpose_q, - y=transpose_k, - transpose_y=True, - alpha=self._d_model**-0.5) - if attn_bias: - product += attn_bias - weights = fluid.layers.softmax(product) - if self._dropout_rate: - weights_droped = fluid.layers.dropout( - weights, - dropout_prob=self._dropout_rate, - seed=ModelHyperParams.dropout_seed, - is_test=False) - out = fluid.layers.matmul(weights_droped, transpose_v) - else: - out = fluid.layers.matmul(weights, transpose_v) - - # combine heads - if len(out.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) - final_out = fluid.layers.reshape( - x=trans_x, - shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=False) - - # fc to output - proj_out = 
self._proj_fc(final_out) - return proj_out - - -class EncoderSubLayer(Layer): - def __init__(self, - name_scope, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da"): - - super(EncoderSubLayer, self).__init__(name_scope) - self._preprocess_cmd = preprocess_cmd - self._postprocess_cmd = postprocess_cmd - self._prepostprocess_dropout = prepostprocess_dropout - - self._preprocess_layer = PrePostProcessLayer(self.full_name(), - self._preprocess_cmd, 3) - self._multihead_attention_layer = MultiHeadAttentionLayer( - self.full_name(), d_key, d_value, d_model, n_head, - attention_dropout) - self._postprocess_layer = PrePostProcessLayer( - self.full_name(), self._postprocess_cmd, None) - self._preprocess_layer2 = PrePostProcessLayer(self.full_name(), - self._preprocess_cmd, 3) - self._positionwise_feed_forward = PositionwiseFeedForwardLayer( - self.full_name(), d_inner_hid, d_model, relu_dropout) - self._postprocess_layer2 = PrePostProcessLayer( - self.full_name(), self._postprocess_cmd, None) - - def forward(self, enc_input, attn_bias): - pre_process_multihead = self._preprocess_layer( - None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout) - attn_output = self._multihead_attention_layer(pre_process_multihead, - None, None, attn_bias) - attn_output = self._postprocess_layer(enc_input, attn_output, - self._postprocess_cmd, - self._prepostprocess_dropout) - pre_process2_output = self._preprocess_layer2( - None, attn_output, self._preprocess_cmd, - self._prepostprocess_dropout) - ffd_output = self._positionwise_feed_forward(pre_process2_output) - return self._postprocess_layer2(attn_output, ffd_output, - self._postprocess_cmd, - self._prepostprocess_dropout) - - -class EncoderLayer(Layer): - def __init__(self, - name_scope, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, 
- preprocess_cmd="n", - postprocess_cmd="da"): - - super(EncoderLayer, self).__init__(name_scope) - self._preprocess_cmd = preprocess_cmd - self._encoder_sublayers = list() - self._prepostprocess_dropout = prepostprocess_dropout - self._n_layer = n_layer - self._preprocess_layer = PrePostProcessLayer(self.full_name(), - self._preprocess_cmd, 3) - for i in range(n_layer): - self._encoder_sublayers.append( - self.add_sublayer( - 'esl_%d' % i, - EncoderSubLayer( - self.full_name(), n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, attention_dropout, - relu_dropout, preprocess_cmd, postprocess_cmd))) - - def forward(self, enc_input, attn_bias): - for i in range(self._n_layer): - enc_output = self._encoder_sublayers[i](enc_input, attn_bias) - enc_input = enc_output - - return self._preprocess_layer(None, enc_output, self._preprocess_cmd, - self._prepostprocess_dropout) - - -class PrepareEncoderDecoderLayer(Layer): - def __init__(self, - name_scope, - src_vocab_size, - src_emb_dim, - src_max_len, - dropout_rate, - word_emb_param_name=None, - pos_enc_param_name=None): - super(PrepareEncoderDecoderLayer, self).__init__(name_scope) - self._src_max_len = src_max_len - self._src_emb_dim = src_emb_dim - self._src_vocab_size = src_vocab_size - self._dropout_rate = dropout_rate - self._input_emb = Embedding( - name_scope=self.full_name(), - size=[src_vocab_size, src_emb_dim], - padding_idx=0, - param_attr=fluid.ParamAttr( - name=word_emb_param_name, - initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) - - if pos_enc_param_name is pos_enc_param_names[0]: - pos_inp = pos_inp1 - else: - pos_inp = pos_inp2 - self._pos_emb = Embedding( - name_scope=self.full_name(), - size=[self._src_max_len, src_emb_dim], - param_attr=fluid.ParamAttr( - name=pos_enc_param_name, - initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), - trainable=False)) - - # use in dygraph_mode to fit different length batch - # self._pos_emb._w = to_variable( - # 
position_encoding_init(self._src_max_len, self._src_emb_dim)) - - def forward(self, src_word, src_pos): - src_word_emb = self._input_emb(src_word) - src_word_emb = fluid.layers.scale( - x=src_word_emb, scale=self._src_emb_dim**0.5) - # # TODO change this to fit dynamic length input - src_pos_emb = self._pos_emb(src_pos) - src_pos_emb.stop_gradient = True - enc_input = src_word_emb + src_pos_emb - return fluid.layers.dropout( - enc_input, - dropout_prob=self._dropout_rate, - seed=ModelHyperParams.dropout_seed, - is_test=False) if self._dropout_rate else enc_input - - -class WrapEncoderLayer(Layer): - def __init__(self, name_cope, src_vocab_size, max_length, n_layer, n_head, - d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, - attention_dropout, relu_dropout, preprocess_cmd, - postprocess_cmd, weight_sharing): - """ - The wrapper assembles together all needed layers for the encoder. - """ - super(WrapEncoderLayer, self).__init__(name_cope) - - self._prepare_encoder_layer = PrepareEncoderDecoderLayer( - self.full_name(), - src_vocab_size, - d_model, - max_length, - prepostprocess_dropout, - word_emb_param_name=word_emb_param_names[0], - pos_enc_param_name=pos_enc_param_names[0]) - self._encoder = EncoderLayer( - self.full_name(), n_layer, n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, attention_dropout, - relu_dropout, preprocess_cmd, postprocess_cmd) - - def forward(self, enc_inputs): - src_word, src_pos, src_slf_attn_bias = enc_inputs - enc_input = self._prepare_encoder_layer(src_word, src_pos) - enc_output = self._encoder(enc_input, src_slf_attn_bias) - return enc_output - - -class DecoderSubLayer(Layer): - def __init__(self, - name_scope, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - cache=None, - gather_idx=None): - super(DecoderSubLayer, self).__init__(name_scope) - self._postprocess_cmd = postprocess_cmd - 
self._preprocess_cmd = preprocess_cmd - self._prepostprcess_dropout = prepostprocess_dropout - self._pre_process_layer = PrePostProcessLayer(self.full_name(), - preprocess_cmd, 3) - self._multihead_attention_layer = MultiHeadAttentionLayer( - self.full_name(), - d_key, - d_value, - d_model, - n_head, - attention_dropout, - cache=cache, - gather_idx=gather_idx) - self._post_process_layer = PrePostProcessLayer(self.full_name(), - postprocess_cmd, None) - self._pre_process_layer2 = PrePostProcessLayer(self.full_name(), - preprocess_cmd, 3) - self._multihead_attention_layer2 = MultiHeadAttentionLayer( - self.full_name(), - d_key, - d_value, - d_model, - n_head, - attention_dropout, - cache=cache, - gather_idx=gather_idx, - static_kv=True) - self._post_process_layer2 = PrePostProcessLayer(self.full_name(), - postprocess_cmd, None) - self._pre_process_layer3 = PrePostProcessLayer(self.full_name(), - preprocess_cmd, 3) - self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer( - self.full_name(), d_inner_hid, d_model, relu_dropout) - self._post_process_layer3 = PrePostProcessLayer(self.full_name(), - postprocess_cmd, None) - - def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias): - pre_process_rlt = self._pre_process_layer( - None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout) - slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None, - None, slf_attn_bias) - slf_attn_output_pp = self._post_process_layer( - dec_input, slf_attn_output, self._postprocess_cmd, - self._prepostprcess_dropout) - pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp, - self._preprocess_cmd, - self._prepostprcess_dropout) - enc_attn_output_pp = self._multihead_attention_layer2( - pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias) - enc_attn_output = self._post_process_layer2( - slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd, - self._prepostprcess_dropout) - pre_process_rlt3 = 
self._pre_process_layer3(None, enc_attn_output, - self._preprocess_cmd, - self._prepostprcess_dropout) - ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3) - dec_output = self._post_process_layer3(enc_attn_output, ffd_output, - self._postprocess_cmd, - self._prepostprcess_dropout) - return dec_output - - -class DecoderLayer(Layer): - def __init__(self, - name_scope, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - caches=None, - gather_idx=None): - super(DecoderLayer, self).__init__(name_scope) - self._pre_process_layer = PrePostProcessLayer(self.full_name(), - preprocess_cmd, 3) - self._decoder_sub_layers = list() - self._n_layer = n_layer - self._preprocess_cmd = preprocess_cmd - self._prepostprocess_dropout = prepostprocess_dropout - for i in range(n_layer): - self._decoder_sub_layers.append( - self.add_sublayer( - 'dsl_%d' % i, - DecoderSubLayer( - self.full_name(), - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - cache=None if caches is None else caches[i], - gather_idx=gather_idx))) - - def forward(self, dec_input, enc_output, dec_slf_attn_bias, - dec_enc_attn_bias): - for i in range(self._n_layer): - tmp_dec_output = self._decoder_sub_layers[i]( - dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias) - dec_input = tmp_dec_output - - dec_output = self._pre_process_layer(None, tmp_dec_output, - self._preprocess_cmd, - self._prepostprocess_dropout) - return dec_output - - -class WrapDecoderLayer(Layer): - def __init__(self, - name_scope, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - caches=None, - gather_idx=None): - """ - The wrapper 
assembles together all needed layers for the encoder. - """ - super(WrapDecoderLayer, self).__init__(name_scope) - - self._prepare_decoder_layer = PrepareEncoderDecoderLayer( - self.full_name(), - trg_vocab_size, - d_model, - max_length, - prepostprocess_dropout, - word_emb_param_name=word_emb_param_names[1], - pos_enc_param_name=pos_enc_param_names[1]) - self._decoder_layer = DecoderLayer( - self.full_name(), - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - caches=caches, - gather_idx=gather_idx) - self._weight_sharing = weight_sharing - if not weight_sharing: - self._fc = FC(self.full_name(), - size=trg_vocab_size, - bias_attr=False) - - def forward(self, dec_inputs=None, enc_output=None): - trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs - dec_input = self._prepare_decoder_layer(trg_word, trg_pos) - dec_output = self._decoder_layer(dec_input, enc_output, - trg_slf_attn_bias, trg_src_attn_bias) - - dec_output_reshape = fluid.layers.reshape( - dec_output, shape=[-1, dec_output.shape[-1]], inplace=False) - - if self._weight_sharing: - predict = fluid.layers.matmul( - x=dec_output_reshape, - y=self._prepare_decoder_layer._input_emb._w, - transpose_y=True) - else: - predict = self._fc(dec_output_reshape) - - if dec_inputs is None: - # Return probs for independent decoder program. 
- predict_out = fluid.layers.softmax(predict) - return predict_out - return predict - - -class TransFormer(Layer): - def __init__(self, - name_scope, - src_vocab_size, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - label_smooth_eps, - use_py_reader=False, - is_test=False): - super(TransFormer, self).__init__(name_scope) - self._label_smooth_eps = label_smooth_eps - self._trg_vocab_size = trg_vocab_size - if weight_sharing: - assert src_vocab_size == trg_vocab_size, ( - "Vocabularies in source and target should be same for weight sharing." - ) - self._wrap_encoder_layer = WrapEncoderLayer( - self.full_name(), src_vocab_size, max_length, n_layer, n_head, - d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, - attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, - weight_sharing) - self._wrap_decoder_layer = WrapDecoderLayer( - self.full_name(), trg_vocab_size, max_length, n_layer, n_head, - d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, - attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, - weight_sharing) - - if weight_sharing: - self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w - - def forward(self, enc_inputs, dec_inputs, label, weights): - enc_output = self._wrap_encoder_layer(enc_inputs) - predict = self._wrap_decoder_layer(dec_inputs, enc_output) - if self._label_smooth_eps: - label_out = fluid.layers.label_smooth( - label=fluid.layers.one_hot( - input=label, depth=self._trg_vocab_size), - epsilon=self._label_smooth_eps) - - cost = fluid.layers.softmax_with_cross_entropy( - logits=predict, - label=label_out, - soft_label=True if self._label_smooth_eps else False) - weighted_cost = cost * weights - sum_cost = fluid.layers.reduce_sum(weighted_cost) - token_num = 
fluid.layers.reduce_sum(weights) - token_num.stop_gradient = True - avg_cost = sum_cost / token_num - return sum_cost, avg_cost, predict, token_num - - -class TestDygraphTransformer(unittest.TestCase): - def test_transformer_float32(self): - seed = 90 - with guard(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - transformer = TransFormer( - 'transformer', - ModelHyperParams.src_vocab_size, - ModelHyperParams.trg_vocab_size, - ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, - ModelHyperParams.n_head, - ModelHyperParams.d_key, - ModelHyperParams.d_value, - ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, - ModelHyperParams.prepostprocess_dropout, - ModelHyperParams.attention_dropout, - ModelHyperParams.relu_dropout, - ModelHyperParams.preprocess_cmd, - ModelHyperParams.postprocess_cmd, - ModelHyperParams.weight_sharing, - TrainTaskConfig.label_smooth_eps, - use_py_reader=use_py_reader, - is_test=False) - if sync: - lr_decay = fluid.layers.learning_rate_scheduler.noam_decay( - ModelHyperParams.d_model, TrainTaskConfig.warmup_steps) - with fluid.default_main_program()._lr_schedule_guard(): - learning_rate = lr_decay * TrainTaskConfig.learning_rate - optimizer = fluid.optimizer.Adam( - learning_rate=learning_rate, - beta1=TrainTaskConfig.beta1, - beta2=TrainTaskConfig.beta2, - epsilon=TrainTaskConfig.eps) - else: - optimizer = fluid.optimizer.SGD(learning_rate=0.003) - dy_param_init = dict() - dy_param_updated = dict() - for i in range(batch_num): - enc_inputs, dec_inputs, label, weights = create_data() - dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer( - enc_inputs, dec_inputs, label, weights) - - if i == 0: - for param in transformer.parameters(): - dy_param_init[param.name] = param.numpy() - - dy_avg_cost.backward() - optimizer.minimize(dy_avg_cost) - transformer.clear_gradients() - - if i == batch_num - 1: - for param in transformer.parameters(): - 
dy_param_updated[param.name] = param.numpy() - - with new_program_scope(): - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - transformer = TransFormer( - 'transformer', - ModelHyperParams.src_vocab_size, - ModelHyperParams.trg_vocab_size, - ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, - ModelHyperParams.n_head, - ModelHyperParams.d_key, - ModelHyperParams.d_value, - ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, - ModelHyperParams.prepostprocess_dropout, - ModelHyperParams.attention_dropout, - ModelHyperParams.relu_dropout, - ModelHyperParams.preprocess_cmd, - ModelHyperParams.postprocess_cmd, - ModelHyperParams.weight_sharing, - TrainTaskConfig.label_smooth_eps, - use_py_reader=use_py_reader, - is_test=False) - exe = fluid.Executor(fluid.CPUPlace( - ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - optimizer = fluid.optimizer.SGD(learning_rate=0.003) - - data_input_names = encoder_data_input_fields + decoder_data_input_fields[: - -1] + label_data_input_fields - all_inputs = make_all_inputs(data_input_names) - enc_inputs_len = len(encoder_data_input_fields) - dec_inputs_len = len(decoder_data_input_fields[:-1]) - enc_inputs = all_inputs[0:enc_inputs_len] - dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + - dec_inputs_len] - label = all_inputs[-2] - weights = all_inputs[-1] - static_param_updated = dict() - static_param_init = dict() - static_param_name_list = list() - static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer( - enc_inputs, dec_inputs, label, weights) - optimizer.minimize(static_avg_cost) - for param in transformer.parameters(): - static_param_name_list.append(param.name) - out = exe.run(fluid.default_startup_program(), - fetch_list=static_param_name_list) - for i in range(len(static_param_name_list)): - static_param_init[static_param_name_list[i]] = out[i] - static_sum_cost_value = None - static_avg_cost_value = None - 
static_predict_value = None - static_token_num_value = None - for i in range(batch_num): - feed_dict = create_feed_dict_list(create_data(True)) - fetch_list = [ - static_sum_cost, static_avg_cost, static_predict, - static_token_num - ] - - fetch_list.extend(static_param_name_list) - out = exe.run(fluid.default_main_program(), - feed=feed_dict, - fetch_list=fetch_list) - static_sum_cost_value = out[0] - static_avg_cost_value = out[1] - static_predict_value = out[2] - static_token_num_value = out[3] - if i == batch_num - 1: - for k in range(4, len(out)): - static_param_updated[static_param_name_list[k - - 4]] = out[k] - - self.assertTrue( - np.array_equal(static_avg_cost_value, dy_avg_cost.numpy())) - self.assertTrue( - np.array_equal(static_sum_cost_value, dy_sum_cost.numpy())) - self.assertTrue( - np.array_equal(static_predict_value, dy_predict.numpy())) - self.assertTrue( - np.array_equal(static_token_num_value, dy_token_num.numpy())) - - for key, value in six.iteritems(static_param_init): - self.assertTrue(np.array_equal(value, dy_param_init[key])) - for key, value in six.iteritems(static_param_updated): - self.assertTrue(np.array_equal(value, dy_param_updated[key])) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 1bed866db82d5aed908a94e4b6d2e9958f15ebee..51fb66f7743e7d79fb3d75feb2d32e080f1f48df 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -20,12 +20,144 @@ from paddle.fluid import Embedding, LayerNorm, FC, Layer from paddle.fluid.dygraph import to_variable, guard from test_imperative_base import new_program_scope from paddle.fluid import core -from test_imperative_transformer import TransFormer, TrainTaskConfig, ModelHyperParams import 
numpy as np import six np.set_printoptions(suppress=True) +# Copy from models +class TrainTaskConfig(object): + # support both CPU and GPU now. + use_gpu = True + # the epoch number to train. + pass_num = 30 + # the number of sequences contained in a mini-batch. + # deprecated, set batch_size in args. + batch_size = 32 + # the hyper parameters for Adam optimizer. + # This static learning_rate will be multiplied to the LearningRateScheduler + # derived learning rate the to get the final learning rate. + learning_rate = 2.0 + beta1 = 0.9 + beta2 = 0.997 + eps = 1e-9 + # the parameters for learning rate scheduling. + warmup_steps = 8000 + # the weight used to mix up the ground-truth distribution and the fixed + # uniform distribution in label smoothing when training. + # Set this as zero if label smoothing is not wanted. + label_smooth_eps = 0.1 + # the directory for saving trained models. + model_dir = "trained_models" + # the directory for saving checkpoints. + ckpt_dir = "trained_ckpts" + # the directory for loading checkpoint. + # If provided, continue training from the checkpoint. + ckpt_path = None + # the parameter to initialize the learning rate scheduler. + # It should be provided if use checkpoints, since the checkpoint doesn't + # include the training step counter currently. + start_step = 0 + # the frequency to save trained models. + save_freq = 10000 + + +class InferTaskConfig(object): + use_gpu = True + # the number of examples in one run for sequence generation. + batch_size = 10 + # the parameters for beam search. + beam_size = 5 + max_out_len = 256 + # the number of decoded sentences to output. + n_best = 1 + # the flags indicating whether to output the special tokens. + output_bos = False + output_eos = False + output_unk = True + # the directory for loading the trained model. 
+ model_path = "trained_models/pass_1.infer.model" + + +class ModelHyperParams(object): + # These following five vocabularies related configurations will be set + # automatically according to the passed vocabulary path and special tokens. + # size of source word dictionary. + src_vocab_size = 10000 + # size of target word dictionay + trg_vocab_size = 10000 + # index for token + bos_idx = 0 + # index for token + eos_idx = 1 + # index for token + unk_idx = 2 + # max length of sequences deciding the size of position encoding table. + max_length = 4 + # the dimension for word embeddings, which is also the last dimension of + # the input and output of multi-head attention, position-wise feed-forward + # networks, encoder and decoder. + d_model = 512 + # size of the hidden layer in position-wise feed-forward networks. + d_inner_hid = 2048 + # the dimension that keys are projected to for dot-product attention. + d_key = 64 + # the dimension that values are projected to for dot-product attention. + d_value = 64 + # number of head used in multi-head attention. + n_head = 8 + # number of sub-layers to be stacked in the encoder and decoder. + n_layer = 6 + # dropout rates of different modules. + prepostprocess_dropout = 0.1 + attention_dropout = 0.1 + relu_dropout = 0.1 + # to process before each sub-layer + preprocess_cmd = "n" # layer normalization + # to process after each sub-layer + postprocess_cmd = "da" # dropout + residual connection + # random seed used in dropout for CE. + dropout_seed = None + # the flag indicating whether to share embedding and softmax weights. + # vocabularies in source and target should be same for weight sharing. + weight_sharing = True + + +def merge_cfg_from_list(cfg_list, g_cfgs): + """ + Set the above global configurations using the cfg_list. 
+ """ + assert len(cfg_list) % 2 == 0 + for key, value in zip(cfg_list[0::2], cfg_list[1::2]): + for g_cfg in g_cfgs: + if hasattr(g_cfg, key): + try: + value = eval(value) + except Exception: # for file path + pass + setattr(g_cfg, key, value) + break + + +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. + """ + channels = d_pos_vec + position = np.arange(n_position) + num_timescales = channels // 2 + log_timescale_increment = (np.log(float(1e4) / float(1)) / + (num_timescales - 1)) + inv_timescales = np.exp(np.arange( + num_timescales)) * -log_timescale_increment + scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, + 0) + signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) + signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant') + position_enc = signal + return position_enc.astype("float32") + + def create_data(is_static=False): if is_static: return [ @@ -208,6 +340,598 @@ lbl_word_np = np.random.randint( lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len, 1).astype('float32') +pos_inp1 = position_encoding_init(ModelHyperParams.max_length, + ModelHyperParams.d_model) +pos_inp2 = position_encoding_init(ModelHyperParams.max_length, + ModelHyperParams.d_model) + + +class PrePostProcessLayer(Layer): + def __init__(self, name_scope, process_cmd, shape_len=None): + super(PrePostProcessLayer, self).__init__(name_scope) + for cmd in process_cmd: + if cmd == "n": + self._layer_norm = LayerNorm( + name_scope=self.full_name(), + begin_norm_axis=shape_len - 1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.))) + + def forward(self, prev_out, out, process_cmd, dropout_rate=0.): + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add 
layer normalization + out = self._layer_norm(out) + elif cmd == "d": # add dropout + if dropout_rate: + out = fluid.layers.dropout( + out, + dropout_prob=dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + return out + + +class PositionwiseFeedForwardLayer(Layer): + def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate): + super(PositionwiseFeedForwardLayer, self).__init__(name_scope) + self._i2h = FC(name_scope=self.full_name(), + size=d_inner_hid, + num_flatten_dims=2, + act="relu") + self._h2o = FC(name_scope=self.full_name(), + size=d_hid, + num_flatten_dims=2) + self._dropout_rate = dropout_rate + + def forward(self, x): + hidden = self._i2h(x) + if self._dropout_rate: + hidden = fluid.layers.dropout( + hidden, + dropout_prob=self._dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + out = self._h2o(hidden) + return out + + +class MultiHeadAttentionLayer(Layer): + def __init__(self, + name_scope, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + gather_idx=None, + static_kv=False): + super(MultiHeadAttentionLayer, self).__init__(name_scope) + self._n_head = n_head + self._d_key = d_key + self._d_value = d_value + self._d_model = d_model + self._dropout_rate = dropout_rate + self._q_fc = FC(name_scope=self.full_name(), + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + self._k_fc = FC(name_scope=self.full_name(), + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + self._v_fc = FC(name_scope=self.full_name(), + size=d_value * n_head, + bias_attr=False, + num_flatten_dims=2) + self._proj_fc = FC(name_scope=self.full_name(), + size=self._d_model, + bias_attr=False, + num_flatten_dims=2) + + def forward(self, queries, keys, values, attn_bias): + # compute q ,k ,v + keys = queries if keys is None else keys + values = keys if values is None else values + + q = self._q_fc(queries) + k = self._k_fc(keys) + v = self._v_fc(values) + + # split head + reshaped_q = 
fluid.layers.reshape( + x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False) + transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) + reshaped_k = fluid.layers.reshape( + x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False) + transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) + reshaped_v = fluid.layers.reshape( + x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False) + transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) + + # scale dot product attention + product = fluid.layers.matmul( + x=transpose_q, + y=transpose_k, + transpose_y=True, + alpha=self._d_model**-0.5) + if attn_bias: + product += attn_bias + weights = fluid.layers.softmax(product) + if self._dropout_rate: + weights_droped = fluid.layers.dropout( + weights, + dropout_prob=self._dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + out = fluid.layers.matmul(weights_droped, transpose_v) + else: + out = fluid.layers.matmul(weights, transpose_v) + + # combine heads + if len(out.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) + final_out = fluid.layers.reshape( + x=trans_x, + shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], + inplace=False) + + # fc to output + proj_out = self._proj_fc(final_out) + return proj_out + + +class EncoderSubLayer(Layer): + def __init__(self, + name_scope, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(EncoderSubLayer, self).__init__(name_scope) + self._preprocess_cmd = preprocess_cmd + self._postprocess_cmd = postprocess_cmd + self._prepostprocess_dropout = prepostprocess_dropout + + self._preprocess_layer = PrePostProcessLayer(self.full_name(), + self._preprocess_cmd, 3) + self._multihead_attention_layer = MultiHeadAttentionLayer( + self.full_name(), d_key, d_value, d_model, 
n_head, + attention_dropout) + self._postprocess_layer = PrePostProcessLayer( + self.full_name(), self._postprocess_cmd, None) + self._preprocess_layer2 = PrePostProcessLayer(self.full_name(), + self._preprocess_cmd, 3) + self._positionwise_feed_forward = PositionwiseFeedForwardLayer( + self.full_name(), d_inner_hid, d_model, relu_dropout) + self._postprocess_layer2 = PrePostProcessLayer( + self.full_name(), self._postprocess_cmd, None) + + def forward(self, enc_input, attn_bias): + pre_process_multihead = self._preprocess_layer( + None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout) + attn_output = self._multihead_attention_layer(pre_process_multihead, + None, None, attn_bias) + attn_output = self._postprocess_layer(enc_input, attn_output, + self._postprocess_cmd, + self._prepostprocess_dropout) + pre_process2_output = self._preprocess_layer2( + None, attn_output, self._preprocess_cmd, + self._prepostprocess_dropout) + ffd_output = self._positionwise_feed_forward(pre_process2_output) + return self._postprocess_layer2(attn_output, ffd_output, + self._postprocess_cmd, + self._prepostprocess_dropout) + + +class EncoderLayer(Layer): + def __init__(self, + name_scope, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(EncoderLayer, self).__init__(name_scope) + self._preprocess_cmd = preprocess_cmd + self._encoder_sublayers = list() + self._prepostprocess_dropout = prepostprocess_dropout + self._n_layer = n_layer + self._preprocess_layer = PrePostProcessLayer(self.full_name(), + self._preprocess_cmd, 3) + for i in range(n_layer): + self._encoder_sublayers.append( + self.add_sublayer( + 'esl_%d' % i, + EncoderSubLayer( + self.full_name(), n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd))) + + def forward(self, enc_input, attn_bias): 
+ for i in range(self._n_layer): + enc_output = self._encoder_sublayers[i](enc_input, attn_bias) + enc_input = enc_output + + return self._preprocess_layer(None, enc_output, self._preprocess_cmd, + self._prepostprocess_dropout) + + +class PrepareEncoderDecoderLayer(Layer): + def __init__(self, + name_scope, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate, + word_emb_param_name=None, + pos_enc_param_name=None): + super(PrepareEncoderDecoderLayer, self).__init__(name_scope) + self._src_max_len = src_max_len + self._src_emb_dim = src_emb_dim + self._src_vocab_size = src_vocab_size + self._dropout_rate = dropout_rate + self._input_emb = Embedding( + name_scope=self.full_name(), + size=[src_vocab_size, src_emb_dim], + padding_idx=0, + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) + + if pos_enc_param_name is pos_enc_param_names[0]: + pos_inp = pos_inp1 + else: + pos_inp = pos_inp2 + self._pos_emb = Embedding( + name_scope=self.full_name(), + size=[self._src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, + initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), + trainable=False)) + + # use in dygraph_mode to fit different length batch + # self._pos_emb._w = to_variable( + # position_encoding_init(self._src_max_len, self._src_emb_dim)) + + def forward(self, src_word, src_pos): + src_word_emb = self._input_emb(src_word) + src_word_emb = fluid.layers.scale( + x=src_word_emb, scale=self._src_emb_dim**0.5) + # # TODO change this to fit dynamic length input + src_pos_emb = self._pos_emb(src_pos) + src_pos_emb.stop_gradient = True + enc_input = src_word_emb + src_pos_emb + return fluid.layers.dropout( + enc_input, + dropout_prob=self._dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) if self._dropout_rate else enc_input + + +class WrapEncoderLayer(Layer): + def __init__(self, name_cope, src_vocab_size, max_length, n_layer, n_head, + 
d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd, weight_sharing): + """ + The wrapper assembles together all needed layers for the encoder. + """ + super(WrapEncoderLayer, self).__init__(name_cope) + + self._prepare_encoder_layer = PrepareEncoderDecoderLayer( + self.full_name(), + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + word_emb_param_name=word_emb_param_names[0], + pos_enc_param_name=pos_enc_param_names[0]) + self._encoder = EncoderLayer( + self.full_name(), n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd) + + def forward(self, enc_inputs): + src_word, src_pos, src_slf_attn_bias = enc_inputs + enc_input = self._prepare_encoder_layer(src_word, src_pos) + enc_output = self._encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class DecoderSubLayer(Layer): + def __init__(self, + name_scope, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + cache=None, + gather_idx=None): + super(DecoderSubLayer, self).__init__(name_scope) + self._postprocess_cmd = postprocess_cmd + self._preprocess_cmd = preprocess_cmd + self._prepostprcess_dropout = prepostprocess_dropout + self._pre_process_layer = PrePostProcessLayer(self.full_name(), + preprocess_cmd, 3) + self._multihead_attention_layer = MultiHeadAttentionLayer( + self.full_name(), + d_key, + d_value, + d_model, + n_head, + attention_dropout, + cache=cache, + gather_idx=gather_idx) + self._post_process_layer = PrePostProcessLayer(self.full_name(), + postprocess_cmd, None) + self._pre_process_layer2 = PrePostProcessLayer(self.full_name(), + preprocess_cmd, 3) + self._multihead_attention_layer2 = MultiHeadAttentionLayer( + self.full_name(), + d_key, + d_value, + d_model, + n_head, + attention_dropout, + 
cache=cache, + gather_idx=gather_idx, + static_kv=True) + self._post_process_layer2 = PrePostProcessLayer(self.full_name(), + postprocess_cmd, None) + self._pre_process_layer3 = PrePostProcessLayer(self.full_name(), + preprocess_cmd, 3) + self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer( + self.full_name(), d_inner_hid, d_model, relu_dropout) + self._post_process_layer3 = PrePostProcessLayer(self.full_name(), + postprocess_cmd, None) + + def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias): + pre_process_rlt = self._pre_process_layer( + None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout) + slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None, + None, slf_attn_bias) + slf_attn_output_pp = self._post_process_layer( + dec_input, slf_attn_output, self._postprocess_cmd, + self._prepostprcess_dropout) + pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp, + self._preprocess_cmd, + self._prepostprcess_dropout) + enc_attn_output_pp = self._multihead_attention_layer2( + pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias) + enc_attn_output = self._post_process_layer2( + slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd, + self._prepostprcess_dropout) + pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output, + self._preprocess_cmd, + self._prepostprcess_dropout) + ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3) + dec_output = self._post_process_layer3(enc_attn_output, ffd_output, + self._postprocess_cmd, + self._prepostprcess_dropout) + return dec_output + + +class DecoderLayer(Layer): + def __init__(self, + name_scope, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + caches=None, + gather_idx=None): + super(DecoderLayer, self).__init__(name_scope) + self._pre_process_layer = PrePostProcessLayer(self.full_name(), 
+ preprocess_cmd, 3) + self._decoder_sub_layers = list() + self._n_layer = n_layer + self._preprocess_cmd = preprocess_cmd + self._prepostprocess_dropout = prepostprocess_dropout + for i in range(n_layer): + self._decoder_sub_layers.append( + self.add_sublayer( + 'dsl_%d' % i, + DecoderSubLayer( + self.full_name(), + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + cache=None if caches is None else caches[i], + gather_idx=gather_idx))) + + def forward(self, dec_input, enc_output, dec_slf_attn_bias, + dec_enc_attn_bias): + for i in range(self._n_layer): + tmp_dec_output = self._decoder_sub_layers[i]( + dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias) + dec_input = tmp_dec_output + + dec_output = self._pre_process_layer(None, tmp_dec_output, + self._preprocess_cmd, + self._prepostprocess_dropout) + return dec_output + + +class WrapDecoderLayer(Layer): + def __init__(self, + name_scope, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + caches=None, + gather_idx=None): + """ + The wrapper assembles together all needed layers for the encoder. 
+ """ + super(WrapDecoderLayer, self).__init__(name_scope) + + self._prepare_decoder_layer = PrepareEncoderDecoderLayer( + self.full_name(), + trg_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + word_emb_param_name=word_emb_param_names[1], + pos_enc_param_name=pos_enc_param_names[1]) + self._decoder_layer = DecoderLayer( + self.full_name(), + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + caches=caches, + gather_idx=gather_idx) + self._weight_sharing = weight_sharing + if not weight_sharing: + self._fc = FC(self.full_name(), + size=trg_vocab_size, + bias_attr=False) + + def forward(self, dec_inputs=None, enc_output=None): + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs + dec_input = self._prepare_decoder_layer(trg_word, trg_pos) + dec_output = self._decoder_layer(dec_input, enc_output, + trg_slf_attn_bias, trg_src_attn_bias) + + dec_output_reshape = fluid.layers.reshape( + dec_output, shape=[-1, dec_output.shape[-1]], inplace=False) + + if self._weight_sharing: + predict = fluid.layers.matmul( + x=dec_output_reshape, + y=self._prepare_decoder_layer._input_emb._w, + transpose_y=True) + else: + predict = self._fc(dec_output_reshape) + + if dec_inputs is None: + # Return probs for independent decoder program. 
+ predict_out = fluid.layers.softmax(predict) + return predict_out + return predict + + +class TransFormer(Layer): + def __init__(self, + name_scope, + src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + label_smooth_eps, + use_py_reader=False, + is_test=False): + super(TransFormer, self).__init__(name_scope) + self._label_smooth_eps = label_smooth_eps + self._trg_vocab_size = trg_vocab_size + if weight_sharing: + assert src_vocab_size == trg_vocab_size, ( + "Vocabularies in source and target should be same for weight sharing." + ) + self._wrap_encoder_layer = WrapEncoderLayer( + self.full_name(), src_vocab_size, max_length, n_layer, n_head, + d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, + weight_sharing) + self._wrap_decoder_layer = WrapDecoderLayer( + self.full_name(), trg_vocab_size, max_length, n_layer, n_head, + d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, + weight_sharing) + + if weight_sharing: + self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w + + def forward(self, enc_inputs, dec_inputs, label, weights): + enc_output = self._wrap_encoder_layer(enc_inputs) + predict = self._wrap_decoder_layer(dec_inputs, enc_output) + if self._label_smooth_eps: + label_out = fluid.layers.label_smooth( + label=fluid.layers.one_hot( + input=label, depth=self._trg_vocab_size), + epsilon=self._label_smooth_eps) + + cost = fluid.layers.softmax_with_cross_entropy( + logits=predict, + label=label_out, + soft_label=True if self._label_smooth_eps else False) + weighted_cost = cost * weights + sum_cost = fluid.layers.reduce_sum(weighted_cost) + token_num = 
fluid.layers.reduce_sum(weights) + token_num.stop_gradient = True + avg_cost = sum_cost / token_num + return sum_cost, avg_cost, predict, token_num + class TestDygraphTransformerSortGradient(unittest.TestCase): def test_transformer_sort_gradient_float32(self): diff --git a/python/requirements.txt b/python/requirements.txt index c4ced49be3332edd43adccd748274fbbaaf06777..d1e34b524632e37f5cd7fbae5fb6258787d8de91 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -16,3 +16,4 @@ funcsigs pyyaml decorator prettytable +objgraph