// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <deque>
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/memory/allocation/allocator.h"

// Filter params without grads in the global block. In this case, we
// tag their AutogradMeta with stop_gradient = True to avoid faults from
// the reducer while training on multiple cards.
static void clear_no_grad_edges(const std::vector<paddle::Tensor>& params,
                                const paddle::framework::BlockDesc* block_desc,
                                egr::GradNodeBase* grad_node,
                                size_t slot_id) {
  for (size_t i = 0; i < params.size(); ++i) {
    auto p_grad_name = paddle::framework::GradVarName(params[i].name());
    if (!block_desc->HasVar(p_grad_name)) {
      VLOG(3) << "clear edge of " << p_grad_name;
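      // Detach this param's grad edge at (slot_id, i) so the reducer
      // never waits on a gradient that will not be produced.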
      grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear();
    }
  }
}

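// Same as clear_no_grad_edges, but for programs whose forward and backward
// parts live in separate blocks: a param's grad edge is kept only if its
// grad var appears in either block.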
static void clear_no_grad_edges_with_partial_block(
    const std::vector<paddle::Tensor>& params,
    const paddle::framework::BlockDesc* forward_block_desc,
    const paddle::framework::BlockDesc* backward_block_desc,
    egr::GradNodeBase* grad_node,
    size_t slot_id) {
  for (size_t i = 0; i < params.size(); ++i) {
    auto p_grad_name = paddle::framework::GradVarName(params[i].name());
    if (!forward_block_desc->HasVar(p_grad_name) &&
        !backward_block_desc->HasVar(p_grad_name)) {
      VLOG(3) << "clear edge of " << p_grad_name;
      grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear();
    }
  }
}

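// Release the memory of forward outputs that the backward block never
// reads: their memory holders are moved into `garbages` and freed when it
// goes out of scope.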
static void clear_unused_out_var_in_backward(
    const std::vector<paddle::Tensor*>& out,
    const paddle::framework::BlockDesc* backward_block,
    paddle::framework::Scope* scope) {
  std::deque<std::shared_ptr<paddle::memory::Allocation>> garbages;
  for (auto* out_tensor : out) {
    if (!backward_block->HasVar(out_tensor->name())) {
      auto var = scope->FindVar(out_tensor->name());
      if (var == nullptr) {
        continue;
      }
      if (var->IsType<phi::DenseTensor>()) {
        garbages.emplace_back(
            var->GetMutable<phi::DenseTensor>()->MoveMemoryHolder());
      }
    }
  }
  // `garbages` is destroyed here, releasing the moved memory holders.
}

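// Replace forward inputs that the backward block never uses with empty fake
// tensors, so the grad node does not capture (and keep alive) their buffers.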
static std::vector<paddle::Tensor> filter_unused_input_var_in_backward(
    const std::vector<paddle::Tensor>& x,
    const std::vector<std::string>& x_names,
    const paddle::framework::BlockDesc* backward_block) {
  auto filter_x = std::vector<paddle::Tensor>(x);
  for (size_t i = 0; i < x.size(); i++) {
    if (!backward_block->HasVar(x_names[i])) {
      auto fake = paddle::Tensor(std::make_shared<phi::DenseTensor>());
      fake.set_name(paddle::framework::kFakeVarName);
      filter_x[i] = fake;
    }
  }
  return filter_x;
}

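// Eager (dygraph-to-static) entry point: runs the program described by
// `attrs` and, if any input or param requires grad, builds a
// GradNodeRunProgram so the backward block can be replayed later.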
inline void run_program_ad_func(
    const std::vector<paddle::Tensor>& x,
    const std::vector<paddle::Tensor>& params,
    std::vector<paddle::Tensor*>& out,                   // NOLINT
    std::vector<paddle::framework::Scope*>& step_scope,  // NOLINT
    std::vector<paddle::Tensor*>& dout,                  // NOLINT
    const paddle::framework::AttributeMap& attrs) {
  // Prepare Autograd Meta
  auto deref_out = details::DereferenceTensors(out);
  std::vector<egr::AutogradMeta*> p_autograd_x =
      egr::EagerUtils::nullable_autograd_meta(x);
  std::vector<egr::AutogradMeta*> p_autograd_params =
      egr::EagerUtils::nullable_autograd_meta(params);
  std::vector<egr::AutogradMeta*> p_autograd_outs =
      egr::EagerUtils::nullable_autograd_meta(deref_out);

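  // The grad node below is built only when grad tracing is enabled and at
  // least one input or param requires grad.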
  bool trace_backward = egr::Controller::Instance().HasGrad();
  bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
      trace_backward, &p_autograd_x, &p_autograd_params);

  VLOG(2) << "start run run_program with require_any_grad = "
          << require_any_grad;
  // Call forward function
  // If require_any_grad is false, don't save any intermediate vars.
  RunProgramAPI(x, params, out, step_scope, dout, require_any_grad, attrs);
  VLOG(2) << "start run run_program grad";

  if (require_any_grad) {
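    // Names of the forward inputs, used below to match vars in the
    // forward/backward blocks.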
    auto x_names =
        PADDLE_GET_CONST(std::vector<std::string>, attrs.at("x_names"));

    egr::EagerUtils::PassStopGradient(false, &p_autograd_outs);
    // Create GradOpNode (1 means [out_grad], 2 means [x_grad, params_grad])
    auto grad_node = std::make_shared<GradNodeRunProgram>(1, 2);

    // Set Attributes
    grad_node->SetAttrMap(attrs);

    auto* forward_global_block = PADDLE_GET_CONST(
        paddle::framework::BlockDesc*, attrs.at("forward_global_block"));
    auto* backward_global_block = PADDLE_GET_CONST(
        paddle::framework::BlockDesc*, attrs.at("backward_global_block"));
    // Clear unused x vars
    auto filter_x =
        filter_unused_input_var_in_backward(x, x_names, backward_global_block);
    // Set TensorWrappers
    grad_node->SetFwdX(filter_x);
    // Clear unused out vars
    clear_unused_out_var_in_backward(out, backward_global_block, step_scope[0]);

    grad_node->SetFwdParams(params);
    grad_node->SetStepScope(step_scope);

    // Set grad out rank the same as the fwd inputs, and set stop_gradient
    // info for bwd.
    // NOTE(@xiongkun): Not every tensor in x (a list of tensors) requires a
    // gradient. For example, if x[1] is not used by any output, x[1] is
    // ignored.

    std::vector<const paddle::Tensor*> x_require_grad;
    for (size_t i = 0; i < x.size(); ++i) {
      auto& name = x_names[i];
      if (forward_global_block->HasVar(name) ||
          backward_global_block->HasVar(name)) {
        x_require_grad.push_back(&x[i]);
      }
    }

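    // Slot 0 holds the grads w.r.t. x, slot 1 the grads w.r.t. params,
    // matching the GradNodeRunProgram constructor above.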
    grad_node->SetGradOutMeta(x_require_grad, /*slot id*/ 0);
    grad_node->SetGradOutMeta(params, /*slot id*/ 1);

    VLOG(2) << "clear_no_grad_edges.";
    clear_no_grad_edges_with_partial_block(params,
                                           forward_global_block,
                                           backward_global_block,
                                           grad_node.get(),
                                           /*slot id*/ 1);

    grad_node->SetGradInMeta(deref_out, 0);

    egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0);

    // Set history for the outputs: record the current grad node for them.
    egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node);
  }
}
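
// A minimal usage sketch (hypothetical call site; in practice this function
// is invoked from generated dygraph code, and `attrs` must already carry
// "x_names", "forward_global_block" and "backward_global_block"):
//
//   std::vector<paddle::Tensor> x = {input};
//   std::vector<paddle::Tensor> params = {weight};
//   std::vector<paddle::Tensor*> out = {&output};
//   std::vector<paddle::framework::Scope*> step_scope = {&scope};
//   std::vector<paddle::Tensor*> dout = {};
//   run_program_ad_func(x, params, out, step_scope, dout, attrs);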