// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/backward.h"

#include "paddle/fluid/eager/general_grad.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"

namespace egr {

std::unordered_map<GradNodeBase*, int> getInDegreeMap(
23
    const std::deque<GradNodeBase*>& init_queue) {
24
  // Calculate in_degree for each node
25 26
  // We can completely remove this pass, if in_degree were set during forward
  // pass
27 28 29
  std::unordered_map<GradNodeBase*, int> node_in_degree_map;

  // Copy nodes
30
  std::deque<GradNodeBase*> queue = init_queue;
31 32 33 34 35
  std::unordered_set<GradNodeBase*> visited;

  // Visit each node exactly once in any order
  while (!queue.empty()) {
    GradNodeBase* node = queue.front();
36
    queue.pop_front();
37 38 39 40 41 42

    if (visited.count(node)) {
      continue;
    }
    visited.insert(node);

43 44 45 46 47
    PADDLE_ENFORCE_NOT_NULL(
        node,
        paddle::platform::errors::Fatal(
            "We got null node when we traverse the backward graph, and this "
            "should not happened please check your code and contact us."));
48
    // Find and append next nodes
49 50 51 52 53
    const paddle::small_vector<std::vector<GradSlotMeta>, kSlotSmallVectorSize>&
        metas = node->OutputMeta();
    for (const auto& meta_list : metas) {
      for (const GradSlotMeta& meta : meta_list) {
        const auto& edge = meta.GetEdge();
54 55 56 57 58 59 60 61 62 63
        GradNodeBase* next_node = edge.GetMutableGradNode().get();
        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
        if (!next_node) continue;

        // Update in_degree
        if (!node_in_degree_map.count(next_node))
          node_in_degree_map[next_node] = 0;
        node_in_degree_map[next_node]++;
64
        queue.push_back(next_node);
65 66 67
      }
    }
  }
68

69
  return node_in_degree_map;
70 71 72 73 74 75
}

// Raises a Fatal error when `node` reports that its TensorWrappers have
// already been cleared. The error text points at the usual cause: running
// backward twice over the same subgraph without retain_graph=True.
void EnforceGradNodeHasInput(GradNodeBase* node) {
  VLOG(6) << "Running in EnforceGradNodeHasInput";
  PADDLE_ENFORCE_NE(
      node->IsTensorWrappersCleared(),
      true,
      paddle::platform::errors::Fatal(
          "The TensorWrappers of %s do not exist. This may be because:\n"
          "You calculate backward twice for the same subgraph without "
          "setting retain_graph=True. Please set retain_graph=True in the "
          "first backward/grad call.\n",
          node->name()));
}

// Verifies that no tensor appears more than once in `inputs`.
//
// Tensors are identified by the AutogradMeta pointer returned by
// EagerUtils::unsafe_autograd_meta; two entries that yield the same pointer
// are treated as duplicates and an AlreadyExists error is raised.
//
// @param inputs    Tensors to check.
// @param is_input  Only selects the word used in the error message:
//                  "inputs" when true, "outputs" otherwise.
void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
                    bool is_input) {
  std::unordered_set<AutogradMeta*> visited_ins;
  const std::string msg = is_input ? "inputs" : "outputs";
  // Iterate by const reference: the tensors are only inspected, so there is
  // no reason to copy each one per iteration.
  for (const auto& in : inputs) {
    AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
    PADDLE_ENFORCE_EQ(
        visited_ins.count(auto_grad_meta),
        0,
        paddle::platform::errors::AlreadyExists(
            "%s contain duplicate tensor %s, please check %s carefully.",
            msg,
            in.name(),
            msg));
    visited_ins.insert(auto_grad_meta);
  }
}

// Out-of-class definition of the static member GeneralGrad::general_grad_,
// initialized with a heap-allocated GeneralGrad that this file never deletes.
// NOTE(review): presumably this is the object handed out by
// GeneralGrad::Instance() -- confirm against general_grad.h.
GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();

// Runs the backward pass that starts at the grad nodes of `tensors`.
//
// The pass has two phases:
//   1. Initialization: for every tensor that has autograd meta, a grad node
//      and stop_gradient == false, a GradTensorHolder is created for its grad
//      node and seeded either with the matching entry of `grad_tensors` or
//      with ones; the grad node is pushed onto the work queue.
//   2. Topological visit: nodes are popped from the queue and executed, their
//      outputs are accumulated into the holders of the nodes they point to,
//      and a successor is queued once its in-degree drops to zero.
//
// @param tensors       Forward outputs to differentiate.
// @param grad_tensors  Initial gradients; must be empty or have the same size
//                      as `tensors`. An uninitialized entry means "use ones".
// @param retain_graph  When false, each node's TensorWrappers are cleared
//                      right after the node has run.
// @param create_graph  Forwarded to every grad node, to
//                      GradTensorHolder::add and to GetResults.
// @param inputs        When non-empty, switches to "general grad" mode: grad
//                      nodes are copied through GeneralGrad and the results
//                      for `inputs` are returned.
// @param allow_unused  Forwarded to GeneralGrad::GetResults.
// @param no_grad_vars  Forwarded to GeneralGrad::PreparedForGeneralGrad.
// @return An empty vector when `inputs` is empty, otherwise the value of
//         GeneralGrad::Instance().GetResults(inputs, allow_unused,
//         create_graph).
std::vector<paddle::experimental::Tensor> RunBackward(
    const std::vector<paddle::experimental::Tensor>& tensors,  // output
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph,
    bool create_graph = false,
    const std::vector<paddle::experimental::Tensor>& inputs = {},
    bool allow_unused = false,
    const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
  VLOG(3) << "Start Backward";

  // *Gradient Hook should happen at node-level
  // *Inplace version check should perform at node-level
  // *Cross-batch accumulation happens at forward pass

  // GeneralGrad
  const bool is_general_grad = !inputs.empty();
  if (is_general_grad) GeneralGrad::Instance().Clear();

  /* --- Initialization --- */
  // 1. Init queue with starting nodes
  // 2. Prepare initial input buffers
  std::deque<GradNodeBase*> queue;
  std::deque<GradNodeBase*> orig_queue;
  std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
      node_input_buffers_dict;
  for (size_t i = 0; i < tensors.size(); i++) {
    const paddle::experimental::Tensor& tensor = tensors[i];

    AutogradMeta* auto_grad_meta = EagerUtils::nullable_autograd_meta(tensor);
    if (auto_grad_meta == nullptr) {
      VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
                 "stop_gradient=True: "
              << tensor.name();
      continue;
    }
    // Get grad input info from target tensors
    auto input_info = auto_grad_meta->OutRankInfo();

    VLOG(2) << "Out Rank of Tensor is slot: " << input_info.first
            << ", rank: " << input_info.second;
    // Get target GradNodeBase from target tensors
    auto shared_grad_node = auto_grad_meta->GetMutableGradNode();

    if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
        auto_grad_meta->StopGradient()) {
      VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
                 "stop_gradient=True: "
              << tensor.name();
      continue;
    }

    // TODO(zhanlve): Copy and Modify GradNode if is_general_grad
    GradNodeBase* grad_node = shared_grad_node.get();
    if (is_general_grad) {
      // Save orig grad node
      orig_queue.push_back(grad_node);

      // Replace grad_node with copied grad_node
      grad_node = GeneralGrad::Instance().CopyGradNode(shared_grad_node);

      // Record potential startup grad node
      GeneralGrad::Instance().GetPotentialStartupNodes()->insert(grad_node);
    }

    // Prepare GradTensorHolder (one per grad node, created on first use)
    auto input_buffer_iter = node_input_buffers_dict.find(grad_node);
    if (input_buffer_iter == node_input_buffers_dict.end()) {
      VLOG(6) << "Create Value for grad input tensor " << i
              << " of grad node: " << grad_node->name();
      input_buffer_iter =
          node_input_buffers_dict
              .emplace(grad_node,
                       std::make_unique<GradTensorHolder>(
                           grad_node->InputMeta()))
              .first;
    }

    // Validate the size contract BEFORE indexing grad_tensors[i]. Indexing
    // first would read out of bounds whenever grad_tensors is non-empty but
    // shorter than tensors.
    if (!grad_tensors.empty()) {
      PADDLE_ENFORCE(
          grad_tensors.size() == tensors.size(),
          paddle::platform::errors::Fatal(
              "Detected size mismatch between tensors and grad_tensors. "
              "grad_tensors should either have "
              "size = 0 or same size as tensors."));
    }
    const bool copy_from_grad_t =
        !grad_tensors.empty() && grad_tensors[i].initialized();
    if (copy_from_grad_t) {
      // Feed given tensor if it's provided
      VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor";

      // Deep copy
      input_buffer_iter->second->CopyValueFromTensor(
          input_info.first, input_info.second, grad_tensors[i]);
    } else {
      VLOG(6) << "Fill grad input tensor " << i << " with 1.0";
      // Initialize tensor with 1.0
      // Forward Tensor "tensor" is passed to indicate tensortype, datatype and
      // dims
      // GradTensorHolder will initialize another tensor with same tensortype,
      // datatype and dims but filled with 1.0
      input_buffer_iter->second->CopyValueFromTensor(
          input_info.first, input_info.second, tensor, /*fill_one=*/true);
    }

    // Prepare queue, potential startup_nodes
    queue.push_back(grad_node);
  }

  if (is_general_grad) {
    // Prepare several vital preprocess for GeneralGrad
    GeneralGrad::Instance().PreparedForGeneralGrad(
        inputs, no_grad_vars, orig_queue, &queue, node_input_buffers_dict);
  }

  VLOG(6) << "Update In degree Map for backward";
  // 3. Compute in_degree for each node
  std::unordered_map<GradNodeBase*, int> node_in_degree_map =
      getInDegreeMap(queue);

  VLOG(3) << "Startup_ops's size is " << queue.size();

  /* --- Topological Visit --- */
  // 1. Pop queue
  // 2. Run node
  //    |- Check and capture target result
  //    |- node(grads)
  //    |- Prepare for next node
  // 3. Update queue
  VLOG(3) << "Run Backward";
  while (!queue.empty()) {
    GradNodeBase* node = queue.front();
    VLOG(3) << "Running GradNode:" << node->name() << " addr:" << node;

    paddle::platform::RecordEvent node_record_event(
        std::string((*node).name()),
        paddle::platform::TracerEventType::Operator,
        1);

    // A node that still has unprocessed incoming edges is dropped from the
    // queue without running, unless it is the only entry left.
    const bool still_pending =
        queue.size() > 1 && node_in_degree_map[node] != 0;
    queue.pop_front();
    if (still_pending) {
      continue;
    }

    // Run node: This is where Hook happens
    auto node_input_buffer_iter = node_input_buffers_dict.find(node);
    PADDLE_ENFORCE_NE(
        node_input_buffer_iter,
        node_input_buffers_dict.end(),
        paddle::platform::errors::Fatal(
            "Unable to find next node in the GradTensorHolder \n"
            "Trying to run Node without configuring its GradTensorHolder."));

    std::unique_ptr<GradTensorHolder> node_input_buffer =
        std::move(node_input_buffer_iter->second);

    // Check input
    EnforceGradNodeHasInput(node);

    VLOG(6) << "Run Backward Kernel with GradTensorHolder.";
    // Run Pre Backward Node and get outputs
    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                         kSlotSmallVectorSize>
        grad_output_tensors = (*node)(
            node_input_buffer->Buffers(), create_graph, is_general_grad);

    // is_general_grad already implies a non-empty `inputs`.
    if (is_general_grad) {
      GeneralGrad::Instance().SetResultForEnddingNodes(grad_output_tensors,
                                                       node);
    }

    // retain_grad or not
    if (!retain_graph) {
      VLOG(6)
          << "retain_graph is false, need to clear the TensorWrapper of nodes.";
      node->ClearTensorWrappers();
    }

    // TODO(jiabin): Should we erase it or find a more efficient way.
    node_input_buffers_dict.erase(node_input_buffer_iter);

    // Prepare GradTensorHolder for next node
    const paddle::small_vector<std::vector<GradSlotMeta>, kSlotSmallVectorSize>&
        metas = node->OutputMeta();
    PADDLE_ENFORCE(metas.size() == grad_output_tensors.size() || metas.empty(),
                   paddle::platform::errors::Fatal(
                       "Number of edges should be either empty ( for leaf node "
                       ") or the same as number of output grad tensors, but we "
                       "got edges size is: %d, grad_output size is: %d",
                       metas.size(),
                       grad_output_tensors.size()));

    for (size_t i = 0; i < metas.size(); i++) {
      for (size_t j = 0; j < metas[i].size(); j++) {
        const Edge& edge = metas[i][j].GetEdge();
        if (!edge.IsInitialized()) {
          continue;
        }
        auto edge_rank = edge.GetEdgeRankInfo();
        // Since we make edge has as same rank as bwd outputs, we indexing them
        // with the same rank(i, j)
        auto next_node_shared = edge.GetMutableGradNode();

        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
        // The null check must precede the log below, which dereferences the
        // pointer.
        if (!next_node_shared || !next_node_shared.get()) {
          continue;
        }
        VLOG(3) << "Node: " << node->name() << " addr:" << node
                << ", Found pending node: " << next_node_shared->name()
                << " addr: " << next_node_shared.get();
        if (grad_output_tensors[i].empty()) {
          continue;
        }

        PADDLE_ENFORCE_LT(
            j,
            grad_output_tensors[i].size(),
            paddle::platform::errors::Fatal(
                "Rank of grad_output_tensors should be less than "
                "grad_output_tensors[i].size(), which is: %d. This error may "
                "indicate autoprune or autograd api error. ",
                grad_output_tensors[i].size()));
        paddle::experimental::Tensor& grad_output_tensor =
            grad_output_tensors[i][j];

        if ((!grad_output_tensor.defined() ||
             !grad_output_tensor.initialized())) {
          VLOG(6) << "We get grad_output_tensor with slot: " << i
                  << ", rank: " << j << " as uninitialized or undefined tensor";
        }

        VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i
                << ", rank: " << j
                << " 's name is: " << grad_output_tensor.name();

        // Look up (or lazily create) the holder that collects the incoming
        // gradients of the successor node.
        GradNodeBase* next_node = next_node_shared.get();
        auto next_buffer_iter = node_input_buffers_dict.find(next_node);
        if (next_buffer_iter == node_input_buffers_dict.end()) {
          const auto& input_meta = next_node->InputMeta();
          auto grad_tensor_holder =
              std::make_unique<GradTensorHolder>(input_meta);
          VLOG(6) << "Construct GradTensorHolder for grad node: "
                  << next_node->name();
          next_buffer_iter =
              node_input_buffers_dict
                  .emplace(next_node, std::move(grad_tensor_holder))
                  .first;
        }

        VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
                << ", rank: " << edge_rank.second;

        next_buffer_iter->second->add(edge_rank.first,
                                      edge_rank.second,
                                      grad_output_tensor,
                                      create_graph);

        // Update queue
        int& next_in_degree = node_in_degree_map[next_node];
        --next_in_degree;
        VLOG(6) << next_node->name() << " ref_cnt is: " << next_in_degree;

        PADDLE_ENFORCE(
            next_in_degree >= 0,
            paddle::platform::errors::Fatal(
                "Detected in-degree value smaller than zero. For Node: %s. "
                "Node's in-degree cannot be negative.",
                next_node->name()));

        // A successor becomes runnable once all of its incoming edges have
        // been processed; in general-grad mode it must additionally be one of
        // the needed nodes.
        const bool ready_to_run =
            next_in_degree == 0 &&
            (!is_general_grad ||
             GeneralGrad::Instance().IsNeededNodes(next_node));
        if (ready_to_run) {
          // Accumulation nodes go to the front of the queue, every other
          // node to the back.
          if (dynamic_cast<egr::GradNodeAccumulation*>(next_node)) {
            queue.push_front(next_node);
          } else {
            queue.push_back(next_node);
          }
        }
      }
    }
  }

  VLOG(6) << "Run Backward Final hook size: "
          << egr::Controller::Instance().FinalBackwardHooks().size();
  for (auto& hook : egr::Controller::Instance().FinalBackwardHooks()) {
    (*hook)();
  }
  egr::Controller::Instance().ClearFinalBackwardHooks();
  if (!is_general_grad) return {};
  return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph);
}

void Backward(
396
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
397 398
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph) {
399
  VLOG(3) << "Run in Backward";
400
  paddle::platform::RecordEvent backward_record_event(
401
      "backward", paddle::platform::TracerEventType::UserDefined, 1);
402
  RunBackward(tensors, grad_tensors, retain_graph);
J
Jiabin Yang 已提交
403
  phi::autotune::AutoTuneStatus::Instance().Update();
404 405 406
}

std::vector<paddle::experimental::Tensor> Grad(
407
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
408 409
    const std::vector<paddle::experimental::Tensor>& inputs,
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
410 411 412 413
    bool retain_graph,
    bool create_graph,
    bool only_inputs,
    bool allow_unused,
414
    const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
415
  VLOG(3) << "Run in Grad";
416 417 418 419

  DuplicateCheck(inputs, true /* is_input */);
  DuplicateCheck(tensors, false /* is_input */);

420 421 422 423 424 425 426
  return RunBackward(tensors,
                     grad_tensors,
                     retain_graph,
                     create_graph,
                     inputs,
                     allow_unused,
                     no_grad_vars);
427
}
428
}  // namespace egr