backward.cc 16.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/backward.h"
16

17
#include "paddle/fluid/eager/general_grad.h"
J
Jiabin Yang 已提交
18
#include "paddle/phi/kernels/autotune/switch_autotune.h"
19 20 21

namespace egr {

22
std::unordered_map<GradNodeBase*, int> getInDegreeMap(
23
    const std::deque<GradNodeBase*>& init_queue) {
24
  // Calculate in_degree for each node
25 26
  // We can completely remove this pass, if in_degree were set during forward
  // pass
27 28 29
  std::unordered_map<GradNodeBase*, int> node_in_degree_map;

  // Copy nodes
30
  std::deque<GradNodeBase*> queue = init_queue;
31 32 33 34 35
  std::unordered_set<GradNodeBase*> visited;

  // Visit each node exactly once in any order
  while (!queue.empty()) {
    GradNodeBase* node = queue.front();
36
    queue.pop_front();
37 38 39 40 41 42

    if (visited.count(node)) {
      continue;
    }
    visited.insert(node);

43 44 45 46 47
    PADDLE_ENFORCE_NOT_NULL(
        node,
        paddle::platform::errors::Fatal(
            "We got null node when we traverse the backward graph, and this "
            "should not happened please check your code and contact us."));
48
    // Find and append next nodes
49 50 51 52 53
    const paddle::small_vector<std::vector<GradSlotMeta>, kSlotSmallVectorSize>&
        metas = node->OutputMeta();
    for (const auto& meta_list : metas) {
      for (const GradSlotMeta& meta : meta_list) {
        const auto& edge = meta.GetEdge();
54 55 56 57 58 59 60 61 62 63
        GradNodeBase* next_node = edge.GetMutableGradNode().get();
        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
        if (!next_node) continue;

        // Update in_degree
        if (!node_in_degree_map.count(next_node))
          node_in_degree_map[next_node] = 0;
        node_in_degree_map[next_node]++;
64
        queue.push_back(next_node);
65 66 67
      }
    }
  }
68

69
  return node_in_degree_map;
70 71 72 73 74 75
}

// Guards a grad node before it is executed: raises a Fatal error when
// node->IsTensorWrappersCleared() reports true, i.e. the TensorWrappers the
// node needs as input are gone. `node` is dereferenced unconditionally, so it
// must not be null.
void EnforceGradNodeHasInput(GradNodeBase* node) {
  VLOG(6) << "Running in EnforceGradNodeHasInput";

  // The enforce arguments are kept verbatim on purpose: the macro may embed
  // the expression text in the raised error.
  PADDLE_ENFORCE_NE(
      node->IsTensorWrappersCleared(),
      true,
      paddle::platform::errors::Fatal(
          "The TensorWrappers of %s do not exist. This may be because:\n"
          "You calculate backward twice for the same subgraph without "
          "setting retain_graph=True. Please set retain_graph=True in the "
          "first backward/grad call.\n",
          node->name()));
}

86 87 88 89 90 91 92
// Verifies that no tensor occurs more than once in `inputs`.
//
// Two tensors are considered the same when
// EagerUtils::unsafe_autograd_meta() yields the same AutogradMeta pointer for
// both. On the first duplicate an AlreadyExists error is raised that names
// the offending tensor.
//
// @param inputs    tensors to check.
// @param is_input  only selects the wording of the error message:
//                  true -> "inputs", false -> "outputs".
void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
                    bool is_input) {
  std::unordered_set<AutogradMeta*> visited_ins;
  const std::string msg = is_input ? "inputs" : "outputs";
  // Iterate by const reference: the tensor is only inspected, so copying it
  // on every iteration is unnecessary.
  for (const auto& in : inputs) {
    AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
    PADDLE_ENFORCE_EQ(
        visited_ins.count(auto_grad_meta),
        0,
        paddle::platform::errors::AlreadyExists(
            "%s contain duplicate tensor %s, please check %s carefully.",
            msg,
            in.name(),
            msg));
    visited_ins.insert(auto_grad_meta);
  }
}

104 105
// Out-of-class definition of GeneralGrad's static instance pointer. The
// object is heap-allocated during static initialization; no matching delete
// appears in this file.
// NOTE(review): presumably this is the object GeneralGrad::Instance() hands
// out -- confirm in general_grad.h.
GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();

106 107 108
// Runs the backward pass starting from `tensors` and shared by Backward() and
// Grad().
//
// @param tensors       starting (output) tensors. Tensors without autograd
//                      meta, without a grad node, or with StopGradient() set
//                      are skipped.
// @param grad_tensors  initial gradients. Must be empty or have the same size
//                      as `tensors`; an initialized grad_tensors[i] is copied
//                      in as the initial gradient of tensors[i], otherwise the
//                      initial gradient is filled with 1.0.
// @param retain_graph  when false, ClearTensorWrappers() is called on every
//                      node right after it has run.
// @param create_graph  forwarded to each grad node call, to the gradient
//                      accumulation and to GeneralGrad::GetResults().
// @param inputs        when non-empty, switches to "general grad" mode: the
//                      starting nodes are copied through GeneralGrad and only
//                      nodes GeneralGrad reports as needed are scheduled.
// @param allow_unused  forwarded to GeneralGrad::GetResults().
// @param no_grad_vars  forwarded to GeneralGrad::PreparedForGeneralGrad().
// @return an empty vector in normal mode; in general grad mode whatever
//         GeneralGrad::GetResults() returns.
//
// Raises Fatal errors on a grad_tensors/tensors size mismatch, on a node
// scheduled without a GradTensorHolder, on an edge/output count mismatch and
// on a negative in-degree.
std::vector<paddle::experimental::Tensor> RunBackward(
    const std::vector<paddle::experimental::Tensor>& tensors,  // output
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph,
    bool create_graph = false,
    const std::vector<paddle::experimental::Tensor>& inputs = {},
    bool allow_unused = false,
    const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
  VLOG(3) << "Start Backward";

  // *Gradient Hook should happen at node-level
  // *Inplace version check should perform at node-level
  // *Cross-batch accumulation happens at forward pass

  // GeneralGrad
  const bool is_general_grad = !inputs.empty();
  if (is_general_grad) GeneralGrad::Instance().Clear();

  /* --- Initialization --- */
  // 1. Init queue with starting nodes
  // 2. Prepare initial input buffers
  std::deque<GradNodeBase*> queue;
  std::deque<GradNodeBase*> orig_queue;
  std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
      node_input_buffers_dict;
  for (size_t i = 0; i < tensors.size(); i++) {
    const paddle::experimental::Tensor& tensor = tensors[i];

    AutogradMeta* auto_grad_meta = EagerUtils::nullable_autograd_meta(tensor);
    if (auto_grad_meta == nullptr) {
      VLOG(5) << "Skip auto grad since there is no grad op for var or loss is "
                 "stop_gradient=True: "
              << tensor.name();
      continue;
    }
    // Get grad input info from target tensors
    auto input_info = auto_grad_meta->OutRankInfo();

    VLOG(5) << "Out Rank of Tensor is slot: " << input_info.first
            << ", rank: " << input_info.second;
    // Get target GradNodeBase from target tensors
    auto shared_grad_node = auto_grad_meta->GetMutableGradNode();

    if (shared_grad_node == nullptr || auto_grad_meta->StopGradient()) {
      VLOG(5) << "Skip auto grad since there is no grad op for var or loss is "
                 "stop_gradient=True: "
              << tensor.name();
      continue;
    }

    // TODO(zhanlve): Copy and Modify GradNode if is_general_grad
    GradNodeBase* grad_node = shared_grad_node.get();
    if (is_general_grad) {
      // Save orig grad node
      orig_queue.push_back(grad_node);

      // Replace grad_node with copied grad_node
      grad_node = GeneralGrad::Instance().CopyGradNode(shared_grad_node);

      // Record potential startup grad node
      GeneralGrad::Instance().GetPotentialStartupNodes()->insert(grad_node);
    }

    // Prepare GradTensorHolder
    if (!node_input_buffers_dict.count(grad_node)) {
      VLOG(5) << "Create Value for grad input tensor " << i
              << " of grad node: " << grad_node->name();
      node_input_buffers_dict[grad_node] =
          std::make_unique<GradTensorHolder>(grad_node->InputMeta());
    }

    bool copy_from_grad_t = false;
    if (!grad_tensors.empty()) {
      // Validate the size BEFORE indexing grad_tensors[i]; otherwise a
      // grad_tensors shorter than tensors would be read out of bounds.
      PADDLE_ENFORCE(
          grad_tensors.size() == tensors.size(),
          paddle::platform::errors::Fatal(
              "Detected size mismatch between tensors and grad_tensors, "
              "grad_tensors should either have "
              "size = 0 or same size as tensors."));
      copy_from_grad_t = grad_tensors[i].initialized();
    }

    if (copy_from_grad_t) {
      // Feed given tensor if it's provided
      VLOG(3) << "Fill grad input tensor " << i << "with give grad tensor";

      // Deep copy
      node_input_buffers_dict[grad_node]->CopyValueFromTensor(
          input_info.first, input_info.second, grad_tensors[i]);
    } else {
      VLOG(3) << "Fill grad input tensor " << i << " with 1.0";
      // Initialize tensor with 1.0
      // Forward Tensor "tensor" is passed to indicate tensortype, datatype and
      // dims
      // GradTensorHolder will initialize another tensor with same tensortype,
      // datatype and dims but filled with 1.0
      node_input_buffers_dict[grad_node]->CopyValueFromTensor(
          input_info.first, input_info.second, tensor, /*fill_one=*/true);
    }

    // Prepare queue, potential startup_nodes
    queue.push_back(grad_node);
  }

  if (is_general_grad) {
    // Prepare several vital preprocess for GeneralGrad
    GeneralGrad::Instance().PreparedForGeneralGrad(
        inputs, no_grad_vars, orig_queue, &queue, node_input_buffers_dict);
  }

  VLOG(5) << "Update In degree Map for backward";
  // 3. Compute in_degree for each node
  std::unordered_map<GradNodeBase*, int> node_in_degree_map =
      getInDegreeMap(queue);

  VLOG(5) << "Startup_ops's size is " << queue.size();

  /* --- Topological Visit --- */
  // 1. Pop queue
  // 2. Run node
  //    |- Check and capture target result
  //    |- node(grads)
  //    |- Prepare for next node
  // 3. Update queue
  while (!queue.empty()) {
    GradNodeBase* node = queue.front();
    VLOG(3) << "Preparing GradNode:" << node->name() << " addr:" << node;
    VLOG(4) << EagerUtils::GradNodeStr(*node);
    paddle::platform::RecordEvent node_record_event(
        std::string((*node).name()),
        paddle::platform::TracerEventType::Operator,
        1);

    // A node that still has pending incoming gradients is dropped from the
    // queue here (unless it is the only entry left); it is pushed again once
    // its in-degree reaches zero.
    if (queue.size() > 1 && node_in_degree_map[node] != 0) {
      queue.pop_front();
      continue;
    }
    queue.pop_front();

    // Run node: This is where Hook happens
    auto node_input_buffer_iter = node_input_buffers_dict.find(node);
    PADDLE_ENFORCE_NE(
        node_input_buffer_iter,
        node_input_buffers_dict.end(),
        paddle::platform::errors::Fatal(
            "Unable to find next node in the GradTensorHolder \n"
            "Trying to run Node without configuring its GradTensorHolder."));

    std::unique_ptr<GradTensorHolder> node_input_buffer =
        std::move(node_input_buffer_iter->second);

    // Check input
    EnforceGradNodeHasInput(node);

    VLOG(7) << "Run Backward Kernel with GradTensorHolder.";
    // Run Pre Backward Node and get outputs
    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                         kSlotSmallVectorSize>
        grad_output_tensors = (*node)(
            node_input_buffer->Buffers(), create_graph, is_general_grad);

    if (is_general_grad) {
      GeneralGrad::Instance().SetResultForEnddingNodes(grad_output_tensors,
                                                       node);
    }

    // retain_graph or not
    if (!retain_graph) {
      VLOG(3)
          << "retain_graph is false, need to clear the TensorWrapper of nodes.";
      node->ClearTensorWrappers();
    }

    // TODO(jiabin): Should we erase it or find a more efficient way.
    node_input_buffers_dict.erase(node_input_buffer_iter);

    // Prepare GradTensorHolder for next node
    const paddle::small_vector<std::vector<GradSlotMeta>, kSlotSmallVectorSize>&
        metas = node->OutputMeta();
    PADDLE_ENFORCE(metas.size() == grad_output_tensors.size() || metas.empty(),
                   paddle::platform::errors::Fatal(
                       "Number of edges should be either empty ( for leaf node "
                       ") or the same as number of output grad tensors, but we "
                       "got edges size is: %d, grad_output size is: %d",
                       metas.size(),
                       grad_output_tensors.size()));

    for (size_t i = 0; i < metas.size(); i++) {
      for (size_t j = 0; j < metas[i].size(); j++) {
        const Edge& edge = metas[i][j].GetEdge();
        if (!edge.IsInitialized()) {
          continue;
        }
        auto edge_rank = edge.GetEdgeRankInfo();
        // Since we make edge has as same rank as bwd outputs, we indexing them
        // with the same rank(i, j)
        auto next_node_shared = edge.GetMutableGradNode();

        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
        // This check must precede the log below, which dereferences the
        // pointer.
        if (!next_node_shared) {
          continue;
        }

        VLOG(3) << "Node: " << node->name() << " addr:" << node
                << ", Found pending node: " << next_node_shared->name()
                << " addr: " << next_node_shared.get();

        if (grad_output_tensors[i].empty()) {
          continue;
        }

        PADDLE_ENFORCE_LT(
            j,
            grad_output_tensors[i].size(),
            paddle::platform::errors::Fatal(
                "Rank of grad_output_tensors should be less than "
                "grad_output_tensors[i].size(), which is: %d. This error may "
                "indicate autoprune or autograd api error. ",
                grad_output_tensors[i].size()));
        paddle::experimental::Tensor& grad_output_tensor =
            grad_output_tensors[i][j];

        if ((!grad_output_tensor.defined() ||
             !grad_output_tensor.initialized())) {
          VLOG(7) << "We get grad_output_tensor with slot: " << i
                  << ", rank: " << j << " as uninitialized or undefined tensor";
        }

        VLOG(7) << "Get Edge and grad_output_tensor with slot: " << i
                << ", rank: " << j
                << " 's name is: " << grad_output_tensor.name();

        auto* next_node = next_node_shared.get();
        if (!node_input_buffers_dict.count(next_node)) {
          const auto& input_meta = next_node->InputMeta();
          auto grad_tensor_holder =
              std::make_unique<GradTensorHolder>(input_meta);
          VLOG(7) << "Construct GradTensorHolder for grad node: "
                  << next_node->name();
          node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
        }

        VLOG(3) << "Sum grad inputs for edge slot: " << edge_rank.first
                << ", rank: " << edge_rank.second;

        node_input_buffers_dict[next_node]->add(edge_rank.first,
                                                edge_rank.second,
                                                grad_output_tensor,
                                                create_graph);

        // Update queue
        // One lookup is enough: references into an unordered_map stay valid,
        // and nothing is inserted into this map while the reference is used.
        int& next_in_degree = node_in_degree_map[next_node];
        --next_in_degree;
        VLOG(7) << next_node->name() << " ref_cnt is: " << next_in_degree;

        PADDLE_ENFORCE(
            next_in_degree >= 0,
            paddle::platform::errors::Fatal(
                "Detected in-degree value smaller than zero. For Node: %s. "
                "Node's in-degree cannot be negative.",
                next_node->name()));

        // A node becomes runnable once all of its incoming gradients have
        // been accumulated; in general grad mode it must additionally be one
        // of the nodes GeneralGrad needs.
        const bool ready_to_run =
            next_in_degree == 0 &&
            (!is_general_grad ||
             GeneralGrad::Instance().IsNeededNodes(next_node));
        if (ready_to_run) {
          // Accumulation nodes jump the queue; everything else is appended.
          if (dynamic_cast<egr::GradNodeAccumulation*>(next_node)) {
            queue.push_front(next_node);
          } else {
            queue.push_back(next_node);
          }
        }
      }
    }
  }

  VLOG(7) << "Run Backward Final hook size: "
          << egr::Controller::Instance().FinalBackwardHooks().size();
  for (auto& hook : egr::Controller::Instance().FinalBackwardHooks()) {
    (*hook)();
  }
  egr::Controller::Instance().ClearFinalBackwardHooks();

  // Logged before the returns below so that it is actually reachable.
  VLOG(3) << "Finish Backward";
  if (!is_general_grad) return {};
  return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph);
}

395
void Backward(
396
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
397 398
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph) {
399
  VLOG(3) << "Run in Backward";
400
  paddle::platform::RecordEvent backward_record_event(
401
      "backward", paddle::platform::TracerEventType::UserDefined, 1);
402
  RunBackward(tensors, grad_tensors, retain_graph);
J
Jiabin Yang 已提交
403
  phi::autotune::AutoTuneStatus::Instance().Update();
404 405 406
}

std::vector<paddle::experimental::Tensor> Grad(
407
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
408 409
    const std::vector<paddle::experimental::Tensor>& inputs,
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
410 411 412 413
    bool retain_graph,
    bool create_graph,
    bool only_inputs,
    bool allow_unused,
414
    const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
415
  VLOG(3) << "Run in Grad";
416 417 418 419

  DuplicateCheck(inputs, true /* is_input */);
  DuplicateCheck(tensors, false /* is_input */);

420 421 422 423 424 425 426
  return RunBackward(tensors,
                     grad_tensors,
                     retain_graph,
                     create_graph,
                     inputs,
                     allow_unused,
                     no_grad_vars);
427
}
428
}  // namespace egr