// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/eager/general_grad.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"

namespace egr {

22
std::unordered_map<GradNodeBase*, int> getInDegreeMap(
23
    const std::deque<GradNodeBase*>& init_queue) {
24
  // Calculate in_degree for each node
25 26
  // We can completely remove this pass, if in_degree were set during forward
  // pass
27 28 29
  std::unordered_map<GradNodeBase*, int> node_in_degree_map;

  // Copy nodes
30
  std::deque<GradNodeBase*> queue = init_queue;
31 32 33 34 35
  std::unordered_set<GradNodeBase*> visited;

  // Visit each node exactly once in any order
  while (!queue.empty()) {
    GradNodeBase* node = queue.front();
36
    queue.pop_front();
37 38 39 40 41 42

    if (visited.count(node)) {
      continue;
    }
    visited.insert(node);

43 44 45 46 47
    PADDLE_ENFORCE_NOT_NULL(
        node,
        paddle::platform::errors::Fatal(
            "We got null node when we traverse the backward graph, and this "
            "should not happened please check your code and contact us."));
48
    // Find and append next nodes
49 50 51 52 53
    const paddle::small_vector<std::vector<GradSlotMeta>, kSlotSmallVectorSize>&
        metas = node->OutputMeta();
    for (const auto& meta_list : metas) {
      for (const GradSlotMeta& meta : meta_list) {
        const auto& edge = meta.GetEdge();
54 55 56 57 58 59 60 61 62 63
        GradNodeBase* next_node = edge.GetMutableGradNode().get();
        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
        if (!next_node) continue;

        // Update in_degree
        if (!node_in_degree_map.count(next_node))
          node_in_degree_map[next_node] = 0;
        node_in_degree_map[next_node]++;
64
        queue.push_back(next_node);
65 66 67
      }
    }
  }
68

69
  return node_in_degree_map;
70 71 72 73 74
}

// Enforce GradNode has TensorWrappers as Input.
//
// Raises a Fatal error naming the node when node->IsTensorWrappersCleared()
// reports true. RunBackward clears the wrappers of every executed node when
// retain_graph is false, which is the situation the message describes.
void EnforceGradNodeHasInput(GradNodeBase* node) {
  PADDLE_ENFORCE_NE(
      node->IsTensorWrappersCleared(),
      true,
      paddle::platform::errors::Fatal(
          "The TensorWrappers of %s do not exist. This may be because:\n"
          "You calculate backward twice for the same subgraph without "
          "setting retain_graph=True. Please set retain_graph=True in the "
          "first backward/grad call.\n",
          node->name()));
}

85 86 87 88 89 90 91
void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
                    bool is_input) {
  std::unordered_set<AutogradMeta*> visisted_ins;
  std::string msg = is_input ? "inputs" : "outputs";
  for (auto in : inputs) {
    AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
    PADDLE_ENFORCE_EQ(
92 93
        visisted_ins.count(auto_grad_meta),
        0,
94
        paddle::platform::errors::AlreadyExists(
95 96 97 98
            "%s contain duplicate tensor %s, please check %s carefully.",
            msg,
            in.name(),
            msg));
99
    visisted_ins.insert(auto_grad_meta);
100 101 102
  }
}

103 104
GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();

105 106 107
std::vector<paddle::experimental::Tensor> RunBackward(
    const std::vector<paddle::experimental::Tensor>& tensors,  // output
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
108 109
    bool retain_graph,
    bool create_graph = false,
110 111 112
    const std::vector<paddle::experimental::Tensor>& inputs = {},
    bool allow_unused = false,
    const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
113
  VLOG(3) << "Start Backward";
114

115 116 117 118
  // *Gradient Hook should happen at node-level
  // *Inplace version check should perform at node-level
  // *Cross-batch accumulation happens at forward pass

119 120
  // GeneralGrad
  bool is_general_grad = !inputs.empty();
121
  if (is_general_grad) GeneralGrad::Instance().Clear();
122

123 124 125
  /* --- Initialization --- */
  // 1. Init queue with starting nodes
  // 2. Prepare initial input buffers
126 127
  std::deque<GradNodeBase*> queue;
  std::deque<GradNodeBase*> orig_queue;
128 129 130
  std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
      node_input_buffers_dict;
  for (size_t i = 0; i < tensors.size(); i++) {
131
    const paddle::experimental::Tensor& tensor = tensors[i];
132

133 134
    AutogradMeta* auto_grad_meta = EagerUtils::nullable_autograd_meta(tensor);
    if (auto_grad_meta == nullptr) {
J
Jiabin Yang 已提交
135
      VLOG(5) << "Skip auto grad since there is no grad op for var or loss is "
136 137 138 139
                 "stop_gradient=True: "
              << tensor.name();
      continue;
    }
140 141 142
    // Get grad input info from target tensors
    auto input_info = auto_grad_meta->OutRankInfo();

J
Jiabin Yang 已提交
143
    VLOG(5) << "Out Rank of Tensor is slot: " << input_info.first
144 145
            << ", rank: " << input_info.second;
    // Get target GradNodeBase from target tensors
146 147 148 149
    auto shared_grad_node = auto_grad_meta->GetMutableGradNode();

    if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
        auto_grad_meta->StopGradient()) {
J
Jiabin Yang 已提交
150
      VLOG(5) << "Skip auto grad since there is no grad op for var or loss is "
151 152 153 154 155
                 "stop_gradient=True: "
              << tensor.name();
      continue;
    }

156
    // TODO(zhanlve): Copy and Modify GradNode if is_general_grad
157
    GradNodeBase* grad_node = shared_grad_node.get();
158 159
    if (is_general_grad) {
      // Save orig grad node
160
      orig_queue.push_back(grad_node);
161 162 163 164 165 166 167

      // Replace grad_node with copied grad_node
      grad_node = GeneralGrad::Instance().CopyGradNode(shared_grad_node);

      // Record potential startup grad node
      GeneralGrad::Instance().GetPotentialStartupNodes()->insert(grad_node);
    }
168 169 170

    // Prepare GradTensorHolder
    if (!node_input_buffers_dict.count(grad_node)) {
J
Jiabin Yang 已提交
171
      VLOG(5) << "Create Value for grad input tensor " << i
172
              << " of grad node: " << grad_node->name();
173 174 175
      node_input_buffers_dict[grad_node] =
          std::make_unique<GradTensorHolder>(grad_node->InputMeta());
    }
176 177 178
    bool copy_from_grad_t =
        grad_tensors.size() > 0 && grad_tensors[i].initialized();
    if (copy_from_grad_t) {
179 180 181 182 183
      PADDLE_ENFORCE(
          grad_tensors.size() == tensors.size(),
          paddle::platform::errors::Fatal(
              "Detected size mismatch between tensors and grad_tensors"
              "grad_tensors should either have "
184
              "size = 0 or same size as tensors."));
185
      // Feed given tensor if it's provided
J
Jiabin Yang 已提交
186
      VLOG(3) << "Fill grad input tensor " << i << "with give grad tensor";
187

188 189 190
      // Deep copy
      node_input_buffers_dict[grad_node]->CopyValueFromTensor(
          input_info.first, input_info.second, grad_tensors[i]);
191
    } else {
J
Jiabin Yang 已提交
192
      VLOG(3) << "Fill grad input tensor " << i << " with 1.0";
193 194 195 196 197
      // Initialize tensor with 1.0
      // Forward Tensor "tensor" is passed to indicate tensortype, datatype and
      // dims
      // GradTensorHolder will initialize another tensor with same tensortype,
      // datatype and dims but filled with 1.0
198
      node_input_buffers_dict[grad_node]->CopyValueFromTensor(
199
          input_info.first, input_info.second, tensor, /*fill_one=*/true);
200 201
    }

202
    // Prepare queue, potential startup_nodes
203
    queue.push_back(grad_node);
204 205 206
  }

  if (is_general_grad) {
207 208 209
    // Prepare several vital preprocess for GeneralGrad
    GeneralGrad::Instance().PreparedForGeneralGrad(
        inputs, no_grad_vars, orig_queue, &queue, node_input_buffers_dict);
210 211
  }

J
Jiabin Yang 已提交
212
  VLOG(5) << "Update In degree Map for backward";
213 214 215 216
  // 3. Compute in_degree for each node
  std::unordered_map<GradNodeBase*, int> node_in_degree_map =
      getInDegreeMap(queue);

J
Jiabin Yang 已提交
217
  VLOG(5) << "Startup_ops's size is " << queue.size();
218

219 220 221
  /* --- Topological Visit --- */
  // 1. Pop queue
  // 2. Run node
222
  //    |- Check and capture target result
223 224 225
  //    |- node(grads)
  //    |- Prepare for next node
  // 3. Update queue
226 227
  while (!queue.empty()) {
    GradNodeBase* node = queue.front();
J
Jiabin Yang 已提交
228 229
    VLOG(3) << "Preparing GradNode:" << node->name() << " addr:" << node;
    VLOG(4) << EagerUtils::GradNodeStr(*node);
230
    paddle::platform::RecordEvent node_record_event(
231
        std::string((*node).name()),
232 233
        paddle::platform::TracerEventType::Operator,
        1);
234

235
    if (queue.size() > 1 && node_in_degree_map[node] != 0) {
236
      queue.pop_front();
237 238
      continue;
    }
239
    queue.pop_front();
240

241
    // Run node: This is where Hook happens
242 243
    auto node_input_buffer_iter = node_input_buffers_dict.find(node);
    PADDLE_ENFORCE_NE(
244 245
        node_input_buffer_iter,
        node_input_buffers_dict.end(),
246
        paddle::platform::errors::Fatal(
247
            "Unable to find next node in the GradTensorHolder \n"
248
            "Trying to run Node without configuring its GradTensorHolder."));
249 250

    std::unique_ptr<GradTensorHolder> node_input_buffer =
251
        std::move(node_input_buffer_iter->second);
252

253
    // Check input
254 255
    EnforceGradNodeHasInput(node);

J
Jiabin Yang 已提交
256
    VLOG(7) << "Run Backward Kernel with GradTensorHolder.";
257
    // Run Pre Backward Node and get outputs
258 259
    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                         kSlotSmallVectorSize>
260 261
        grad_output_tensors = (*node)(
            node_input_buffer->Buffers(), create_graph, is_general_grad);
262

263 264 265 266 267
    if (!inputs.empty() && is_general_grad) {
      GeneralGrad::Instance().SetResultForEnddingNodes(grad_output_tensors,
                                                       node);
    }

268 269
    // retain_grad or not
    if (!retain_graph) {
J
Jiabin Yang 已提交
270
      VLOG(3)
271 272 273 274
          << "retain_graph is false, need to clear the TensorWrapper of nodes.";
      node->ClearTensorWrappers();
    }

275
    // TODO(jiabin): Should we erase it or find a more efficient way.
276
    node_input_buffers_dict.erase(node_input_buffer_iter);
277 278

    // Prepare GradTensorHolder for next node
279 280 281
    const paddle::small_vector<std::vector<GradSlotMeta>, kSlotSmallVectorSize>&
        metas = node->OutputMeta();
    PADDLE_ENFORCE(metas.size() == grad_output_tensors.size() || metas.empty(),
282 283
                   paddle::platform::errors::Fatal(
                       "Number of edges should be either empty ( for leaf node "
284 285
                       ") or the same as number of output grad tensors, but we "
                       "got edges size is: %d, grad_output size is: %d",
286 287
                       metas.size(),
                       grad_output_tensors.size()));
288

289 290 291
    for (size_t i = 0; i < metas.size(); i++) {
      for (size_t j = 0; j < metas[i].size(); j++) {
        const Edge& edge = metas[i][j].GetEdge();
J
Jiabin Yang 已提交
292 293 294
        if (!edge.IsInitialized()) {
          continue;
        }
295 296
        auto edge_rank = edge.GetEdgeRankInfo();
        // Since we make edge has as same rank as bwd outputs, we indexing them
297
        // with the same rank(i, j)
298
        auto next_node_shared = edge.GetMutableGradNode();
299 300 301
        VLOG(3) << "Node: " << node->name() << " addr:" << node
                << ", Found pending node: " << next_node_shared->name()
                << " addr: " << next_node_shared.get();
302 303 304
        // Next node could be nullptr if it is leaf tensor with no
        // AccumulationNode attached
        // Or it could also originated from dispensable inputs
305 306 307 308
        if (!next_node_shared || !next_node_shared.get() ||
            grad_output_tensors[i].empty()) {
          continue;
        }
309

310
        PADDLE_ENFORCE_LT(
311 312
            j,
            grad_output_tensors[i].size(),
313 314 315 316 317
            paddle::platform::errors::Fatal(
                "Rank of grad_output_tensors should be less than "
                "grad_output_tensors[i].size(), which is: %d. This error may "
                "indicate autoprune or autograd api error. ",
                grad_output_tensors.size()));
318 319
        paddle::experimental::Tensor& grad_output_tensor =
            grad_output_tensors[i][j];
320 321 322

        if ((!grad_output_tensor.defined() ||
             !grad_output_tensor.initialized())) {
J
Jiabin Yang 已提交
323
          VLOG(7) << "We get grad_output_tensor with slot: " << i
324
                  << ", rank: " << j << " as uninitialized or undefined tensor";
325
        }
326

J
Jiabin Yang 已提交
327
        VLOG(7) << "Get Edge and grad_output_tensor with slot: " << i
328 329 330
                << ", rank: " << j
                << " 's name is: " << grad_output_tensor.name();

331 332 333 334 335
        auto* next_node = next_node_shared.get();
        if (!node_input_buffers_dict.count(next_node)) {
          const auto& input_meta = next_node->InputMeta();
          auto grad_tensor_holder =
              std::make_unique<GradTensorHolder>(input_meta);
J
Jiabin Yang 已提交
336
          VLOG(7) << "Construct GradTensorHolder for grad node: "
337 338 339 340
                  << next_node->name();
          node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
        }

J
Jiabin Yang 已提交
341
        VLOG(3) << "Sum grad inputs for edge slot: " << edge_rank.first
342
                << ", rank: " << edge_rank.second;
343

344 345 346 347
        node_input_buffers_dict[next_node]->add(edge_rank.first,
                                                edge_rank.second,
                                                grad_output_tensor,
                                                create_graph);
348 349 350

        // Update queue
        node_in_degree_map[next_node]--;
J
Jiabin Yang 已提交
351
        VLOG(7) << next_node->name()
352
                << " ref_cnt is: " << node_in_degree_map[next_node];
353

354 355 356 357
        PADDLE_ENFORCE(
            node_in_degree_map[next_node] >= 0,
            paddle::platform::errors::Fatal(
                "Detected in-degree value smaller than zero. For Node: %s"
358
                "Node's in-degree cannot be negative.",
359
                next_node->name()));
360

361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
        if (is_general_grad) {
          if (node_in_degree_map[next_node] == 0 &&
              GeneralGrad::Instance().IsNeededNodes(next_node)) {
            if (dynamic_cast<egr::GradNodeAccumulation*>(next_node)) {
              queue.push_front(std::move(next_node));
            } else {
              queue.push_back(std::move(next_node));
            }
          }
        } else {
          if (node_in_degree_map[next_node] == 0) {
            if (dynamic_cast<egr::GradNodeAccumulation*>(next_node)) {
              queue.push_front(std::move(next_node));
            } else {
              queue.push_back(std::move(next_node));
            }
377
          }
378 379 380 381
        }
      }
    }
  }
382

J
Jiabin Yang 已提交
383
  VLOG(7) << "Run Backward Final hook size: "
384 385 386 387 388
          << egr::Controller::Instance().FinalBackwardHooks().size();
  for (auto& hook : egr::Controller::Instance().FinalBackwardHooks()) {
    (*hook)();
  }
  egr::Controller::Instance().ClearFinalBackwardHooks();
389
  if (!is_general_grad) return {};
J
Jiabin Yang 已提交
390
  VLOG(3) << "Finish Backward";
391
  return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph);
392 393
}

394
void Backward(
395
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
396 397
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
    bool retain_graph) {
398
  VLOG(3) << "Run in Backward";
399
  paddle::platform::RecordEvent backward_record_event(
400
      "backward", paddle::platform::TracerEventType::UserDefined, 1);
401
  RunBackward(tensors, grad_tensors, retain_graph);
J
Jiabin Yang 已提交
402
  phi::autotune::AutoTuneStatus::Instance().Update();
403 404 405
}

std::vector<paddle::experimental::Tensor> Grad(
406
    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
407 408
    const std::vector<paddle::experimental::Tensor>& inputs,
    const std::vector<paddle::experimental::Tensor>& grad_tensors,
409 410 411 412
    bool retain_graph,
    bool create_graph,
    bool only_inputs,
    bool allow_unused,
413
    const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
414
  VLOG(3) << "Run in Grad";
415 416 417 418

  DuplicateCheck(inputs, true /* is_input */);
  DuplicateCheck(tensors, false /* is_input */);

419 420 421 422 423 424 425
  return RunBackward(tensors,
                     grad_tensors,
                     retain_graph,
                     create_graph,
                     inputs,
                     allow_unused,
                     no_grad_vars);
426
}
}  // namespace egr