grad_tensor_holder.cc 9.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/grad_tensor_holder.h"

17
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
18
#include "paddle/fluid/framework/convert_utils.h"
19
#include "paddle/fluid/framework/var_type.h"
20
#include "paddle/fluid/imperative/gradient_accumulator.h"
21
#include "paddle/phi/core/sparse_coo_tensor.h"
22
#include "paddle/phi/kernels/funcs/math_function.h"
23 24 25 26
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
#endif
27 28 29

namespace egr {

30
// Overwrites the tensor held at buffer_[slot_id][rank] with the result of
// zeros_like() applied to the tensor currently stored there.
//
// No range checks are performed: slot_id and rank must already be valid
// indices into buffer_.
void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) {
  // Original author's note: the non-grad var is set to zero and its
  // stop-gradient flag is left at the default value (true).
  paddle::Tensor& slot_tensor = buffer_[slot_id][rank];
  slot_tensor = paddle::experimental::zeros_like(slot_tensor);
}

36 37 38 39
// Seeds the grad slot buffer_[slot_id][rank] from tensor `t`.
//
//   slot_id   index into buffer_; must be < buffer_.size().
//   rank      index inside the slot; must be < buffer_[slot_id].size().
//   t         source tensor.
//   fill_one  false: copy `t` into the slot via copy_() and, when `t` carries
//                    autograd meta, forward its grad node (if non-null) and
//                    weak grad. The slot must not be initialized yet.
//             true:  if `t` is defined, store a tensor filled with 1 that has
//                    t's shape, dtype and place; an undefined `t` leaves the
//                    slot's tensor untouched.
//
// If the addressed slot is empty the call is a no-op. On every other
// non-throwing path the slot's autograd meta ends with stop_gradient = false.
//
// Raises a Fatal error (PADDLE_ENFORCE / PADDLE_THROW) when slot_id or rank is
// out of range, when fill_one is false and the slot is already initialized,
// or when fill_one is true and `t` is of an unsupported tensor kind.
void GradTensorHolder::CopyValueFromTensor(size_t slot_id,
                                           size_t rank,
                                           const paddle::Tensor& t,
                                           bool fill_one) {
  // TODO(jiabin): We need to deal with empty input_buffer with slot size not
  // empty;
  PADDLE_ENFORCE(slot_id < buffer_.size(),
                 paddle::platform::errors::Fatal(
                     "Invalid slot_id for "
                     "GradTensorHolder::CopyValueFromTensor() "
                     "which exceeds size of buffer"));
  VLOG(6) << "Copy Tensor for buffer_ slot: " << slot_id
          << ", size: " << buffer_[slot_id].size();
  if (buffer_[slot_id].empty()) {
    VLOG(6) << "Pass copy Tensor for buffer_ slot: " << slot_id
            << " since its buffer_ is empty ";
    return;
  }
  PADDLE_ENFORCE(
      rank < buffer_[slot_id].size(),
      paddle::platform::errors::Fatal(
          "Invalid rank for GradTensorHolder::CopyValueFromTensor() which "
          "exceeds size of buffer slot %d, got slot size is: %d rank is: %d",
          slot_id,
          buffer_[slot_id].size(),
          rank));

  if (!fill_one) {
    paddle::Tensor& buffer_tensor = buffer_[slot_id][rank];
    // Copying is only legal into a slot that has not been populated yet.
    if (buffer_tensor.defined() && buffer_tensor.initialized()) {
      PADDLE_THROW(paddle::platform::errors::Fatal(
          "Cannot copy grad_tensors' value to grad tensor holders, "
          "input buffer has already been initialized."));
    }

    // Perform deep copy here
    buffer_tensor.copy_(t, t.place(), false);

    // Carry over the autograd linkage of the source tensor, if it has any.
    auto* meta = egr::EagerUtils::autograd_meta(&buffer_tensor);
    auto* origin_meta = egr::EagerUtils::nullable_autograd_meta(t);
    if (origin_meta) {
      auto grad_node = origin_meta->GetMutableGradNode();
      if (grad_node) {
        meta->SetGradNode(grad_node);
      }
      meta->WeakGrad() = origin_meta->WeakGrad();
    }
  } else {
    // Create new tensor->impl and fill it with 1.0
    if (t.defined()) {
      // Fill 1.0, use full to support complex, one_like don't support it.
      if (t.is_dense_tensor()) {
        buffer_[slot_id][rank] =
            paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
      } else if (t.is_sparse_csr_tensor() || t.is_sparse_coo_tensor()) {
        buffer_[slot_id][rank] =
            paddle::experimental::sparse::full_like(t, 1, t.dtype());
#ifdef PADDLE_WITH_DISTRIBUTE
      } else if (t.is_dist_tensor()) {
        VLOG(6) << "Create a new dist tensor.";
        // TODO(chenweihang): we need a shard_tensor API in C++
        // TODO(chenweihang): replace by valid dist_attr later
        // Build a dense tensor of ones, then wrap it in a DistTensor that
        // uses a default-constructed TensorDistAttr.
        auto temp =
            paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
        auto dense_temp =
            std::dynamic_pointer_cast<phi::DenseTensor>(temp.impl());
        auto dist_tensor = std::make_shared<phi::distributed::DistTensor>(
            dense_temp,
            dense_temp->meta(),
            std::make_shared<
                phi::distributed::auto_parallel::TensorDistAttr>());
        temp.set_impl(dist_tensor);
        buffer_[slot_id][rank] = temp;
#endif
      } else {
        PADDLE_THROW(paddle::platform::errors::Fatal(
            "Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR "
            "now."));
      }
    }
  }

  // The seeded grad tensor must take part in gradient computation.
  egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank]))
      ->SetStopGradient(false);
}

118 119
void GradTensorHolder::add(size_t slot_id,
                           size_t rank,
120
                           const paddle::Tensor& t,
121
                           bool create_graph) {
122 123 124 125 126
  if (!t.initialized()) {
    VLOG(3) << "No need to do accumulate for uninitialized t.";
    return;
  }  // TODO(jiabin): Remove this when we fix all kernel.

127 128 129 130 131 132 133 134 135 136 137 138 139 140
  PADDLE_ENFORCE(slot_id < buffer_.size(),
                 paddle::platform::errors::Fatal(
                     "Invalid slot_id for GradTensorHolder::add() "
                     "which exceeds size of buffer"));
  if (buffer_[slot_id].empty()) {
    VLOG(6) << "Pass add Tensor for buffer_ slot: " << slot_id
            << " since its buffer_ is empty ";
    return;
  }
  PADDLE_ENFORCE(
      rank < buffer_[slot_id].size(),
      paddle::platform::errors::Fatal(
          "Invalid rank for GradTensorHolder::add() which exceeds size "
          "of buffer slot %d, got slot size is: %d rank is: %d",
141 142 143
          slot_id,
          buffer_[slot_id].size(),
          rank));
144

145
  paddle::Tensor& buffer_tensor = buffer_[slot_id][rank];
146 147 148 149 150 151 152
  // TODO(jiabin): Code bellow is ugly to divide which inner var we used,
  // remove framework::Variable
  // related code later.
  // This if statement is trying to test neither phi::Tensor nor
  // framework::Variable is initialized.
  if ((!buffer_tensor.defined() || !buffer_tensor.initialized())) {
    // Simply copy tensor->impl
J
Jiabin Yang 已提交
153 154
    VLOG(6) << "Move Tensor for buffer_ slot: " << slot_id
            << ", size: " << buffer_[slot_id].size();
155 156
    buffer_tensor = t;
  } else {
J
Jiabin Yang 已提交
157 158
    VLOG(6) << "Add Tensor for buffer_ slot: " << slot_id
            << ", size: " << buffer_[slot_id].size();
159
    // Accumulation
160 161
    PADDLE_ENFORCE_EQ(t.initialized(),
                      true,
162 163 164 165 166
                      paddle::platform::errors::Fatal(
                          "We can only accumulate initialized tensor, but we "
                          "got tensor: %s is empty please check you network "
                          "and make sure it creates grads.",
                          t.name()));
167

168 169
    if (t.is_dense_tensor()) {
      if (buffer_tensor.is_dense_tensor()) {
170
        if (create_graph || t.is_custom_device()) {
J
Jiabin Yang 已提交
171
          buffer_tensor = add_ad_func(t, buffer_tensor);
172
        } else {
173
          paddle::imperative::TensorAdd<paddle::Tensor>(t, &buffer_tensor);
174
        }
175 176 177 178
      } else {
        // TODO(jiabin): Support Other TensorBase later
        // TODO(zhanlve): Replace SelectedRowsAddTensor with
        // add_dygraph_function once it's supported
179 180
        paddle::Tensor new_buffer(std::make_shared<phi::DenseTensor>(),
                                  "tmp_accumulator");
181 182
        paddle::imperative::SelectedRowsAddTensor(
            buffer_tensor, t, &new_buffer);
183 184
        buffer_tensor.set_impl(new_buffer.impl());
      }
185 186
    } else if (t.is_sparse_coo_tensor()) {
      auto t_sparse = std::dynamic_pointer_cast<phi::SparseCooTensor>(t.impl());
187
      paddle::Tensor t_values(
188 189 190 191 192
          std::make_shared<phi::DenseTensor>(t_sparse->non_zero_elements()));
      // In fact, the gradient of SparseTensor is still a SparseTensor
      if (buffer_tensor.is_sparse_coo_tensor()) {
        auto buffer_sparse = std::dynamic_pointer_cast<phi::SparseCooTensor>(
            buffer_tensor.impl());
193 194
        paddle::Tensor buffer_values(std::make_shared<phi::DenseTensor>(
            buffer_sparse->non_zero_elements()));
195
        if (create_graph || t.is_custom_device()) {
J
Jiabin Yang 已提交
196
          buffer_values = add_ad_func(t_values, buffer_values);
197
        } else {
198 199
          paddle::imperative::TensorAdd<paddle::Tensor>(t_values,
                                                        &buffer_values);
200 201
        }
      }
202 203 204 205
#ifdef PADDLE_WITH_DISTRIBUTE
    } else if (t.is_dist_tensor()) {
      buffer_tensor = add_ad_func(t, buffer_tensor);
#endif
206 207 208 209 210 211 212 213
    } else {
      // TODO(jiabin): Support Other TensorBase later
      // TODO(zhanlve): Replace SelectedRowsAddTensor with add_dygraph_function
      // once it's supported
      if (buffer_tensor.is_dense_tensor()) {
        paddle::imperative::SelectedRowsAddToTensor(t, &buffer_tensor);
      } else {
        buffer_tensor =
214 215
            std::move(*paddle::imperative::SelectedRowsMerge<paddle::Tensor>(
                t, buffer_tensor));
216 217 218 219 220
      }
    }
  }
}

221
}  // namespace egr