// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/imperative/layer.h"

#include <deque>
#include <limits>
#include <map>
#include <random>
#include <utility>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/string/printf.h"

namespace paddle {
namespace imperative {

const char* PyLayer::kFwdInp = "X";
const char* PyLayer::kFwdOut = "Out";

// Registry of user-supplied Python callables for PyLayer, keyed by function
// id (see PyLayer::RegisterFunc below).
std::map<int, py::object> py_funcs_;

using framework::Variable;

namespace detail {

// Visitor that accumulates `x` into `y` (y += x) with a BLAS AXPY call,
// dispatching on the place that owns the data.
template <typename T>
class TensorAddToFunctor : public boost::static_visitor<> {
 public:
  TensorAddToFunctor(int64_t numel, const T* x, T* y)
      : numel_(numel), x_(x), y_(y) {}

  void operator()(const platform::CPUPlace& place) {
    platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
        platform::DeviceContextPool::Instance().Get(place));
    // Use the functor's element type T rather than hard-coding float.
    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
    blas.AXPY(numel_, static_cast<T>(1.), x_, y_);
  }

#ifdef PADDLE_WITH_CUDA
  void operator()(const platform::CUDAPlace& place) {
    platform::CUDADeviceContext* ctx =
        dynamic_cast<platform::CUDADeviceContext*>(
            platform::DeviceContextPool::Instance().Get(place));
    auto blas =
        operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
    blas.AXPY(numel_, static_cast<T>(1.), x_, y_);
  }
#else
  void operator()(const platform::CUDAPlace& place) {
    PADDLE_THROW("Gradient accumulation is not supported on place %s", place);
  }
#endif

  // There is no BLAS support for CUDAPinnedPlace.
  void operator()(const platform::CUDAPinnedPlace& place) {
    PADDLE_THROW("Gradient accumulation is not supported on place %s", place);
  }

 private:
  int64_t numel_;
  const T* x_;
  T* y_;
};

}  // namespace detail

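// Accumulates the gradient held in `src` into `dst` (dst += src), dispatching
// to the visitor above based on `place`. Both variables must hold LoDTensors
// of equal numel; as instantiated below, only float gradients are handled.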
void AddGradTo(Variable* src, Variable* dst, platform::Place place) {
  framework::Tensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
  framework::Tensor* src_tensor = src->GetMutable<framework::LoDTensor>();

  // FIXME(minqiyang): the loss_grad op passes a zero-sized gradient for the
  // label; skip it here as a temporary workaround.
  if (src_tensor->numel() == 0) {
    return;
  }

  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(),
                 "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(),
                 src_tensor->numel());

  detail::TensorAddToFunctor<float> func(
      src_tensor->numel(), src_tensor->data<float>(),
      dst_tensor->mutable_data<float>(place));
  boost::apply_visitor(func, place);
}

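// Drives the backward pass over the dynamically recorded graph. Starting from
// a variable's creator op, it counts how many times each predecessor op is
// depended on (ComputeDepCounts), then applies gradients in breadth-first
// order, releasing an op into the ready queue only once every op that
// consumes its output gradients has run.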
class Autograd {
 public:
  Autograd() {}

  void RunBackward(VarBase* var) {
    if (var->IsStopGradient()) {
      return;
    }
    VLOG(3) << "start autograd";

    std::deque<OpBase*> ready;
    ready.push_back(var->PreOp());

    std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->PreOp());

    while (!ready.empty()) {
      OpBase* ready_op = ready.front();
      ready.pop_front();
      std::map<std::string, std::vector<VarBase*>> input_grads =
          ready_op->ApplyGrad();

      for (auto it : input_grads) {
        const std::vector<VarBase*>& ingrads = it.second;
        for (size_t i = 0; i < ingrads.size(); ++i) {
          if (!ingrads[i]) continue;
          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
            continue;
          }
          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
          if (!pre_op) continue;

          dep_counts[pre_op] -= 1;
          PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
          bool pre_op_ready = dep_counts[pre_op] == 0;
          if (pre_op_ready) {
            ready.push_back(pre_op);
          }
        }
      }
    }
  }

 private:
  std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
    std::map<OpBase*, int> ret;

    std::deque<OpBase*> queue;
    queue.push_back(op);
    std::unordered_set<OpBase*> visited;
    visited.insert(op);
    while (!queue.empty()) {
      OpBase* candidate = queue.front();
      queue.pop_front();
      for (auto it : candidate->pre_ops_) {
        for (OpBase* pre_op : it.second) {
          if (!pre_op) continue;
          if (visited.find(pre_op) == visited.end()) {
            visited.insert(pre_op);
            queue.push_back(pre_op);
          }
          ret[pre_op] += 1;
        }
      }
    }
    return ret;
  }
};

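// Returns a CPU copy of this variable's tensor; the caller takes ownership of
// the returned LoDTensor. When the source lives on a GPU, the copy is issued
// through TensorCopy and the device context is waited on so the data is valid
// once this returns.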
framework::LoDTensor* VarBase::CopiedTensor() const {
  PADDLE_ENFORCE(var_->IsInitialized(),
                 "Variable must be initialized when getting a numpy tensor");
  platform::Place place = var_->Get<framework::LoDTensor>().place();
  framework::LoDTensor* result = new framework::LoDTensor();
  result->Resize(var_->Get<framework::LoDTensor>().dims());
  result->set_lod(var_->Get<framework::LoDTensor>().lod());
  if (platform::is_gpu_place(place)) {
    VLOG(3) << "fetch tensor " << var_desc_->Name() << " from gpu";

    framework::TensorCopy(var_->Get<framework::LoDTensor>(),
                          platform::CPUPlace(), result);

    platform::DeviceContext* dev_ctx =
        platform::DeviceContextPool::Instance().Get(place);
    dev_ctx->Wait();
  } else {
    TensorCopy(var_->Get<framework::LoDTensor>(), platform::CPUPlace(), result);
  }

  return result;
}

framework::LoDTensor& VarBase::GradValue() {
  VLOG(3) << "get var grad " << var_desc_->Name();
  return *(grads_->var_->GetMutable<framework::LoDTensor>());
}

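// Applies this op's gradient: either the registered grad op or, for a
// PyLayer, the user's Python backward function. Gradients are first computed
// into freshly allocated temporaries, then accumulated into the original
// gradient variables via AddGradTo, and the temporaries are freed.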
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
  if (!grad_op_desc_ && backward_id_ <= 0) {
    LOG(WARNING) << "op with no grad: " << op_desc_->Type();
    return {};
  }

  std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
  if (backward_id_ > 0) {
    VLOG(3) << "py_layer_grad";
    grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad(
        backward_id_,
        grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]);
  } else {
    VLOG(3) << "op grad " << grad_op_desc_->Type();
    for (auto it : grad_output_vars_) {
      auto& outputs = grad_outputs[it.first];
      for (size_t i = 0; i < it.second.size(); ++i) {
        // Allocate a new variable
        Variable* tmp_var = new framework::Variable();
        tmp_var->GetMutable<framework::LoDTensor>();
        outputs.push_back(tmp_var);
      }
    }

    framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);

    // No need to do compile time infer shape here.
    // grad_op_desc_->InferShape(*block_);
    grad_op_desc_->InferVarType(block_);

    std::unique_ptr<framework::OperatorBase> opbase =
        framework::OpRegistry::CreateOp(*grad_op_desc_);
    framework::OperatorWithKernel* op_kernel =
        dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
    PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support ops with kernels");

    framework::Scope scope;
    platform::Place place = expected_place_;
    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
    p.op.RuntimeInferShape(scope, place, ctx);
    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
  }

  for (auto it : grad_output_vars_) {
    auto& outputs = grad_outputs[it.first];
    auto& origin_outputs = it.second;
    PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());

    for (size_t i = 0; i < outputs.size(); ++i) {
      framework::Variable* grad = outputs[i];
      framework::Variable* orig_grad = origin_outputs[i];
      AddGradTo(grad, orig_grad, expected_place_);
      delete grad;
    }
  }
  return input_vars_;
}

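// Entry point of backpropagation for this variable: seeds its gradient
// tensor with ones, then hands off to Autograd to traverse the graph.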
void VarBase::RunBackward() {
  if (!pre_op_) return;

  VLOG(3) << "start backward";
  auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
  operators::math::set_constant(
      *(platform::DeviceContextPool::Instance().Get(
          var_->GetMutable<framework::LoDTensor>()->place())),
      grads_t, 1.0);

  PADDLE_ENFORCE(
      grads_ ==
      pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_);
  Autograd().RunBackward(this);
}

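// PyLayer bridges user-defined Python functions into the imperative graph.
// As wired up by the Python bindings, the expected flow is roughly: register
// forward and backward callables under integer ids with RegisterFunc, run the
// forward through Apply(func_id, inputs), and let OpBase::ApplyGrad invoke
// the backward through ApplyGrad(func_id, grad_inputs).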
void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
  py_funcs_[func_id] = py_func;
}

int PyLayer::NumFuncs() { return static_cast<int>(py_funcs_.size()); }

std::vector<VarBase*> PyLayer::Apply(int func_id,
                                     const std::vector<VarBase*>& inputs) {
  std::vector<framework::Variable*> invars;
  for (const VarBase* in : inputs) {
    invars.push_back(in->var_);
  }
  PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
  std::vector<Variable*> outvars = CallPythonFunc(py_funcs_[func_id], invars);
  std::vector<VarBase*> ret;
  for (Variable* v : outvars) {
    ret.push_back(new VarBase(v, new VarBase(true)));
  }
  return ret;
}

std::vector<Variable*> PyLayer::ApplyGrad(
    int func_id, const std::vector<framework::Variable*>& inputs) {
  PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
  return CallPythonFunc(py_funcs_[func_id], inputs);
}

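// Invokes `callable` under the Python GIL with the input tensors packed into
// a tuple (uninitialized tensors are passed as None), then converts each
// element of the returned tuple back into a framework::Variable whose
// LoDTensor shares the Python output's storage.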
std::vector<framework::Variable*> PyLayer::CallPythonFunc(
    const py::object& callable, const std::vector<framework::Variable*>& ins) {
  py::gil_scoped_acquire guard;
  py::tuple in_args(ins.size());
  for (size_t i = 0; i < ins.size(); ++i) {
    const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
    in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
  }
  VLOG(3) << "pyfunc in " << py::len(in_args);

  // TODO(panyx0718): Decide who owns the returned LoDTensor.
  auto ret = callable(in_args);
  auto ret_tuple = py::cast<py::tuple>(ret);
  size_t ret_num = py::len(ret_tuple);
  std::vector<framework::Variable*> outs;
  VLOG(3) << "pyfunc out " << ret_num;
  for (size_t i = 0; i < ret_num; ++i) {
    try {
      auto* py_out_tensor = py::cast<framework::LoDTensor*>(ret_tuple[i]);
      PADDLE_ENFORCE_NOT_NULL(py_out_tensor,
                              "Output tensor %d should not be nullptr", i);
      auto* var = new framework::Variable();
      auto* tensor = var->GetMutable<framework::LoDTensor>();
      tensor->ShareDataWith(*py_out_tensor);
      tensor->set_lod(py_out_tensor->lod());
      outs.push_back(var);
    } catch (py::cast_error&) {
      PADDLE_THROW("The %d-th output must be a LoDTensor", i);
    }
  }
  return outs;
}

}  // namespace imperative
}  // namespace paddle