// reference_count_op_handle.h
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <atomic>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace framework {
namespace details {

// Plain (non-atomic) int counters keyed by string. Presumably the key is a
// variable name, as with AtomicReferenceCountMap -- confirm against callers.
using ReferenceCountMap = std::unordered_map<std::string, int>;
// Atomic int counters keyed by string. ReferenceCountOpHandle looks entries
// up by variable name and decrements them with fetch_sub.
using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<int>>;
// Owning map from an int key to a ReferenceCountMap.
// NOTE(review): the key looks like a device id -- confirm against callers.
using DeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
// Owning map from an int key to an AtomicReferenceCountMap.
// NOTE(review): the key looks like a device id -- confirm against callers.
using AtomicDeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
// Owning map from an int key to a tensor garbage collector.
// NOTE(review): the key looks like a device id -- confirm against callers.
using DeviceGarbageCollectorMap =
    std::unordered_map<int,
                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;

class ReferenceCountOpHandle : public OpHandleBase {
 public:
  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
                         const platform::CUDAPlace &place,
                         const std::vector<std::string> &var_names,
                         GarbageCollector<Tensor> *gc,
                         AtomicReferenceCountMap *ref_cnts)
S
sneaxiy 已提交
50
      : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
S
sneaxiy 已提交
51 52 53 54 55 56
    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    if (IsStreamGarabageCollector()) {
      PADDLE_ENFORCE(cudaSetDevice(place.device));
      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
    }
S
sneaxiy 已提交
57 58

    for (auto &name : var_names) AddVar(name);
S
sneaxiy 已提交
59 60 61 62 63 64 65 66 67 68 69 70
  }

  ~ReferenceCountOpHandle() {
    if (IsStreamGarabageCollector()) {
      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
      PADDLE_ENFORCE(cudaEventDestroy(event_));
    }
  }

  std::string Name() const override { return "reference_count"; }

S
sneaxiy 已提交
71 72 73 74 75 76 77 78
  void AddVar(const std::string &name) {
    auto it = var_names_.find(name);
    if (it != var_names_.end())
      ++(it->second);
    else
      var_names_[name] = 1;
  }

S
sneaxiy 已提交
79
 protected:
S
sneaxiy 已提交
80
  void RunImpl() override {
S
sneaxiy 已提交
81
    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
S
sneaxiy 已提交
82 83 84
    std::vector<Tensor *> tensors;
    for (auto &pair : var_names_) {
      auto &name = pair.first;
S
sneaxiy 已提交
85 86 87
      auto it = ref_cnts_->find(name);
      if (it == ref_cnts_->end()) continue;

S
sneaxiy 已提交
88
      auto *var = exec_scope->FindVar(name);
S
sneaxiy 已提交
89 90 91 92 93 94 95 96 97 98 99
      if (var == nullptr) continue;

      if (var->IsType<LoDTensor>()) {
        if (it->second.fetch_sub(pair.second) <= pair.second) {
          tensors.emplace_back(var->GetMutable<LoDTensor>());
        }
      } else if (var->IsType<SelectedRows>()) {
        if (it->second.fetch_sub(pair.second) <= pair.second) {
          tensors.emplace_back(
              var->GetMutable<SelectedRows>()->mutable_value());
        }
S
sneaxiy 已提交
100 101 102 103 104 105 106 107 108
      }
    }

    if (!tensors.empty()) {
      ClearTensors(tensors);
    }
  }

 private:
S
sneaxiy 已提交
109
  void ClearTensors(const std::vector<Tensor *> &tensors) {
S
sneaxiy 已提交
110
    auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
S
sneaxiy 已提交
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
    if (gc != nullptr) {
      auto compute_stream = dev_ctx_->stream();
      auto callback_stream = gc->stream();
      auto callback_func = [=]() {
        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
      };
      gc_->Add(tensors, callback_func);
    } else {
      gc_->Add(tensors);
    }
  }

  bool IsStreamGarabageCollector() const {
    return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
  }

  const Scope *scope_;
  platform::CUDADeviceContext *dev_ctx_;
S
sneaxiy 已提交
130
  std::unordered_map<std::string, int> var_names_;
S
sneaxiy 已提交
131 132 133 134 135 136 137 138
  GarbageCollector<Tensor> *gc_;       // not own
  AtomicReferenceCountMap *ref_cnts_;  // not own
  cudaEvent_t event_;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle