cache.h 10.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
16

17 18
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <numeric>
#include <unordered_map>
#include <utility>
#include <vector>
22

23 24 25 26
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"

H
hong 已提交
27 28
// Declares the int32 flag FLAGS_search_cache_max_number (presumably a
// gflags-style declaration; the definition is not in this header).
// CudnnAlgorithmsCacheMap::Set() compares the number of cached entries
// against this value and empties the cache once it is exceeded.
DECLARE_int32(search_cache_max_number);

29 30 31 32 33 34 35 36
// Terminates the variadic HashCombine recursion once no values are left; the
// seed is not modified.
inline void HashCombine(std::size_t* seed) {}

// Folds the std::hash of `v`, and then of every value in `rest` (left to
// right), into `*seed`.
//
// Each step applies the widely used hash_combine mix, see
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
// and then multiplies the seed by 0x100000001B3 (this constant matches the
// 64-bit FNV prime).
//
// The trailing values are taken by const reference: taking them by value
// would copy every remaining argument (whole vectors included) again at each
// level of the recursion.
template <typename T, typename... Rest>
inline void HashCombine(std::size_t* seed, const T& v, const Rest&... rest) {
  std::hash<T> hasher;
  *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
  *seed *= 0x00000100000001B3;
  HashCombine(seed, rest...);
}

// Custom specializations of std::hash can be injected into namespace std.
// ref: https://en.cppreference.com/w/cpp/utility/hash
namespace std {
// Hash for std::vector<T>: folds every element, in order, into a single seed
// with HashCombine, so T itself must be hashable through std::hash<T>.
// An empty vector hashes to the initial seed.
template <typename T>
struct hash<std::vector<T>> {
  std::size_t operator()(std::vector<T> const& vec) const noexcept {
    // Initial seed; the value matches the 64-bit FNV offset basis.
    std::size_t seed = 0xcbf29ce484222325;
    // Iterate by const reference so elements (e.g. nested vectors) are not
    // copied just to be hashed.
    for (const auto& val : vec) {
      HashCombine(&seed, val);
    }
    return seed;
  }
};
}  // namespace std

namespace phi {
namespace autotune {

59 60 61 62
// Value type stored in CudnnAlgorithmsCacheMap: an algorithm id together with
// a workspace size and an exhaustive-search flag.
struct ConvAutoTuneResult {
  // Every member has a default initializer, so a default-constructed result
  // never carries indeterminate values.
  ConvAutoTuneResult() = default;
  ConvAutoTuneResult(int64_t a, size_t size, bool search)
      : algo(a), workspace_size(size), exhaustive_search(search) {}

  // Algorithm id. Zero-initialized by default; it was previously left
  // uninitialized by the default constructor.
  int64_t algo = 0;
  // NOTE(review): presumably the workspace size in bytes - confirm at callers.
  size_t workspace_size = 0;
  bool exhaustive_search = false;
};

69 70 71 72 73 74 75
// Builds one hash key from an arbitrary list of values: the arguments are
// folded, in the order given, into a zero-initialized seed by HashCombine and
// the resulting seed is returned.
template <typename... Args>
size_t GetKey(Args&&... args) {
  size_t combined = 0;
  HashCombine(&combined, std::forward<Args>(args)...);
  return combined;
}

H
hong 已提交
76 77
struct ConvCacheKey {
  ConvCacheKey() {}
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
  ConvCacheKey(const std::vector<int64_t>& arg_x_dims,
               const std::vector<int64_t>& arg_w_dims,
               const std::vector<int>& arg_strides,
               const std::vector<int>& arg_paddings,
               const std::vector<int>& arg_dilations,
               phi::DataType arg_dtype,
               int arg_groups,
               int64_t arg_data_layout)
      : x_dims(arg_x_dims),
        w_dims(arg_w_dims),
        strides(arg_strides),
        paddings(arg_paddings),
        dilations(arg_dilations),
        dtype(arg_dtype),
        groups(arg_groups),
        data_layout(arg_data_layout) {}
H
hong 已提交
94
  size_t hash_value() const {
95 96 97 98 99 100 101 102
    return GetKey(x_dims,
                  w_dims,
                  strides,
                  paddings,
                  dilations,
                  static_cast<int64_t>(dtype),
                  groups,
                  data_layout);
H
hong 已提交
103
  }
104 105 106 107 108 109 110 111 112

  std::vector<int64_t> x_dims;
  std::vector<int64_t> w_dims;
  std::vector<int> strides;
  std::vector<int> paddings;
  std::vector<int> dilations;
  phi::DataType dtype;
  int groups;
  int64_t data_layout;
H
hong 已提交
113 114 115 116 117 118 119 120 121 122 123
};

struct ConvCacheKeyHash {
  size_t operator()(const ConvCacheKey& cache) const {
    return cache.hash_value();
  }
};

struct ConvCacheKeyEqual {
  size_t operator()(const ConvCacheKey& first,
                    const ConvCacheKey& second) const {
124 125 126 127 128 129 130 131
    if (first.x_dims != second.x_dims) return false;
    if (first.w_dims != second.w_dims) return false;
    if (first.strides != second.strides) return false;
    if (first.paddings != second.paddings) return false;
    if (first.dilations != second.dilations) return false;
    if (first.dtype != second.dtype) return false;
    if (first.groups != second.groups) return false;
    if (first.data_layout != second.data_layout) return false;
H
hong 已提交
132 133 134 135 136 137 138 139 140

    return true;
  }
};

class CudnnAlgorithmsCacheMap {
 public:
  CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); }

141
  ConvAutoTuneResult Get(const ConvCacheKey& key) {
H
hong 已提交
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
    std::lock_guard<std::mutex> lock(*cache_mutex_);
    PADDLE_ENFORCE_NE(
        hash_.find(key),
        hash_.end(),
        phi::errors::PreconditionNotMet("The key does not exist."));
    return hash_[key];
  }

  bool Find(const ConvCacheKey& key) {
    bool ret = false;
    std::lock_guard<std::mutex> lock(*cache_mutex_);
    if (hash_.find(key) != hash_.end()) {
      cache_hits_++;
      ret = true;
    } else {
      cache_misses_++;
    }
    return ret;
  }

  void Clean() {
    std::lock_guard<std::mutex> lock(*cache_mutex_);
    hash_.clear();
    cache_hits_ = 0;
    cache_misses_ = 0;
  }

169
  void Set(const ConvCacheKey& key, ConvAutoTuneResult algo) {
H
hong 已提交
170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
    std::lock_guard<std::mutex> lock(*cache_mutex_);
    if (hash_.size() > static_cast<size_t>(FLAGS_search_cache_max_number)) {
      hash_.clear();
    }
    hash_[key] = algo;
  }

  int64_t CacheMisses() const { return cache_misses_; }

  int64_t CacheHits() const { return cache_hits_; }

  float CacheHitRate() const {
    int64_t num_accesses = cache_hits_ + cache_misses_;
    float cache_hit_rate = 0.;
    if (num_accesses != 0) {
      cache_hit_rate =
          static_cast<float>(cache_hits_) / static_cast<float>(num_accesses);
    }
    return cache_hit_rate;
  }

  int64_t Size() const { return hash_.size(); }

 private:
194 195 196 197
  std::unordered_map<ConvCacheKey,
                     ConvAutoTuneResult,
                     ConvCacheKeyHash,
                     ConvCacheKeyEqual>
H
hong 已提交
198 199 200 201 202 203
      hash_;
  std::shared_ptr<std::mutex> cache_mutex_;

  int64_t cache_hits_{0};
  int64_t cache_misses_{0};
};
204

205 206 207 208
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably returns a hash key built from the input dims, the
// permutation and the data type, for use with the size_t-keyed
// AlgorithmsCache - confirm against the definition.
size_t TransposeKey(const std::vector<int64_t>& x_dims,
                    const std::vector<int32_t>& perm,
                    phi::DataType dtype);

209 210 211
template <typename AlgorithmT>
class AlgorithmsCache {
 public:
212
  AlgorithmsCache() : cache_mutex_(new std::mutex()) { hash_.clear(); }
213

H
hong 已提交
214
  AlgorithmT Get(const size_t& key) {
215
    std::lock_guard<std::mutex> lock(*cache_mutex_);
216 217 218 219 220 221 222
    PADDLE_ENFORCE_NE(
        hash_.find(key),
        hash_.end(),
        phi::errors::PreconditionNotMet("The key does not exist."));
    return hash_[key];
  }

H
hong 已提交
223
  bool Find(const size_t& key) {
224
    bool ret = false;
225
    std::lock_guard<std::mutex> lock(*cache_mutex_);
226 227 228 229 230 231 232 233 234
    if (hash_.find(key) != hash_.end()) {
      cache_hits_++;
      ret = true;
    } else {
      cache_misses_++;
    }
    return ret;
  }

235 236 237 238 239 240 241
  void Clean() {
    std::lock_guard<std::mutex> lock(*cache_mutex_);
    hash_.clear();
    cache_hits_ = 0;
    cache_misses_ = 0;
  }

H
hong 已提交
242
  void Set(const size_t& key, AlgorithmT algo) {
243
    std::lock_guard<std::mutex> lock(*cache_mutex_);
244 245 246
    hash_[key] = algo;
  }

247 248 249 250
  int64_t CacheMisses() const { return cache_misses_; }

  int64_t CacheHits() const { return cache_hits_; }

251 252
  float CacheHitRate() const {
    int64_t num_accesses = cache_hits_ + cache_misses_;
253 254 255 256 257
    float cache_hit_rate = 0.;
    if (num_accesses != 0) {
      cache_hit_rate =
          static_cast<float>(cache_hits_) / static_cast<float>(num_accesses);
    }
258 259 260
    return cache_hit_rate;
  }

261
  int64_t Size() const { return hash_.size(); }
262 263 264

 private:
  std::unordered_map<size_t, AlgorithmT> hash_;
265
  std::shared_ptr<std::mutex> cache_mutex_;
266 267 268 269 270 271 272 273 274

  int64_t cache_hits_{0};
  int64_t cache_misses_{0};
};

// Kinds of algorithm for which AutoTuneCache keeps a cache.
// The enumerators are numbered consecutively starting at 1. kAlgorithmCount
// is one past the last real kind and serves as the exclusive upper bound of
// the registration loop in the AutoTuneCache constructor, so new kinds must
// be added immediately before it.
enum class AlgorithmType {
  kConvForward = 1,
  kConvBackwardData,    // = 2
  kConvBackwardFilter,  // = 3
  kTranspose,           // = 4
  kAlgorithmCount       // = 5
};

279
// AlgorithmsConfigKey -> AlgorithmsID
H
hong 已提交
280
// (todo. hong) use cudnnConvolutionFwdAlgo_t
281 282 283
using AlgorithmsCacheMap = AlgorithmsCache<int64_t>;
// AlgorithmType -> AlgorithmsCache
using AlgorithmsTypeMap = std::unordered_map<int64_t, AlgorithmsCacheMap>;
H
hong 已提交
284 285
using CudnnAlgorithmsTypeMap =
    std::unordered_map<int64_t, CudnnAlgorithmsCacheMap>;
286 287 288 289 290 291 292 293

class AutoTuneCache {
 public:
  static AutoTuneCache& Instance() {
    static AutoTuneCache autotune_cache;
    return autotune_cache;
  }

294 295
  AlgorithmsCacheMap& Get(const AlgorithmType& algo_type) {
    return auto_tune_map_[static_cast<int64_t>(algo_type)];
296 297
  }

298 299
  CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) {
    return cudnn_auto_tune_map_[static_cast<int64_t>(algo_type)];
300 301
  }

302 303
  AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); }

304
  void Clean() {
305
    for (auto& v : auto_tune_map_) {
306
      v.second.Clean();
307
    }
H
hong 已提交
308 309 310 311

    for (auto& v : cudnn_auto_tune_map_) {
      v.second.Clean();
    }
312 313
  }

314 315
  void UpdateStatus();

316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
  // The number of total config cached
  int64_t Size() const { return total_size_; }

  int64_t CacheHits() const { return total_cache_hits_; }

  int64_t CacheMisses() const { return total_cache_misses_; }

  float CacheHitRate() const {
    float total_cache_hit_rate = 0.;
    int64_t total_num_accesses = total_cache_hits_ + total_cache_misses_;
    if (total_num_accesses != 0) {
      total_cache_hit_rate = static_cast<float>(total_cache_hits_) /
                             static_cast<float>(total_num_accesses);
    }
    return total_cache_hit_rate;
331 332 333
  }

 private:
334 335 336 337 338 339 340 341
  AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) {
    for (int i = 1; i < static_cast<int>(AlgorithmType::kAlgorithmCount); ++i) {
      Register(static_cast<AlgorithmType>(i));
    }
  }

  void Register(const AlgorithmType& algo_type) {
    std::lock_guard<std::mutex> lock(*autotune_cache_mutex_);
H
hong 已提交
342 343 344 345 346 347 348 349 350 351 352 353 354 355
    if (algo_type == AlgorithmType::kConvForward ||
        algo_type == AlgorithmType::kConvBackwardData ||
        algo_type == AlgorithmType::kConvBackwardFilter) {
      int64_t key = static_cast<int64_t>(algo_type);
      if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
        CudnnAlgorithmsCacheMap cache;
        cudnn_auto_tune_map_[key] = cache;
      }
    } else {
      int64_t key = static_cast<int64_t>(algo_type);
      if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
        AlgorithmsCacheMap cache;
        auto_tune_map_[key] = cache;
      }
356 357 358
    }
  }

359
  AlgorithmsTypeMap auto_tune_map_;
H
hong 已提交
360
  CudnnAlgorithmsTypeMap cudnn_auto_tune_map_;
361
  std::shared_ptr<std::mutex> autotune_cache_mutex_;
362 363 364
  int64_t total_cache_hits_{0};
  int64_t total_cache_misses_{0};
  int64_t total_size_{0};
365 366
};

367 368
}  // namespace autotune
}  // namespace phi