auto_tune_base.h 6.0 KB
Newer Older
L
limingshu 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <limits>
#include <memory>
#include <mutex>
#include <type_traits>
#include <utility>
#include <vector>

#include "glog/logging.h"
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
L
limingshu 已提交
21 22 23 24

namespace phi {
namespace autotune {

L
limingshu 已提交
25
template <typename T, typename ReturnType, typename... Args>
L
limingshu 已提交
26 27
class KernelCallback {
 public:
L
limingshu 已提交
28 29
  using ReturnT = ReturnType;
  using FuncType = ReturnType (*)(Args...);
L
limingshu 已提交
30 31 32 33 34

  KernelCallback() {}
  explicit KernelCallback(FuncType func_) : func(func_) {}
  virtual ~KernelCallback() {}

L
limingshu 已提交
35
  ReturnType Run(Args... args) { return func(args...); }
L
limingshu 已提交
36 37 38 39 40

 private:
  FuncType func;
};

L
limingshu 已提交
41 42 43 44
// Builds a KernelCallback around the function pointer `cb`. ReturnType and
// Args are deduced from `cb`, so callers only spell out the tag type, e.g.
// MakeCallback<T>(func).
template <typename T, typename ReturnType, typename... Args>
static KernelCallback<T, ReturnType, Args...> MakeCallback(
    ReturnType (*cb)(Args...)) {
  using CallbackType = KernelCallback<T, ReturnType, Args...>;
  return CallbackType(cb);
}

47
template <typename T, typename KernelType>
L
limingshu 已提交
48 49 50 51
class AutoTuneBase {
 public:
  AutoTuneBase() {}
  virtual ~AutoTuneBase() {}
52

53 54
  explicit AutoTuneBase(KernelType kernel) {
    kernels_.push_back(/*default=*/kernel);
L
limingshu 已提交
55 56
  }

L
limingshu 已提交
57 58
  template <typename ReturnType, typename... Args>
  void AddCallBack(ReturnType (*func)(Args...)) {
59 60
    if (!is_init_) {
      std::lock_guard<std::mutex> lock(mutex_);
L
limingshu 已提交
61
      kernels_.push_back(MakeCallback<T>(func));
62
    }
63 64
  }

65 66 67 68 69 70
  template <typename Context, typename... Args>
  void Run(const Context& ctx,
           const AlgorithmType& algo,
           const size_t key,
           Args&&... args) {
    is_init_ = true;
71
    CheckKernelSize();
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
    auto& cache = AutoTuneCache::Instance().Get(algo);
    if (cache.Find(key)) {
      auto best_idx = cache.Get(key);
      kernels_[best_idx].Run(args...);
    } else {
      bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
      if (use_autotune) {
        // All avaliable kernels have ran while picking the best kernel,
        // so there may be no need for another kernel run.
        auto best_idx = PickBestKernel(ctx, args...);
        cache.Set(key, best_idx);
      } else {
        kernels_[0].Run(args...);
      }
    }
L
limingshu 已提交
87 88
  }

89
 protected:
90 91 92 93
  bool is_init_{false};
  std::vector<KernelType> kernels_;
  mutable std::mutex mutex_;

94
  void CheckKernelSize() {
L
limingshu 已提交
95 96 97
    PADDLE_ENFORCE_GT(
        kernels_.size(),
        0,
98
        phi::errors::InvalidArgument(
L
limingshu 已提交
99
            "kernel num must be greater than 0, now is %d", kernels_.size()));
100 101 102 103 104
  }

  template <typename Context, typename... Args>
  size_t PickBestKernel(const Context& ctx, Args&&... args) {
    std::lock_guard<std::mutex> lock(mutex_);
105
    size_t best_idx = 0;
L
limingshu 已提交
106 107
    float min_time = std::numeric_limits<float>::max();

108
    // Time cost test estabulished in default stream.
L
limingshu 已提交
109
    for (int i = 0; i < kernels_.size(); ++i) {
110
      auto time = RunAndMeasureKernel<Context>(ctx, i, args...);
L
limingshu 已提交
111 112
      if (time < min_time) {
        min_time = time;
113
        best_idx = i;
L
limingshu 已提交
114 115
      }
    }
116 117
    VLOG(3) << "best kernel idx is " << best_idx;
    return best_idx;
L
limingshu 已提交
118 119
  }

120 121
  template <typename Context, typename... Args>
  float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
122 123
    // Regard 1st run as warmup, judge the compare result by the time cost
    // of rest cycles.
124
    constexpr int repeats = 6;
125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
    phi::GpuTimer timer;
    float time_cost = 0;
    const auto& stream = ctx.stream();

    ctx.Wait();
    for (int i = 0; i < repeats; ++i) {
      timer.Start(stream);
      kernels_[idx].Run(args...);
      timer.Stop(stream);
      auto time = timer.ElapsedTime();
      if (i > 0) {
        time_cost += time;
      }
      VLOG(3) << "kernel[" << idx << "][" << i << "th time cost is " << time;
    }
    return time_cost;
  }
L
limingshu 已提交
142 143
};

144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
// To init the auto_tuner object.
// Defines the class template name##AutoTuner, derived from AutoTuneBase with
// KernelCallback as its kernel type. Instance(func) returns a singleton per
// template instantiation. The singleton is created on the first call, and
// only the `func` passed to that first call is registered through
// AddCallBack(); later calls return the existing object and ignore `func`.
#define DEFINE_AUTOTUNER_COMMON_OBJ(name)                                \
  template <typename T, typename ReturnType, typename... Args>           \
  class name##AutoTuner                                                  \
      : public AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> { \
   public:                                                               \
    static name##AutoTuner<T, ReturnType, Args...>* Instance(            \
        ReturnType (*func)(Args...)) {                                   \
      static std::once_flag name##_init_flag;                            \
      static std::unique_ptr<name##AutoTuner<T, ReturnType, Args...>>    \
          instance;                                                      \
      std::call_once(name##_init_flag, [&] {                             \
        instance.reset(new name##AutoTuner<T, ReturnType, Args...>);     \
        instance->AddCallBack(func);                                     \
      });                                                                \
      return instance.get();                                             \
    }                                                                    \
  };

// To init the auto_tuner initial function.
// Defines Make##name##Tuner(func), a free-function shortcut that returns
// name##AutoTuner<T, ReturnType, Args...>::Instance(func). ReturnType and
// Args are deduced from `func`; T is given explicitly by the caller.
#define DEFINE_AUTOTUNER_FN(name)                                    \
  template <typename T, typename ReturnType, typename... Args>       \
  static name##AutoTuner<T, ReturnType, Args...>* Make##name##Tuner( \
      ReturnType (*func)(Args...)) {                                 \
    using TunerType = name##AutoTuner<T, ReturnType, Args...>;       \
    return TunerType::Instance(func);                                \
  }

172 173 174 175 176 177 178 179
// Defines both the name##AutoTuner class and its Make##name##Tuner factory.
#define DEFINE_AUTOTUNER(name) \
  DEFINE_AUTOTUNER_COMMON_OBJ(name) DEFINE_AUTOTUNER_FN(name)

DEFINE_AUTOTUNER(Transpose)

// The helper macros are only needed for the instantiations above, so they
// are removed again to keep them out of every file including this header.
#undef DEFINE_AUTOTUNER_COMMON_OBJ
#undef DEFINE_AUTOTUNER_FN
#undef DEFINE_AUTOTUNER
L
limingshu 已提交
180 181 182

}  // namespace autotune
}  // namespace phi