auto_tune_base.h 5.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <atomic>
#include <limits>
#include <memory>
#include <mutex>
#include <type_traits>
#include <vector>

#include "glog/logging.h"
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
21 22 23 24

namespace phi {
namespace autotune {

L
limingshu 已提交
25
// Thin wrapper that stores a plain function pointer of type
// `ReturnType (*)(Args...)` and invokes it through Run().
//
// Template parameters:
//   T          - not referenced inside the class body; it only makes
//                instantiations for different T distinct types.
//   ReturnType - return type of the wrapped function.
//   Args...    - parameter types of the wrapped function.
template <typename T, typename ReturnType, typename... Args>
class KernelCallback {
 public:
  using ReturnT = ReturnType;
  using FuncType = ReturnType (*)(Args...);

  // Creates an empty callback. The stored pointer is null, so Run() must not
  // be called on a default-constructed object.
  KernelCallback() {}

  // Wraps `func_`; Run() will forward its arguments to it.
  explicit KernelCallback(FuncType func_) : func(func_) {}

  virtual ~KernelCallback() {}

  // Calls the wrapped function with `args` and returns its result.
  ReturnType Run(Args... args) { return func(args...); }

 private:
  // Initialised to nullptr so that a default-constructed callback holds a
  // well-defined value instead of an indeterminate pointer.
  FuncType func{nullptr};
};

L
limingshu 已提交
41 42 43 44
// Builds a KernelCallback around the function pointer `cb`. ReturnType and
// Args... are deduced from `cb`; T has to be supplied explicitly.
template <typename T, typename ReturnType, typename... Args>
static KernelCallback<T, ReturnType, Args...> MakeCallback(
    ReturnType (*cb)(Args...)) {
  using CallbackT = KernelCallback<T, ReturnType, Args...>;
  return CallbackT(cb);
}

47
template <typename T, typename KernelType>
48 49 50 51
class AutoTuneBase {
 public:
  AutoTuneBase() {}
  virtual ~AutoTuneBase() {}
52

53 54
  explicit AutoTuneBase(KernelType kernel) {
    kernels_.push_back(/*default=*/kernel);
55 56
  }

L
limingshu 已提交
57 58
  template <typename ReturnType, typename... Args>
  void AddCallBack(ReturnType (*func)(Args...)) {
59 60
    if (!is_init_) {
      std::lock_guard<std::mutex> lock(mutex_);
L
limingshu 已提交
61
      kernels_.push_back(MakeCallback<T>(func));
62
    }
63 64
  }

65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
  template <typename Context, typename... Args>
  void Run(const Context& ctx,
           const AlgorithmType& algo,
           const size_t key,
           Args&&... args) {
    PADDLE_ENFORCE_GT(
        kernels_.size(),
        0,
        paddle::platform::errors::InvalidArgument(
            "kernel num must be greater than 0, now is %d", kernels_.size()));
    is_init_ = true;

    auto& cache = AutoTuneCache::Instance().Get(algo);
    if (cache.Find(key)) {
      auto best_idx = cache.Get(key);
      kernels_[best_idx].Run(args...);
    } else {
      bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
      if (use_autotune) {
        // All avaliable kernels have ran while picking the best kernel,
        // so there may be no need for another kernel run.
        auto best_idx = PickBestKernel(ctx, args...);
        cache.Set(key, best_idx);
      } else {
        kernels_[0].Run(args...);
      }
    }
92 93
  }

94 95 96 97 98
 private:
  bool is_init_{false};
  std::vector<KernelType> kernels_;
  mutable std::mutex mutex_;

99
  template <typename Context, typename... Args>
100 101
  size_t PickBestKernel(const Context& ctx, Args&&... args) {
    std::lock_guard<std::mutex> lock(mutex_);
102 103 104 105 106
    PADDLE_ENFORCE_GT(
        kernels_.size(),
        0,
        paddle::platform::errors::InvalidArgument(
            "kernel num must be greater than 0, now is %d", kernels_.size()));
107
    size_t best_idx = 0;
108 109
    float min_time = std::numeric_limits<float>::max();

110
    // Time cost test estabulished in default stream.
111
    for (int i = 0; i < kernels_.size(); ++i) {
112
      auto time = RunAndMeasureKernel<Context>(ctx, i, args...);
113 114
      if (time < min_time) {
        min_time = time;
115
        best_idx = i;
116 117
      }
    }
118 119
    VLOG(3) << "best kernel idx is " << best_idx;
    return best_idx;
120 121
  }

122 123
  template <typename Context, typename... Args>
  float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
124 125
    // Regard 1st run as warmup, judge the compare result by the time cost
    // of rest cycles.
126
    constexpr int repeats = 4;
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
    phi::GpuTimer timer;
    float time_cost = 0;
    const auto& stream = ctx.stream();

    ctx.Wait();
    for (int i = 0; i < repeats; ++i) {
      timer.Start(stream);
      kernels_[idx].Run(args...);
      timer.Stop(stream);
      auto time = timer.ElapsedTime();
      if (i > 0) {
        time_cost += time;
      }
      VLOG(3) << "kernel[" << idx << "][" << i << "th time cost is " << time;
    }
    return time_cost;
  }
144 145
};

L
limingshu 已提交
146 147 148
// Creates an AutoTuneBase whose default (index 0) kernel wraps `func`.
template <typename T, typename ReturnType, typename... Args>
static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> MakeAutoTuner(
    ReturnType (*func)(Args...)) {
  using CallbackT = KernelCallback<T, ReturnType, Args...>;
  const CallbackT default_kernel = MakeCallback<T>(func);
  return AutoTuneBase<T, CallbackT>(default_kernel);
}

L
limingshu 已提交
153 154 155
// Provides one lazily created AutoTuneBase instance per
// <T, ReturnType, Args...> instantiation.
template <typename T, typename ReturnType, typename... Args>
class TransposeAutoTuner
    : public AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> {
 public:
  // Returns the shared tuner. It is created on the first call with `func` as
  // its default kernel; `func` is ignored on every later call because the
  // construction runs only once (std::call_once).
  static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>>* Instance(
      ReturnType (*func)(Args...)) {
    using CallbackT = KernelCallback<T, ReturnType, Args...>;
    using TunerT = AutoTuneBase<T, CallbackT>;

    static std::once_flag transpose_init_flag_;
    static std::unique_ptr<TunerT> instance_;

    std::call_once(transpose_init_flag_, [&] {
      instance_.reset(new TunerT(MakeCallback<T>(func)));
    });
    return instance_.get();
  }
};

L
limingshu 已提交
171 172 173 174
// Convenience accessor: returns the shared tuner held by
// TransposeAutoTuner<T, ReturnType, Args...>, passing `func` on to Instance().
template <typename T, typename ReturnType, typename... Args>
static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>>*
MakeTransposeTuner(ReturnType (*func)(Args...)) {
  using TunerT = TransposeAutoTuner<T, ReturnType, Args...>;
  return TunerT::Instance(func);
}

}  // namespace autotune
}  // namespace phi
新手
引导
客服 返回
顶部