// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/eager_layout_transformer.h"
#include "paddle/fluid/imperative/layout_autotune.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
namespace egr {
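// Returns true if any tensor in tensors_vector has a layout different from
// `layout`, i.e. at least one input would need a layout transformation.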
inline bool NeedTransLayout(
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector,
    const paddle::experimental::DataLayout& layout) {
  for (size_t i = 0; i < tensors_vector.size(); i++) {
    for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) {
      if (layout != tensors_vector[i][idx].layout()) {
        return true;
      }
    }
  }
  return false;
}
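// Default transformer, used when layout autotune has not started or no
// specialized handling applies; it is keyed to the layout of the first
// input tensor.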
inline std::shared_ptr<EagerLayoutTransformer> BaseTransformer(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector) {
  std::shared_ptr<EagerLayoutTransformer> transposer = nullptr;
  bool unstart =
      (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() ==
       paddle::experimental::DataLayout::UNDEFINED);
  auto first_layout = tensors_vector[0][0].layout();
  VLOG(3) << "Layout autotune started? " << (!unstart) << ", " << op_name
          << "'s layout is " << first_layout;

  transposer = std::make_shared<EagerLayoutTransformer>(
      op_name, tensors_vector, first_layout);
  return transposer;
}

// For layout-agnostic ops such as add, relu, exp
inline std::shared_ptr<EagerLayoutTransformer> EagerLayoutAutotune(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector) {
  auto desired_layout =
      paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout();
  auto default_layout =
      paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout();
  auto first_layout = tensors_vector[0][0].layout();
  if (NeedTransLayout(tensors_vector, first_layout)) {
    bool need_trans_back = false;
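    // Any input whose rank is not 4 cannot follow the desired NCHW/NHWC
    // layout, so the whole op falls back to the default layout.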
    for (size_t i = 0; i < tensors_vector.size(); i++) {
      for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) {
        if (4 != tensors_vector[i][idx].shape().size()) {
          need_trans_back = true;
          VLOG(3) << "Agnostic op " << op_name << " rank is "
                  << tensors_vector[i][idx].shape().size() << " and layout is "
                  << tensors_vector[i][idx].layout();
        }
      }
    }
    auto final_layout = need_trans_back ? default_layout : desired_layout;
    return std::make_shared<EagerLayoutTransformer>(
        op_name, tensors_vector, final_layout);
  }
  return BaseTransformer(op_name, tensors_vector);
}
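
// A minimal call-site sketch (hedged: "relu" and the transformer calls below
// are assumptions about the interface declared in eager_layout_transformer.h,
// not definitions in this header). A generated dygraph forward could route
// its inputs through the returned transformer before dispatching:
//
//   paddle::small_vector<std::vector<paddle::experimental::Tensor>,
//                        kSlotSmallVectorSize> ins = {{x}};
//   auto transformer = egr::EagerLayoutAutotune("relu", ins);
//   auto new_x = transformer->TransInTensor("X", x);  // assumed interface
//   // ... run the kernel on new_x to produce `out` ...
//   transformer->SetOutTensorLayout(&out);            // assumed interface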

// For lightly layout-sensitive ops such as reduce
template <typename T>
inline std::shared_ptr<EagerLayoutTransformer> EagerLayoutAutotune(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector,
    T* attr) {
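  // `attr` only participates in overload resolution; its value does not
  // affect which transformer is chosen.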
  VLOG(3) << "Lightly op " << op_name << "'s input rank is "
          << tensors_vector[0][0].shape().size() << " and layout is "
          << tensors_vector[0][0].layout();

  return std::make_shared<EagerLightlyLayoutSensitiveOpTransformer>(op_name);
}

// For lightly layout-sensitive ops such as argmax
template <typename T1, typename T2>
inline std::shared_ptr<EagerLayoutTransformer> EagerLayoutAutotune(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector,
    T1* axis,
    T2* keep_dim) {
  VLOG(3) << "Lightly op " << op_name << "'s input rank is "
          << tensors_vector[0][0].shape().size() << " and layout is "
          << tensors_vector[0][0].layout();

  return EagerLayoutAutotune<T1>(op_name, tensors_vector, axis);
}

// For heavily layout-sensitive ops with a string data_format / data_layout
// attribute
template <>
inline std::shared_ptr<EagerLayoutTransformer> EagerLayoutAutotune(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector,
    std::string* attr) {
  auto first_layout = tensors_vector[0][0].layout();
  auto transposer = std::make_shared<EagerLayoutTransformer>(
      op_name, tensors_vector, first_layout);
  if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() ==
      paddle::experimental::DataLayout::UNDEFINED) {
    // Layout autotune only supports models with convolutional layers.
    VLOG(3) << "Optimize Layout was not started, op: " << op_name;
    if (op_name != "conv2d") {
      return transposer;
    } else {
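      // The first conv2d decides the desired layout: an FP32 model running
      // with NHWC data_format is tuned towards NCHW, an FP16 model running
      // with NCHW is tuned towards NHWC, and any other combination disables
      // layout autotune.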
      auto data_type = tensors_vector[0][0].dtype();
      bool is_tune_fp32 =
          (data_type == paddle::experimental::DataType::FLOAT32) &&
          (*attr == "NHWC");
      bool is_tune_fp16 =
          (data_type == paddle::experimental::DataType::FLOAT16) &&
          (*attr == "NCHW");
      VLOG(3) << "Conv2d_dy's dtype " << data_type << ", format " << (*attr);
      if (is_tune_fp32) {
        paddle::imperative::LayoutAutoTune::Instance().SetDesiredLayout(
            paddle::experimental::DataLayout::NCHW);

        paddle::imperative::LayoutAutoTune::Instance().SetDefaultLayout(
            paddle::experimental::DataLayout::NHWC);
      } else if (is_tune_fp16) {
        paddle::imperative::LayoutAutoTune::Instance().SetDesiredLayout(
            paddle::experimental::DataLayout::NHWC);
        paddle::imperative::LayoutAutoTune::Instance().SetDefaultLayout(
            paddle::experimental::DataLayout::NCHW);
      } else {
        egr::Controller::Instance().DisableLayoutAutoTune();
        return transposer;
      }
      VLOG(3)
          << "Tune the layout from " << *attr << " to "
          << paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout();
    }
  }

  if (paddle::imperative::LayoutAutoTune::Instance().IsHeavilyLayoutSensitive(
          op_name)) {
    VLOG(3)
        << op_name
        << "'s LayoutTransformer is EagerHeavilyLayoutSensitiveOpTransformer";
    auto heavily_transposer =
        std::make_shared<EagerHeavilyLayoutSensitiveOpTransformer>(op_name,
                                                                   attr);
    return heavily_transposer;
  }

  VLOG(3) << op_name << "'s LayoutTransformer is unimplemented. Use default.";
  return transposer;
}

// For lightly layout-sensitive ops: transpose
template <>
inline std::shared_ptr<EagerLayoutTransformer> EagerLayoutAutotune(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector,
    std::vector<int>* attr) {
  auto first_layout = tensors_vector[0][0].layout();
  std::shared_ptr<EagerLayoutTransformer> transposer = nullptr;
  if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() ==
      paddle::experimental::DataLayout::UNDEFINED) {
    VLOG(3) << "Optimize Layout was not started, op: " << op_name;
    transposer = std::make_shared<EagerLayoutTransformer>(
        op_name, tensors_vector, first_layout);
    return transposer;
  }
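  // A transpose2 whose input is already in the desired layout only needs its
  // axis attribute remapped for that layout.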
  if (op_name == "transpose2" &&
      (tensors_vector[0][0].layout() ==
       paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout())) {
    auto trans = std::make_shared<EagerTransposeOpTransformer>(op_name);
    trans->SetAttr(attr,
                   tensors_vector[0][0].layout() ==
                       paddle::experimental::DataLayout::NHWC);
    return trans;
  }
  transposer =
      std::make_shared<EagerLightlyLayoutSensitiveOpTransformer>(op_name);
  return transposer;
}

// For lightly layout-sensitive ops: argmax (Scalar axis, bool keepdims)
template <>
inline std::shared_ptr<EagerLayoutTransformer>
EagerLayoutAutotune<paddle::experimental::Scalar, bool>(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector,
    paddle::experimental::Scalar* axis,
    bool* keep_dim) {
  auto first_layout = tensors_vector[0][0].layout();
  std::shared_ptr<EagerLayoutTransformer> transposer = nullptr;
  if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() ==
      paddle::experimental::DataLayout::UNDEFINED) {
    VLOG(3) << "Optimize Layout was not started, op: " << op_name;
    transposer = std::make_shared<EagerLayoutTransformer>(
        op_name, tensors_vector, first_layout);
    return transposer;
  }
  auto desired_layout =
      paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout();
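  // argmax with keepdims on an input already in the desired layout only needs
  // its axis remapped; any other case falls back to the lightly
  // layout-sensitive transformer.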
  if (op_name == "argmax" &&
      (tensors_vector[0][0].layout() == desired_layout) && (*keep_dim)) {
    auto argmax_transform =
        std::make_shared<EagerArgmaxOpTransformer>(op_name);
    argmax_transform->SetAttr(axis,
                              tensors_vector[0][0].layout() ==
                                  paddle::experimental::DataLayout::NHWC);
    return argmax_transform;
  }
  transposer =
      std::make_shared<EagerLightlyLayoutSensitiveOpTransformer>(op_name);
  return transposer;
}

// For lightly layout-sensitive ops: flatten
template <>
inline std::shared_ptr<EagerLayoutTransformer> EagerLayoutAutotune<int, int>(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector,
    int* start_axis,
    int* stop_axis) {
  auto first_layout = tensors_vector[0][0].layout();
  std::shared_ptr<EagerLayoutTransformer> transposer = nullptr;
  auto desired_layout =
      paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout();
  if (desired_layout == paddle::experimental::DataLayout::UNDEFINED) {
    VLOG(3) << "Optimize Layout was not started, op: " << op_name;
    transposer = std::make_shared<EagerLayoutTransformer>(
        op_name, tensors_vector, first_layout);
    return transposer;
  }
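  // flatten keeps the desired layout only when it collapses all non-batch
  // dims (start_axis == 1, stop_axis == 3) of an input that already has that
  // layout; otherwise fall back to the lightly layout-sensitive transformer.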
  bool no_transpose = tensors_vector[0][0].layout() == desired_layout;
  bool is_valid = ((*start_axis) == 1 && (*stop_axis) == 3);
  if (op_name == "flatten" || op_name == "flatten_contiguous_range") {
    if (no_transpose && is_valid) {
      return std::make_shared<EagerFlattenOpTransformer>(op_name);
    }
  }

  transposer =
      std::make_shared<EagerLightlyLayoutSensitiveOpTransformer>(op_name);
  return transposer;
}

// For lightly layout-sensitive ops: concat (Scalar axis)
template <>
inline std::shared_ptr<EagerLayoutTransformer>
EagerLayoutAutotune<paddle::experimental::Scalar>(
    const std::string& op_name,
    const paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                               kSlotSmallVectorSize>& tensors_vector,
    paddle::experimental::Scalar* axis) {
  auto desired_layout =
      paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout();
  auto first_layout = tensors_vector[0][0].layout();
  std::shared_ptr<EagerLayoutTransformer> transposer = nullptr;
  if (desired_layout == paddle::experimental::DataLayout::UNDEFINED) {
    VLOG(3) << "Optimize Layout was not started, op: " << op_name;
    transposer = std::make_shared<EagerLayoutTransformer>(
        op_name, tensors_vector, first_layout);
    return transposer;
  }

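  // concat can keep the desired layout only if every input already has it;
  // with mixed layouts, fall back to the lightly layout-sensitive
  // transformer, otherwise just remap the concat axis.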
  if (NeedTransLayout(tensors_vector, desired_layout)) {
    VLOG(3) << op_name << " needs to transpose to the default layout";
    transposer =
        std::make_shared<EagerLightlyLayoutSensitiveOpTransformer>(op_name);
    return transposer;
  } else {
    auto trans = std::make_shared<EagerConcatOpTransformer>(op_name);
    trans->SetAttr(axis, desired_layout);
    return trans;
  }
}

}  // namespace egr