// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <glog/logging.h>
#include <gtest/gtest.h>

#include <bitset>
#include <iostream>

#include "paddle/cinn/auto_schedule/auto_tuner.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/paddle_model_convertor.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
#include "paddle/cinn/utils/data_util.h"
#include "test/cpp/cinn/program_builder.h"

/* This test is used as a tool to evaluate or compare the performance of 3
 * schedules (no schedule, manual schedule, auto-schedule). One can specify
 * which schedules are evaluated through `FLAGS_evaluate_knobs` and which
 * operator or model is tested through `--gtest_filter=PerformanceTester.xx`.
 * For example, `FLAGS_evaluate_knobs=4
 * --gtest_filter=PerformanceTester.Matmul` means it will evaluate
 * auto-schedule on the Matmul operator. Refer to the explanation of the
 * following flags or parameters for more detail.
 */

45 46 47
DEFINE_string(resnet50_model_dir,
              "./ResNet50",
              "the path to paddle model resnet50.");
// Flags that control which schedule tests will be run.
// Bit with index 0 controls the no schedule test, i.e. options = 1 = "001"
// will run the no schedule test. Bit with index 1 controls the manual schedule
// test, i.e. options = 2 = "010" will run the manual schedule test. Bit with
// index 2 controls the auto schedule test, i.e. options = 4 = "100" will run
// the auto schedule test. Bits can be combined, e.g. options = 5 = "101" runs
// both the no schedule and the auto schedule tests.
// The default value is -1, which means this flag does not override the
// options.
// Bitmask selecting the schedules to evaluate; a negative value leaves
// PerformanceTester::Options::evaluate_knobs untouched.
DEFINE_int32(evaluate_knobs,
             -1,
             "the options to control which schedule tests will be run.");

namespace cinn {
namespace auto_schedule {

using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
using ::cinn::hlir::framework::Instruction;
using ::cinn::hlir::framework::Scope;

// Test fixture that builds a frontend program under up to three scheduling
// strategies (no schedule, manual schedule, auto schedule) and optionally
// executes the resulting runtime program. Which strategies are exercised is
// selected by Options::evaluate_knobs, which FLAGS_evaluate_knobs overrides
// when it is non-negative.
class PerformanceTester : public ::testing::Test {
 public:
  struct Options {
    // times of compiled runtime program will be executed repeatedly.
    int repeat_times = 2;
    // the num_tuning_rounds for auto tuning
    int num_tuning_rounds = 2;
    // knobs to control which schedules will be measured, refer to
    // FLAGS_evaluate_knobs explanation
    std::bitset<3> evaluate_knobs = 0UL;
  };

  // Builds `program` with every schedule selected by the knobs and executes
  // each built runtime program `options_.repeat_times` times. When no knob is
  // set, the no-schedule and manual-schedule programs are only built (not
  // executed) so that their build functions are still exercised.
  void Evaluate(const frontend::Program& program) {
    if (FLAGS_evaluate_knobs >= 0) {
      // std::bitset<3> silently drops bits above index 2, so e.g. a value of 8
      // would degrade to "no knob set". Make that visible to the user.
      const int knob_limit = 1 << options_.evaluate_knobs.size();
      if (FLAGS_evaluate_knobs >= knob_limit) {
        LOG(WARNING) << "FLAGS_evaluate_knobs = " << FLAGS_evaluate_knobs
                     << " has bits outside the supported range [0, "
                     << knob_limit - 1 << "]; the extra bits are ignored.";
      }
      options_.evaluate_knobs = FLAGS_evaluate_knobs;
    }
    VLOG(3) << "evaluate_knobs = " << options_.evaluate_knobs;

    // Builds a fresh graph, scope and compiler for one schedule, then builds
    // the runtime program through `build_fn` and runs it when `execute` is
    // true.
    auto worker_fn = [this, &program](const std::string& schedule_name,
                                      BuildRuntimeProgramFn build_fn,
                                      bool execute = true) {
      Context::Global().ResetNameId();
      VLOG(3) << "Initialize graph.";
      auto graph = std::make_shared<hlir::framework::Graph>(program, target_);
      VLOG(3) << "Apply graph pass.";
      hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
      VLOG(3) << "Build " << schedule_name << " program.";
      auto scope = BuildScope(target_, graph);
      auto graph_compiler =
          std::make_unique<GraphCompiler>(target_, scope, graph);
      auto runtime_program =
          (this->*build_fn)(graph.get(), graph_compiler.get());
      if (execute) {
        VLOG(3) << "Execute " << schedule_name << " program.";
        runtime_program->ExecuteTest(options_.repeat_times);
      }
    };

    // if no one is set, build no/manual schedule cases to ensure their build
    // functions are valid
    if (options_.evaluate_knobs.none()) {
      worker_fn("no schedule",
                &PerformanceTester::BuildNoScheduleProgram,
                /* execute */ false);
      worker_fn("manual schedule",
                &PerformanceTester::BuildManualScheduleProgram,
                /* execute */ false);
    } else {
      if (options_.evaluate_knobs.test(0)) {
        worker_fn("no schedule", &PerformanceTester::BuildNoScheduleProgram);
      }
      if (options_.evaluate_knobs.test(1)) {
        worker_fn("manual schedule",
                  &PerformanceTester::BuildManualScheduleProgram);
      }
      if (options_.evaluate_knobs.test(2)) {
        worker_fn("auto schedule",
                  &PerformanceTester::BuildAutoScheduleProgram);
      }
    }
  }

 protected:
  // Common signature of the three Build*Program member functions below.
  using BuildRuntimeProgramFn = std::unique_ptr<hlir::framework::Program> (
      PerformanceTester::*)(Graph*, GraphCompiler*);

  // Lowers every fusion group with both op-level and group-level schedules
  // disabled and builds a runtime program from the resulting lowered
  // functions.
  std::unique_ptr<hlir::framework::Program> BuildNoScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    const auto& dtype_dict =
        graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
            "inferdtype");
    const auto& shape_dict = graph->GetAttrs<
        absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
        "infershape");

    // The lowerer is only used inside this function, so unique ownership is
    // sufficient.
    auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
        dtype_dict, shape_dict, target_);

    GraphCompiler::CompileOptions compile_options;
    compile_options.with_instantiate_variables = true;

    // Make sure there are groups to lower even if no fusion groups exist yet.
    if (graph->fusion_groups.empty()) {
      hlir::framework::ApplyPasses(graph, {"BuildNonFusedGroupsPass"});
    }
    compile_options.groups = graph->fusion_groups;

    for (const auto& group : graph->fusion_groups) {
      compile_options.lowered_funcs.push_back(
          op_lowerer->Lower(group,
                            /*apply_op_schedule = */ false,
                            /*apply_group_schedule=*/false));
    }

    VLOG(3) << "===========================No Schedule LoweredFunc "
               "Begin===========================";
    for (const auto& funcvec : compile_options.lowered_funcs) {
      for (const auto& func : funcvec) {
        VLOG(3) << func;
      }
    }
    VLOG(3) << "===========================No Schedule LoweredFunc "
               "End=============================";

    return graph_compiler->Build(compile_options).runtime_program;
  }

  // Builds the runtime program through GraphCompiler::Build() without custom
  // compile options. `graph` is unused here; it is kept so the signature
  // matches BuildRuntimeProgramFn.
  // NOTE(review): presumably the default build applies the built-in (manual)
  // op schedules -- confirm against GraphCompiler.
  std::unique_ptr<hlir::framework::Program> BuildManualScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    return graph_compiler->Build();
  }

  // Runs the AutoTuner on `graph` for `options_.num_tuning_rounds` rounds,
  // applies the tuning result to the compile options and builds the runtime
  // program from them.
  std::unique_ptr<hlir::framework::Program> BuildAutoScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    auto tuner = std::make_unique<AutoTuner>(target_, graph);

    AutoTuner::Config tuning_config;
    TuningOptions tuning_options;
    tuning_options.num_tuning_rounds = options_.num_tuning_rounds;
    tuning_options.num_measure_trials = 2;
    tuning_options.num_samples_per_iteration = 2;

    tuner->Initialize(tuning_config, graph_compiler);
    TuningResult tuning_result = tuner->Tune(tuning_options);

    GraphCompiler::CompileOptions compile_options;
    compile_options.with_instantiate_variables = true;
    compile_options.Apply(tuning_result);

    VLOG(3) << "===========================Auto Schedule LoweredFunc "
               "Begin===========================";
    for (const auto& funcvec : compile_options.lowered_funcs) {
      for (const auto& func : funcvec) {
        VLOG(3) << func;
      }
    }
    VLOG(3) << "===========================Auto Schedule LoweredFunc "
               "End=============================";

    return graph_compiler->Build(compile_options).runtime_program;
  }

  // Compilation target: NVGPU when built with CUDA support, host otherwise.
#ifdef CINN_WITH_CUDA
  Target target_ = common::DefaultNVGPUTarget();
#else
  Target target_ = common::DefaultHostTarget();
#endif
  Options options_;
};

// Leading (batch) dimension shared by the input shapes of the test cases
// below.
constexpr int batch_size = 2;

220 221 222
TEST_F(PerformanceTester, Mul) {
  Evaluate(tests::OpBuilder("mul").Build({{"X", {32, 16}}, {"Y", {16, 32}}}));
}
223 224

// Evaluates "elementwise_add" on two inputs of identical shape.
TEST_F(PerformanceTester, Add) {
  Evaluate(tests::OpBuilder("elementwise_add")
               .Build({{"X", {1, 56, 56, 256}}, {"Y", {1, 56, 56, 256}}}));
}

// Evaluates "matmul" on a {batch_size, 2048} by {2048, 1000} input pair.
TEST_F(PerformanceTester, Matmul) {
  Evaluate(tests::OpBuilder("matmul").Build(
      {{"X", {batch_size, 2048}}, {"Y", {2048, 1000}}}));
}

234 235 236
TEST_F(PerformanceTester, Relu) {
  Evaluate(tests::OpBuilder("relu").Build({{"X", {batch_size, 64, 56, 56}}}));
}
237 238 239 240 241

// Evaluates "conv2d" with a 7x7 kernel, stride 2 and padding 3 on a
// {batch_size, 3, 224, 224} NCHW input.
TEST_F(PerformanceTester, Conv2d) {
  std::vector<int> strides{2, 2};
  std::vector<int> paddings{3, 3};
  std::vector<int> dilations{1, 1};
  int groups = 1;
  std::string conv_type = "forward";
  std::string data_format = "NCHW";
  std::string padding_algorithm = "EXPLICIT";

  Evaluate(tests::OpBuilder("conv2d").Build(
      {{"X", {batch_size, 3, 224, 224}}, {"W", {64, 3, 7, 7}}},
      {{"stride", strides},
       {"padding", paddings},
       {"dilation", dilations},
       {"groups", groups},
       {"conv_type", conv_type},
       {"data_format", data_format},
       {"padding_algorithm", padding_algorithm}}));
}

// Evaluates max "pool2d" with a 3x3 kernel and stride 2 on a
// {batch_size, 64, 112, 112} NCHW input.
TEST_F(PerformanceTester, Pool2d) {
  std::string pooling_type = "max";
  std::vector<int> ksize{3, 3};
  std::vector<int> strides{2, 2};
  std::vector<int> paddings{1, 1, 1, 1};
  bool ceil_mode = false;
  bool exclusive = true;
  bool global_pooling = false;
  std::string data_format = "NCHW";
  bool adaptive = false;
  std::string padding_algorithm = "EXPLICIT";

  Evaluate(tests::OpBuilder("pool2d").Build(
      {{"X", {batch_size, 64, 112, 112}}},
      {{"pool_type", pooling_type},
       {"kernel_size", ksize},
       {"stride_size", strides},
       {"padding_size", paddings},
       {"ceil_mode", ceil_mode},
       {"exclusive", exclusive},
       {"global_pooling", global_pooling},
       {"data_format", data_format},
       {"adaptive", adaptive},
       {"padding_algorithm", padding_algorithm}}));
}

// Evaluates "batch_norm" on a {batch_size, 64, 112, 112} NCHW input with
// per-channel scale, bias, mean and variance of size 64.
TEST_F(PerformanceTester, BatchNorm) {
  float epsilon = 1e-5f;
  float momentum = 0.9f;
  std::string data_layout = "NCHW";

  Evaluate(tests::OpBuilder("batch_norm")
               .Build({{"X", {batch_size, 64, 112, 112}},
                       {"scale", {64}},
                       {"bias", {64}},
                       {"mean", {64}},
                       {"variance", {64}}},
                      {{"epsilon", epsilon},
                       {"momentum", momentum},
                       {"data_layout", data_layout}}));
}

// Evaluates "reshape" from {batch_size, 2048, 1, 1} to {batch_size, 2048}.
TEST_F(PerformanceTester, Reshape) {
  std::vector<int32_t> output_shape{batch_size, 2048};

  Evaluate(tests::OpBuilder("reshape").Build({{"X", {batch_size, 2048, 1, 1}}},
                                             {{"shape", output_shape}}));
}

// Evaluates "softmax" over the last axis of a {batch_size, 1000} input.
TEST_F(PerformanceTester, Softmax) {
  std::vector<int> axes = {-1};
  std::string mode = "fast";
  std::string data_format = "AnyLayout";

  Evaluate(tests::OpBuilder("softmax").Build(
      {{"X", {batch_size, 1000}}},
      {{"axes", axes}, {"mode", mode}, {"data_format", data_format}}));
}

// Evaluates "scale" with scale 1.0 and bias 0.0 on a {batch_size, 1000}
// input.
TEST_F(PerformanceTester, Scale) {
  float scale = 1.0f;
  float bias = 0.0f;
  bool bias_after_scale = true;

  Evaluate(tests::OpBuilder("scale").Build(
      {{"X", {batch_size, 1000}}},
      {{"scale", scale},
       {"bias", bias},
       {"bias_after_scale", bias_after_scale}}));
}

// Evaluates "lookup_table" on a {50001, 768} table with int64 ids of shape
// {10, 128, 1}.
TEST_F(PerformanceTester, LookupTable) {
  int64_t padding_idx = -1;

  Evaluate(tests::OpBuilder("lookup_table")
               .Build({{"table", {50001, 768}},
                       {"ids", {10, 128, 1}, common::Int(64)}},
                      {{"padding_idx", padding_idx}}));
}

// Evaluates "gather" along axis 3 of a {10, 12, 128, 512} operand with int32
// indices of shape {1, 1, 1, 128}.
TEST_F(PerformanceTester, Gather) {
  int axis = 3;

  Evaluate(tests::OpBuilder("gather").Build(
      {{"operand", {10, 12, 128, 512}},
       {"index", {1, 1, 1, 128}, common::Int(32)}},
      {{"axis", axis}}));
}

// paddle model test
// Loads the Paddle ResNet50 model from FLAGS_resnet50_model_dir and evaluates
// it with a {batch_size, 3, 224, 224} input.
TEST_F(PerformanceTester, ResNet50) {
  CHECK_NE(FLAGS_resnet50_model_dir, "");
  std::unordered_map<std::string, std::vector<int64_t>> feeds = {
      {"inputs", {batch_size, 3, 224, 224}}};
  // Convert the model for the same target the fixture compiles with, so the
  // test stays consistent when built without CINN_WITH_CUDA.
  Evaluate(cinn::frontend::PaddleModelConvertor(target_).LoadModel(
      FLAGS_resnet50_model_dir, true, feeds));
}

}  // namespace auto_schedule
}  // namespace cinn