// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <glog/logging.h>
#include <gtest/gtest.h>

#include <bitset>
#include <iostream>

#include "paddle/cinn/auto_schedule/auto_tuner.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/paddle_model_convertor.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
#include "paddle/cinn/utils/data_util.h"
#include "test/cpp/cinn/program_builder.h"

/* This test is used as a tool to evaluate or compare performance of 3
 * schedules(no schedule, manual schedule, auto-schedule). One can specify which
 * schedules to be evaluated through `FLAGS_evaluate_knobs` and specify which
 * operator or model through `--gtest_filter=PerformanceTester.xx`, for example,
 * `FLAGS_evaluate_knobs=4
 * --gtest_filter=PerformanceTester.Matmul` means it will evaluate auto-schedule
 * on Matmul operator. You can refer to explanation of following flags or
 * parameters for more detail.
 */

45 46 47
DEFINE_string(resnet50_model_dir,
              "./ResNet50",
              "the path to paddle model resnet50.");
48
// Flags that control which schedule tests will be run.
49 50 51 52 53 54 55 56 57
// Bit with index 0 controls no schedule test, means options = 1 = "001" will
// run no schedule test. Bit with index 1 controls manual schedule test, means
// options = 2 = "010" will run manual schedule test. Bit with index 2 controls
// auto schedule test, means options = 4 = "100" will run auto schedule test.
// The default value is -1, which means that this flag is disabled to set the
// options
DEFINE_int32(evaluate_knobs,
             -1,
             "the options to control which schedule tests will be run.");
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
DECLARE_int32(cinn_parallel_compile_size);

namespace cinn {
namespace auto_schedule {

using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
using ::cinn::hlir::framework::Instruction;
using ::cinn::hlir::framework::Scope;

// Test fixture that builds one frontend::Program three different ways
// (no schedule / manual schedule / auto-schedule) and optionally executes
// each resulting runtime program to compare their performance.
class PerformanceTester : public ::testing::Test {
 public:
  struct Options {
    // times of compiled runtime program will be executed repeatedly.
    int repeat_times = 2;
    // the num_tuning_rounds for auto tuning
    int num_tuning_rounds = 2;
    // knobs to control which schedules will be measured, refer to
    // FLAGS_evaluate_knobs explanation
    std::bitset<3> evaluate_knobs = 0UL;
  };

  // Disable parallel compilation so each schedule is built the same way.
  void SetUp() override { FLAGS_cinn_parallel_compile_size = 0; }

  // Builds (and, when requested, executes) `program` with every schedule
  // selected by options_.evaluate_knobs. FLAGS_evaluate_knobs, when >= 0,
  // overrides the per-test options.
  void Evaluate(const frontend::Program& program) {
    if (FLAGS_evaluate_knobs >= 0) {
      options_.evaluate_knobs = FLAGS_evaluate_knobs;
    }
    VLOG(3) << "evaluate_knobs = " << options_.evaluate_knobs;

    auto worker_fn = [this, &program](const std::string& schedule_name,
                                      BuildRuntimeProgramFn build_fn,
                                      bool execute = true) {
      // Reset the name counter so every schedule sees identical var names.
      Context::Global().ResetNameId();
      VLOG(3) << "Initialize graph.";
      auto graph = std::make_shared<hlir::framework::Graph>(program, target_);
      VLOG(3) << "Apply graph pass.";
      hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
      VLOG(3) << "Build " << schedule_name << " program.";
      auto scope = BuildScope(target_, graph);
      auto graph_compiler =
          std::make_unique<GraphCompiler>(target_, scope, graph);
      auto runtime_program =
          (this->*build_fn)(graph.get(), graph_compiler.get());
      if (execute) {
        VLOG(3) << "Execute " << schedule_name << " program.";
        runtime_program->ExecuteTest(options_.repeat_times);
      }
    };

    // if no one is set, build no/manual schedule cases to ensure their build
    // functions are valid
    if (options_.evaluate_knobs.none()) {
      worker_fn("no schedule",
                &PerformanceTester::BuildNoScheduleProgram,
                /* execute */ false);
      worker_fn("manual schedule",
                &PerformanceTester::BuildManualScheduleProgram,
                /* execute */ false);
    } else {
      if (options_.evaluate_knobs.test(0)) {
        worker_fn("no schedule", &PerformanceTester::BuildNoScheduleProgram);
      }
      if (options_.evaluate_knobs.test(1)) {
        worker_fn("manual schedule",
                  &PerformanceTester::BuildManualScheduleProgram);
      }
      if (options_.evaluate_knobs.test(2)) {
        worker_fn("auto schedule",
                  &PerformanceTester::BuildAutoScheduleProgram);
      }
    }
  }

 protected:
  using BuildRuntimeProgramFn = std::unique_ptr<hlir::framework::Program> (
      PerformanceTester::*)(Graph*, GraphCompiler*);

  // Lowers every fusion group without applying any schedule and compiles the
  // resulting functions.
  std::unique_ptr<hlir::framework::Program> BuildNoScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    const auto& dtype_dict =
        graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
            "inferdtype");
    const auto& shape_dict = graph->GetAttrs<
        absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
        "infershape");

    // NOTE(review): this was declared std::shared_ptr but initialized from
    // std::make_unique; unique ownership is all that is needed here.
    auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
        dtype_dict, shape_dict, target_);

    GraphCompiler::CompileOptions compile_options;
    compile_options.with_instantiate_variables = true;

    if (graph->fusion_groups.empty()) {
      hlir::framework::ApplyPasses(graph, {"BuildNonFusedGroupsPass"});
    }
    compile_options.groups = graph->fusion_groups;

    // const reference to avoid copying a shared_ptr per iteration
    for (const auto& group : graph->fusion_groups) {
      compile_options.lowered_funcs.push_back(
          op_lowerer->LowerWithoutSchedule(group));
    }

    VLOG(3) << "===========================No Schedule LoweredFunc "
               "Begin===========================";
    for (const auto& funcvec : compile_options.lowered_funcs) {
      for (const auto& func : funcvec) {
        VLOG(3) << func;
      }
    }
    VLOG(3) << "===========================No Schedule LoweredFunc "
               "End=============================";

    return graph_compiler->Build(compile_options).runtime_program;
  }

  // Uses the default build pipeline, i.e. CINN's manually written schedules.
  std::unique_ptr<hlir::framework::Program> BuildManualScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    return graph_compiler->Build();
  }

  // Runs auto-tuning first, then compiles with the tuned schedules applied.
  std::unique_ptr<hlir::framework::Program> BuildAutoScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    auto tuner = std::make_unique<AutoTuner>(target_, graph);

    AutoTuner::Config tuning_config;
    TuningOptions tuning_options;
    tuning_options.num_tuning_rounds = options_.num_tuning_rounds;
    // keep trial counts small so the test finishes in reasonable time
    tuning_options.num_measure_trials = 2;
    tuning_options.num_samples_per_iteration = 2;

    tuner->Initialize(tuning_config, graph_compiler);
    TuningResult tuning_result = tuner->Tune(tuning_options);

    GraphCompiler::CompileOptions compile_options;
    compile_options.with_instantiate_variables = true;
    compile_options.Apply(tuning_result);

    VLOG(3) << "===========================Auto Schedule LoweredFunc "
               "Begin===========================";
    for (const auto& funcvec : compile_options.lowered_funcs) {
      for (const auto& func : funcvec) {
        VLOG(3) << func;
      }
    }
    VLOG(3) << "===========================Auto Schedule LoweredFunc "
               "End=============================";

    return graph_compiler->Build(compile_options).runtime_program;
  }

#ifdef CINN_WITH_CUDA
  Target target_ = common::DefaultNVGPUTarget();
#else
  Target target_ = common::DefaultHostTarget();
#endif
  Options options_;
};

// Batch dimension shared by all operator/model test inputs below.
constexpr int batch_size = 2;

TEST_F(PerformanceTester, Mul) {
  // mul of X[32x16] with Y[16x32]
  auto mul_program =
      tests::OpBuilder("mul").Build({{"X", {32, 16}}, {"Y", {16, 32}}});
  Evaluate(mul_program);
}

TEST_F(PerformanceTester, Add) {
  // element-wise add of two [1, 56, 56, 256] tensors
  auto add_program =
      tests::OpBuilder("elementwise_add")
          .Build({{"X", {1, 56, 56, 256}}, {"Y", {1, 56, 56, 256}}});
  Evaluate(add_program);
}

TEST_F(PerformanceTester, Matmul) {
  // matmul of X[batch_size x 2048] with Y[2048 x 1000]
  auto matmul_program = tests::OpBuilder("matmul").Build(
      {{"X", {batch_size, 2048}}, {"Y", {2048, 1000}}});
  Evaluate(matmul_program);
}

TEST_F(PerformanceTester, Relu) {
  // relu over a [batch_size, 64, 56, 56] input
  auto relu_program =
      tests::OpBuilder("relu").Build({{"X", {batch_size, 64, 56, 56}}});
  Evaluate(relu_program);
}

TEST_F(PerformanceTester, Conv2d) {
  // 7x7 stride-2 convolution over a [batch_size, 3, 224, 224] input
  const std::vector<int> stride_attr{2, 2};
  const std::vector<int> padding_attr{3, 3};
  const std::vector<int> dilation_attr{1, 1};
  const int group_count = 1;
  const std::string conv_type = "forward";
  const std::string data_format = "NCHW";
  const std::string padding_algorithm = "EXPLICIT";

  Evaluate(tests::OpBuilder("conv2d").Build(
      {{"X", {batch_size, 3, 224, 224}}, {"W", {64, 3, 7, 7}}},
      {{"stride", stride_attr},
       {"padding", padding_attr},
       {"dilation", dilation_attr},
       {"groups", group_count},
       {"conv_type", conv_type},
       {"data_format", data_format},
       {"padding_algorithm", padding_algorithm}}));
}

TEST_F(PerformanceTester, Pool2d) {
  // 3x3 stride-2 max pooling over a [batch_size, 64, 112, 112] input.
  std::vector<int32_t> input_shape{batch_size, 64, 112, 112};
  std::string pooling_type = "max";
  std::vector<int> ksize{3, 3};
  std::vector<int> strides{2, 2};
  std::vector<int> paddings{1, 1, 1, 1};
  bool ceil_mode = false;
  bool exclusive = true;
  bool global_pooling = false;
  std::string data_format = "NCHW";
  bool adaptive = false;
  std::string padding_algorithm = "EXPLICIT";

  // use input_shape (previously declared but unused, with the literal
  // duplicated in the Build call)
  Evaluate(tests::OpBuilder("pool2d").Build(
      {{"X", input_shape}},
      {{"pool_type", pooling_type},
       {"kernel_size", ksize},
       {"stride_size", strides},
       {"padding_size", paddings},
       {"ceil_mode", ceil_mode},
       {"exclusive", exclusive},
       {"global_pooling", global_pooling},
       {"data_format", data_format},
       {"adaptive", adaptive},
       {"padding_algorithm", padding_algorithm}}));
}

TEST_F(PerformanceTester, BatchNorm) {
  // batch_norm over a [batch_size, 64, 112, 112] input with per-channel
  // scale/bias/mean/variance of length 64.
  std::vector<int32_t> input_shape{batch_size, 64, 112, 112};
  std::vector<int32_t> scale_shape{64};
  std::vector<int32_t> bias_shape{64};
  std::vector<int32_t> mean_shape{64};
  std::vector<int32_t> variance_shape{64};
  float epsilon = 1e-5f;
  float momentum = 0.9f;
  // plain value instead of a const reference bound to a temporary
  std::string data_layout = "NCHW";

  // use the shape variables declared above (previously unused, with the
  // literals duplicated in the Build call)
  Evaluate(tests::OpBuilder("batch_norm")
               .Build({{"X", input_shape},
                       {"scale", scale_shape},
                       {"bias", bias_shape},
                       {"mean", mean_shape},
                       {"variance", variance_shape}},
                      {{"epsilon", epsilon},
                       {"momentum", momentum},
                       {"data_layout", data_layout}}));
}

TEST_F(PerformanceTester, Reshape) {
  // collapse trailing singleton dims: [batch_size, 2048, 1, 1] -> [batch_size, 2048]
  const std::vector<int32_t> target_shape{batch_size, 2048};

  Evaluate(tests::OpBuilder("reshape").Build({{"X", {batch_size, 2048, 1, 1}}},
                                             {{"shape", target_shape}}));
}

TEST_F(PerformanceTester, Softmax) {
  // softmax along the last axis of a [batch_size, 1000] input
  const std::vector<int> reduce_axes = {-1};
  const std::string mode = "fast";
  const std::string data_format = "AnyLayout";

  Evaluate(tests::OpBuilder("softmax").Build(
      {{"X", {batch_size, 1000}}},
      {{"axes", reduce_axes}, {"mode", mode}, {"data_format", data_format}}));
}

TEST_F(PerformanceTester, Scale) {
  // identity scale (factor 1, bias 0) over a [batch_size, 1000] input
  const float scale_factor = 1.0f;
  const float bias_value = 0.0f;
  const bool bias_after_scale = true;

  Evaluate(tests::OpBuilder("scale").Build(
      {{"X", {batch_size, 1000}}},
      {{"scale", scale_factor},
       {"bias", bias_value},
       {"bias_after_scale", bias_after_scale}}));
}

TEST_F(PerformanceTester, LookupTable) {
  // embedding lookup: int64 ids [10, 128, 1] into a [50001, 768] table
  const int64_t padding_idx = -1;

  Evaluate(tests::OpBuilder("lookup_table")
               .Build({{"table", {50001, 768}},
                       {"ids", {10, 128, 1}, common::Int(64)}},
                      {{"padding_idx", padding_idx}}));
}

TEST_F(PerformanceTester, Gather) {
  // gather along axis 3 with int32 indices
  const int gather_axis = 3;

  Evaluate(tests::OpBuilder("gather").Build(
      {{"operand", {10, 12, 128, 512}},
       {"index", {1, 1, 1, 128}, common::Int(32)}},
      {{"axis", gather_axis}}));
}

// paddle model test
TEST_F(PerformanceTester, ResNet50) {
  CHECK_NE(FLAGS_resnet50_model_dir, "");
  std::unordered_map<std::string, std::vector<int64_t>> feeds = {
      {"inputs", {batch_size, 3, 224, 224}}};
  // Use the fixture's target_ instead of unconditionally requesting the NVGPU
  // target: identical on CUDA builds, and consistent with the host target
  // used everywhere else on non-CUDA builds.
  Evaluate(cinn::frontend::PaddleModelConvertor(target_)
               .LoadModel(FLAGS_resnet50_model_dir, true, feeds));
}

}  // namespace auto_schedule
}  // namespace cinn