// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Eager Dygraph
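//
// CPU benchmarks for the eager (dygraph) execution path. Each test exercises
// one workload -- scale, matmul, intermediate matmul, intermediate MLP -- in
// two modes: "Accuracy" (runs with accuracy checking enabled) and
// "Performance" (reports wall-clock time and, when built WITH_GPERFTOOLS,
// writes a gperftools CPU profile).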

#include <paddle/fluid/framework/op_registry.h>

#include <chrono>

#include "gtest/gtest.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/core/flags.h"
#include "test/cpp/eager/performance_tests/benchmark_utils.h"
#include "test/cpp/eager/test_utils.h"

#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif

#include "paddle/phi/core/kernel_registry.h"

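// Declare the phi CPU kernels these benchmarks rely on so they are linked
// into the test binary.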
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);

using namespace egr;            // NOLINT
using namespace egr_utils_api;  // NOLINT

TEST(Benchmark, EagerScaleCPU) {
  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

  for (const std::string mode : {"Accuracy", "Performance"}) {
    paddle::framework::DDim ddim = phi::make_ddim({2, 4, 4, 4});
    paddle::Tensor tensor =
        eager_test::CreateTensorWithValue(ddim,
                                          paddle::platform::CPUPlace(),
                                          phi::DataType::FLOAT32,
                                          phi::DataLayout::NCHW,
                                          5.0,
                                          true);
    RetainGradForTensor(tensor);

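    // Accuracy mode runs with result checking enabled; Performance mode times
    // the same run (profiled with gperftools when available).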
    if (mode == "Accuracy") {
      benchmark_eager_scale(tensor, true /* accuracy_check */);

    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_scale_cpu.out");
#endif
      benchmark_eager_scale(tensor);

#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();

      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;

    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}

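// Plain eager matmul benchmark; unlike the "Intermediate" variants below, it
// does not need an imperative tracer.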
TEST(Benchmark, EagerMatmulCPU) {
  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

  for (const std::string mode : {"Accuracy", "Performance"}) {
    paddle::framework::DDim ddimX = phi::make_ddim({2, 2});
    paddle::Tensor X =
        eager_test::CreateTensorWithValue(ddimX,
                                          paddle::platform::CPUPlace(),
                                          phi::DataType::FLOAT32,
                                          phi::DataLayout::NCHW,
                                          1.0,
                                          true);
    RetainGradForTensor(X);

    paddle::framework::DDim ddimY = phi::make_ddim({2, 2});
    paddle::Tensor Y =
        eager_test::CreateTensorWithValue(ddimY,
                                          paddle::platform::CPUPlace(),
                                          phi::DataType::FLOAT32,
                                          phi::DataLayout::NCHW,
                                          2.0,
                                          true);
    RetainGradForTensor(Y);

    if (mode == "Accuracy") {
      benchmark_eager_matmul(X, Y, true /* accuracy_check */);

    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_matmul_cpu.out");
#endif
      benchmark_eager_matmul(X, Y);

#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;

    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}

TEST(Benchmark, EagerIntermediateMatmulCPU) {
  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

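  // The intermediate ops dispatch through the imperative tracer, so one must
  // be installed as the current tracer before running the benchmark.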
  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string mode : {"Accuracy", "Performance"}) {
    paddle::framework::DDim ddimX = phi::make_ddim({2, 2});
    paddle::Tensor X =
        eager_test::CreateTensorWithValue(ddimX,
                                          paddle::platform::CPUPlace(),
                                          phi::DataType::FLOAT32,
                                          phi::DataLayout::NCHW,
                                          1.0,
                                          true);
    RetainGradForTensor(X);

    paddle::framework::DDim ddimY = phi::make_ddim({2, 2});
    paddle::Tensor Y =
        eager_test::CreateTensorWithValue(ddimY,
                                          paddle::platform::CPUPlace(),
                                          phi::DataType::FLOAT32,
                                          phi::DataLayout::NCHW,
                                          2.0,
                                          true);
    RetainGradForTensor(Y);

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_matmul(X, Y, true /* accuracy_check */);

    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_intermediate_matmul_cpu.out");
#endif
      benchmark_eager_intermediate_matmul(X, Y);

#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;

    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}

TEST(Benchmark, EagerIntermediateMLPCPU) {
  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

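  // As with the intermediate matmul benchmark, the MLP path runs through the
  // imperative tracer.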
  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string mode : {"Accuracy", "Performance"}) {
    paddle::framework::DDim ddimX = phi::make_ddim({MLP_M, MLP_N});
    paddle::Tensor X =
        eager_test::CreateTensorWithValue(ddimX,
                                          paddle::platform::CPUPlace(),
                                          phi::DataType::FLOAT32,
                                          phi::DataLayout::NCHW,
                                          MLP_X_VAL,
                                          true);
    RetainGradForTensor(X);

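    // Build MLP_NUM_LINEAR linear layers: one weight and one bias tensor each.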
    std::vector<paddle::Tensor> Ws;
    std::vector<paddle::Tensor> Bs;
    for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
      paddle::framework::DDim ddimW = phi::make_ddim({MLP_N, MLP_K});
      paddle::Tensor W =
          eager_test::CreateTensorWithValue(ddimW,
                                            paddle::platform::CPUPlace(),
                                            phi::DataType::FLOAT32,
                                            phi::DataLayout::NCHW,
                                            MLP_W_VAL,
                                            true);
      RetainGradForTensor(W);

      paddle::framework::DDim ddimB = phi::make_ddim({MLP_K});
      paddle::Tensor B =
          eager_test::CreateTensorWithValue(ddimB,
                                            paddle::platform::CPUPlace(),
                                            phi::DataType::FLOAT32,
                                            phi::DataLayout::NCHW,
                                            MLP_B_VAL,
                                            true);
      RetainGradForTensor(B);

      Ws.emplace_back(std::move(W));
      Bs.emplace_back(std::move(B));
    }

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_mlp(X, Ws, Bs, true /* accuracy_check */);

    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_intermediate_mlp_cpu.out");
#endif
      benchmark_eager_intermediate_mlp(X, Ws, Bs);

#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;

    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}

USE_OP_ITSELF(scale);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(matmul_v2);
USE_OP_ITSELF(reduce_sum);