/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <random>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/phi/api/profiler/device_tracer.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/jit/kernels.h"

// Command-line flags (gflags) that control a benchmark run.
// Number of untimed warm-up calls made before each measurement.
DEFINE_int32(burning, 10, "Burning times.");
// Number of timed calls; reported times are averaged over this count.
DEFINE_int32(repeat, 3000, "Repeat times.");
// Largest size (inclusive) generated by TestSizes().
DEFINE_int32(max_size, 1000, "The Max size would be tested.");
// When non-empty, only the benchmark whose Name() equals this value is run.
DEFINE_string(filter, "", "The Benchmark name would be run.");

// Abstract interface implemented by every registered benchmark.
// Concrete subclasses are generated by the BENCH_JITKERNEL macro.
class BenchJITKernel {
 public:
  BenchJITKernel() = default;
  virtual ~BenchJITKernel() = default;

  // Strings identifying the benchmark: kernel name, data type and place.
  virtual const char* Name() = 0;
  virtual const char* Dtype() = 0;
  virtual const char* Place() = 0;

  // Executes the benchmark body.
  virtual void Run() = 0;
};

// Every benchmark registered through InsertBenchmark(), in insertion order.
static std::vector<BenchJITKernel*> g_all_benchmarks;

// Appends `b` to the global benchmark list and hands the same pointer back,
// so the call can be used as the initializer of a static variable.
BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
  g_all_benchmarks.emplace_back(b);
  return g_all_benchmarks.back();
}

// Declares a BenchJITKernel subclass for (name, dtype, place), registers one
// heap-allocated instance of it through InsertBenchmark() during static
// initialization, and opens the definition of its Run() method: the macro
// invocation must be followed directly by the `{ ... }` body of Run().
// Name()/Dtype()/Place() return the stringized macro arguments.
#define BENCH_JITKERNEL(name, dtype, place)                                    \
  class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \
   public:                                                                     \
    const char* Name() override { return #name; }                              \
    const char* Dtype() override { return #dtype; }                            \
    const char* Place() override { return #place; }                            \
    void Run() override;                                                       \
  };                                                                           \
  static auto inserted_##name##_##dtype##_##place##_ UNUSED =                  \
      InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_());     \
  void BenchJITKernel_##name##_##dtype##_##place##_::Run()

// Runs the registered benchmarks in the order they appear in
// g_all_benchmarks. When --filter is non-empty, only benchmarks whose Name()
// equals the filter string exactly are run; all others are skipped.
void RUN_ALL_BENCHMARK() {
  for (auto* bench : g_all_benchmarks) {
    if (!FLAGS_filter.empty() && FLAGS_filter != bench->Name()) {
      continue;
    }
    LOG(INFO) << "Benchmark " << bench->Name() << "." << bench->Dtype() << "."
              << bench->Place();
    bench->Run();
  }
}

// Fills a[0..n) with pseudo-random values drawn from [lower, upper).
//
// A std::mt19937 engine is seeded with `seed` on every call, so two calls
// with the same arguments produce the same sequence. Each sample is computed
// in double precision as u * (upper - lower) + lower with u in [0, 1) and
// then converted to T (for integral T the conversion truncates).
//
// n:     number of elements to write; nothing is written when n <= 0.
// a:     destination buffer holding at least n elements.
// lower: lower bound of the generated values.
// upper: upper bound of the generated values.
// seed:  seed for the random engine.
template <typename T>
void RandomVec(const int n,
               T* a,
               const T lower = static_cast<T>(-20.f),
               const T upper = static_cast<T>(20.f),
               unsigned int seed = 100) {
  std::mt19937 rng(seed);
  std::uniform_real_distribution<double> uniform_dist(0, 1);
  for (int i = 0; i < n; ++i) {
    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
  }
}

// Returns the sizes to benchmark: 1, 2, ..., FLAGS_max_size in ascending
// order. The result is empty when FLAGS_max_size is smaller than 1.
std::vector<int> TestSizes() {
  std::vector<int> sizes;
  int next = 1;
  while (next <= FLAGS_max_size) {
    sizes.push_back(next);
    ++next;
  }
  return sizes;
}

92
template <typename KernelTuple, typename... Args>
T
tensor-tang 已提交
93 94
struct BenchFunc {
  // return this function avg time
T
tensor-tang 已提交
95
  // TODO(TJ): clear cache every time
96
  double operator()(const typename KernelTuple::func_type tgt, Args... args) {
T
tensor-tang 已提交
97 98 99
    for (int i = 0; i < FLAGS_burning; ++i) {
      tgt(args...);
    }
100
    auto start = phi::PosixInNsec() * 1e-3;
T
tensor-tang 已提交
101 102 103
    for (int i = 0; i < FLAGS_repeat; ++i) {
      tgt(args...);
    }
104
    auto end = phi::PosixInNsec() * 1e-3;
105
    return static_cast<double>(end - start) / FLAGS_repeat;
T
tensor-tang 已提交
106 107 108
  }
};

109
namespace jit = phi::jit;
T
tensor-tang 已提交
110

111 112 113
template <typename KernelTuple, typename PlaceType, typename... Args>
void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
  BenchFunc<KernelTuple, Args...> benchmark;
T
tensor-tang 已提交
114
  std::vector<std::pair<std::string, double>> infos;
115 116 117
  auto funcs = jit::GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
  for (auto f : funcs) {
    infos.push_back(std::make_pair(f.first, benchmark(f.second, args...)));
T
tensor-tang 已提交
118 119 120
  }

  // Test result from Get function
121
  auto tgt = jit::KernelFuncs<KernelTuple, PlaceType>::Cache().At(attr);
T
tensor-tang 已提交
122
  if (!tgt) {
123
    PADDLE_THROW(phi::errors::Fatal("Benchmark target can not be empty."));
T
tensor-tang 已提交
124
  }
T
tensor-tang 已提交
125 126 127 128
  infos.push_back(std::make_pair("Target", benchmark(tgt, args...)));

  // print
  std::ostringstream loginfos;
129 130
  loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": "
           << attr << ": ";
T
tensor-tang 已提交
131 132 133 134
  for (auto pair : infos) {
    loginfos << pair.first << " takes " << pair.second << " us; ";
  }
  LOG(INFO) << loginfos.str();
T
tensor-tang 已提交
135 136
}

137 138 139
template <typename KernelTuple, typename PlaceType>
void BenchKernelXYZN() {
  using T = typename KernelTuple::data_type;
T
tensor-tang 已提交
140
  for (int d : TestSizes()) {
141
    phi::DenseTensor x, y, z;
T
tensor-tang 已提交
142 143 144 145 146 147 148 149
    x.Resize({d});
    y.Resize({d});
    z.Resize({d});
    T* x_data = x.mutable_data<T>(PlaceType());
    T* y_data = y.mutable_data<T>(PlaceType());
    T* z_data = z.mutable_data<T>(PlaceType());
    RandomVec<T>(d, x_data);
    RandomVec<T>(d, y_data);
150 151
    BenchAllImpls<KernelTuple, PlaceType>(
        d, x.data<T>(), y.data<T>(), z_data, d);
T
tensor-tang 已提交
152
    // test inplace
153
    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), z_data, z_data, d);
T
tensor-tang 已提交
154 155
  }
}
156

157 158 159
template <typename KernelTuple, typename PlaceType>
void BenchKernelAXYN() {
  using T = typename KernelTuple::data_type;
160 161
  for (int d : TestSizes()) {
    const T a = static_cast<T>(3);
162
    phi::DenseTensor x, y;
T
tensor-tang 已提交
163 164 165 166 167
    x.Resize({d});
    y.Resize({d});
    T* x_data = x.mutable_data<T>(PlaceType());
    T* y_data = y.mutable_data<T>(PlaceType());
    RandomVec<T>(d, x_data);
168
    BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), y_data, d);
T
tensor-tang 已提交
169
    // test inplace
170
    BenchAllImpls<KernelTuple, PlaceType>(d, &a, x.data<T>(), x_data, d);
171 172 173
  }
}

174 175 176
template <typename KernelTuple, typename PlaceType>
void BenchKernelXYN() {
  using T = typename KernelTuple::data_type;
177
  for (int d : TestSizes()) {
178
    phi::DenseTensor x, y;
T
tensor-tang 已提交
179 180 181 182 183
    x.Resize({d});
    y.Resize({d});
    T* x_data = x.mutable_data<T>(PlaceType());
    T* y_data = y.mutable_data<T>(PlaceType());
    RandomVec<T>(d, x_data);
184
    BenchAllImpls<KernelTuple, PlaceType>(d, x.data<T>(), y_data, d);
185 186 187
  }
}

188 189 190
template <typename KernelTuple, typename PlaceType>
void BenchKernelLSTM() {
  using T = typename KernelTuple::data_type;
T
tensor-tang 已提交
191 192
  for (bool use_peephole : {true, false}) {
    for (int d : TestSizes()) {
193 194
      const jit::lstm_attr_t attr(
          d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole);
195
      phi::DenseTensor x, ct_1, ct, ht, wp, checked;
T
tensor-tang 已提交
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
      x.Resize({4 * d});
      ct_1.Resize({d});
      ct.Resize({d});
      ht.Resize({d});
      wp.Resize({3 * d});
      checked.Resize({2 * d});
      auto place = PlaceType();
      RandomVec<T>(x.numel(), x.mutable_data<T>(place), -2.f, 2.f);
      RandomVec<T>(wp.numel(), wp.mutable_data<T>(place), -2.f, 2.f);
      RandomVec<T>(ct_1.numel(), ct_1.mutable_data<T>(place), -2.f, 2.f);
      const T* ct_1_data = ct_1.data<T>();
      const T* wp_data = wp.data<T>();
      T* x_data = x.mutable_data<T>(place);
      T* checked_data = checked.mutable_data<T>(place);
      T* ct_data = ct.mutable_data<T>(place);
      T* ht_data = ht.mutable_data<T>(place);
T
tensor-tang 已提交
212 213 214 215 216 217 218 219 220
      jit::lstm_t step;
      step.gates = x_data;
      step.ct_1 = ct_1_data;
      step.ct = ct_data;
      step.ht = ht_data;
      if (use_peephole) {
        step.wp = wp_data;
        step.checked = checked_data;
      }
221
      BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
T
tensor-tang 已提交
222 223 224 225
    }
  }
}

226 227 228
template <typename KernelTuple, typename PlaceType>
void BenchKernelGRU() {
  using T = typename KernelTuple::data_type;
229
  for (int d : TestSizes()) {
T
tensor-tang 已提交
230
    const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
T
tensor-tang 已提交
231
    auto place = PlaceType();
232
    phi::DenseTensor x, ht_1, ht;
T
tensor-tang 已提交
233 234 235 236 237 238 239 240
    x.Resize({3 * d});
    ht_1.Resize({d});
    ht.Resize({d});
    RandomVec<T>(3 * d, x.mutable_data<T>(place), -2.f, 2.f);
    RandomVec<T>(d, ht_1.mutable_data<T>(place), -2.f, 2.f);
    const T* ht_1_data = ht_1.data<T>();
    T* x_data = x.mutable_data<T>(place);
    T* ht_data = ht.mutable_data<T>(place);
241 242 243 244
    jit::gru_t step;
    step.gates = x_data;
    step.ht_1 = ht_1_data;
    step.ht = ht_data;
245
    BenchAllImpls<KernelTuple, PlaceType>(attr, &step, &attr);
246 247 248
  }
}

249 250 251
template <typename KernelTuple, typename PlaceType>
void BenchKernelSeqPool() {
  using T = typename KernelTuple::data_type;
252 253
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
254
  for (auto type : pool_types) {
T
tensor-tang 已提交
255
    for (int w : TestSizes()) {
T
tensor-tang 已提交
256
      jit::seq_pool_attr_t attr(w, type);
T
tensor-tang 已提交
257
      for (int h : TestSizes()) {
T
tensor-tang 已提交
258
        attr.h = h;
259
        phi::DenseTensor x, y;
T
tensor-tang 已提交
260 261 262 263 264
        x.Resize({h * w});
        y.Resize({w});
        RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
        const T* x_data = x.data<T>();
        T* y_data = y.mutable_data<T>(PlaceType());
265
        BenchAllImpls<KernelTuple, PlaceType>(attr, x_data, y_data, &attr);
266 267 268 269 270
      }
    }
  }
}

271 272 273
template <typename KernelTuple, typename PlaceType>
void BenchKernelEmbSeqPool() {
  using T = typename KernelTuple::data_type;
274 275 276
  std::vector<jit::SeqPoolType> pool_types = {jit::SeqPoolType::kSum};
  int64_t tbl_h = 1e4;
  for (int tbl_w : {10, 16, 256}) {
277
    phi::DenseTensor table;
278 279 280 281 282
    table.Resize({tbl_h, tbl_w});
    RandomVec<T>(tbl_h * tbl_w, table.mutable_data<T>(PlaceType()), -2.f, 2.f);
    const T* table_data = table.data<T>();
    for (auto type : pool_types) {
      for (int idx_w : {1, 2, 10, 16}) {
283
        for (int idx_h : {1, 2, 9, 13, 16}) {
284
          int64_t out_w = tbl_w * idx_w;
285 286
          jit::emb_seq_pool_attr_t attr(
              tbl_h, tbl_w, idx_h, idx_w, out_w, type);
287
          phi::DenseTensor idx, out;
288 289 290
          idx.Resize({idx_h, idx_w});
          out.Resize({out_w});
          RandomVec<int64_t>(idx_h * idx_w,
291 292
                             idx.mutable_data<int64_t>(PlaceType()),
                             0,
293 294 295
                             tbl_h - 1);
          const int64_t* idx_data = idx.data<int64_t>();
          T* o_data = out.mutable_data<T>(PlaceType());
296 297
          BenchAllImpls<KernelTuple, PlaceType>(
              attr, table_data, idx_data, o_data, &attr);
298 299 300 301 302 303
        }
      }
    }
  }
}

304 305 306
template <typename KernelTuple, typename PlaceType>
void BenchKernelSgd() {
  using T = typename KernelTuple::data_type;
307
  const T lr = 0.1;
308 309
  auto UnDuplicatedRandomVec = [](int n,
                                  const int64_t lower,
310
                                  const int64_t upper) -> std::vector<int64_t> {
G
GaoWei8 已提交
311
    PADDLE_ENFORCE_LE(
312 313
        static_cast<size_t>(upper - lower),
        n - 1,
314
        phi::errors::InvalidArgument(
G
GaoWei8 已提交
315 316
            "The range of Sgd (upper - lower) should be equal to or lower "
            "than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.",
317 318
            static_cast<size_t>(upper - lower),
            (n - 1)));
G
GaoWei8 已提交
319
    PADDLE_ENFORCE_GT(
320 321
        n,
        0,
322
        phi::errors::InvalidArgument(
323
            "The Sgd size should be larger than 0. But the n is %d.", n));
324 325 326 327
    std::vector<int64_t> all, out;
    for (int i = 0; i < n; ++i) {
      all.push_back(i);
    }
328 329 330 331
    std::random_device rnd;
    int64_t seed_tmp = rnd();
    std::default_random_engine rng(seed_tmp);
    std::shuffle(all.begin(), all.end(), rng);
332 333 334 335 336 337
    out.insert(out.begin(), all.begin(), all.begin() + n);
    return out;
  };
  for (int param_h : {1, 1000}) {
    for (int grad_w : {1, 2, 8, 16, 30, 256}) {
      // only benchmark inplace
338
      phi::DenseTensor param;
339 340 341 342
      param.Resize({param_h, grad_w});
      T* param_data = param.mutable_data<T>(PlaceType());
      RandomVec<T>(param_h * grad_w, param_data, -2.f, 2.f);
      for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) {
343
        phi::DenseTensor grad;
344 345 346
        grad.Resize({rows_size, grad_w});
        std::vector<int64_t> rows =
            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
347 348
        RandomVec<T>(
            rows_size * grad_w, grad.mutable_data<T>(PlaceType()), -2.f, 2.f);
349 350 351
        const T* grad_data = grad.data<T>();
        const int64_t* rows_data = rows.data();
        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
352 353
        BenchAllImpls<KernelTuple, PlaceType>(
            attr, &lr, param_data, grad_data, rows_data, param_data, &attr);
354 355 356 357 358
      }
    }
  }
}

359 360 361
template <typename KernelTuple, typename PlaceType>
void BenchKernelMatMul() {
  using T = typename KernelTuple::data_type;
T
tensor-tang 已提交
362
  for (int m : {1, 2, 3, 4}) {
363
    for (int n : TestSizes()) {
T
tensor-tang 已提交
364
      for (int k : TestSizes()) {
365
        phi::DenseTensor a, b, c;
T
tensor-tang 已提交
366 367 368 369 370 371 372 373
        a.Resize({m * k});
        b.Resize({k * n});
        c.Resize({m * n});
        RandomVec<T>(m * k, a.mutable_data<T>(PlaceType()), -2.f, 2.f);
        RandomVec<T>(k * n, b.mutable_data<T>(PlaceType()), -2.f, 2.f);
        const T* a_data = a.data<T>();
        const T* b_data = b.data<T>();
        T* c_data = c.mutable_data<T>(PlaceType());
374
        const jit::matmul_attr_t attr{m, n, k};
375 376
        BenchAllImpls<KernelTuple, PlaceType>(
            attr, a_data, b_data, c_data, &attr);
T
tensor-tang 已提交
377 378 379 380 381
      }
    }
  }
}

382 383 384
template <typename KernelTuple, typename PlaceType>
void BenchKernelLayerNorm() {
  using T = typename KernelTuple::data_type;
385 386 387 388 389 390 391
  const T epsilon = 9.99999975e-06;
  for (int n : {1, 2, 10}) {
    for (int x_dim_0 : {1, 9, 17, 50}) {
      int left = n * x_dim_0;
      for (int x_dim_1 : TestSizes()) {
        int right = x_dim_1;
        int sz = left * right;
392
        phi::DenseTensor x, mean, var, scale, bias, out;
393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412
        x.Resize({n, x_dim_0, x_dim_1});
        out.Resize({n, x_dim_0, x_dim_1});
        mean.Resize({n, x_dim_0});
        var.Resize({n, x_dim_0});
        scale.Resize({x_dim_1});
        bias.Resize({x_dim_1});

        RandomVec<T>(sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
        RandomVec<T>(left, mean.mutable_data<T>(PlaceType()), -2.f, 2.f);
        RandomVec<T>(left, var.mutable_data<T>(PlaceType()), -2.f, 2.f);
        RandomVec<T>(right, scale.mutable_data<T>(PlaceType()), -2.f, 2.f);
        RandomVec<T>(right, bias.mutable_data<T>(PlaceType()), -2.f, 2.f);

        const T* scale_data = scale.data<T>();
        const T* bias_data = bias.data<T>();
        T* x_data = x.data<T>();
        T* mean_data = mean.data<T>();
        T* var_data = var.data<T>();
        T* out_data = out.mutable_data<T>(PlaceType());

413 414 415 416 417 418 419 420 421 422
        BenchAllImpls<KernelTuple, PlaceType>(right,
                                              x_data,
                                              out_data,
                                              mean_data,
                                              var_data,
                                              scale_data,
                                              bias_data,
                                              left,
                                              epsilon,
                                              right);
423 424 425 426 427
      }
    }
  }
}

428 429 430
template <typename KernelTuple, typename PlaceType>
void BenchKernelCRFDecoding() {
  using T = typename KernelTuple::data_type;
431 432 433 434 435
  constexpr int state_trans_base_idx = 2;
  for (int seq_len : {1, 11, 17, 50}) {
    for (int tag_num : TestSizes()) {
      int x_sz = seq_len * tag_num;
      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
436
      phi::DenseTensor x, w, alpha, track;
437 438 439 440 441 442 443 444 445 446 447 448 449
      x.Resize({seq_len, tag_num});
      w.Resize({tag_num + state_trans_base_idx, tag_num});
      alpha.Resize({seq_len, tag_num});
      track.Resize({seq_len, tag_num});

      RandomVec<T>(x_sz, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
      RandomVec<T>(w_sz, w.mutable_data<T>(PlaceType()), -2.f, 2.f);

      const T* x_data = x.data<T>();
      const T* w_data = w.data<T>();
      T* alpha_data = alpha.mutable_data<T>(PlaceType());
      int* track_data = track.mutable_data<int>(PlaceType());

450 451
      BenchAllImpls<KernelTuple, PlaceType>(
          tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num);
452 453 454 455
    }
  }
}

456 457 458
template <typename KernelTuple, typename PlaceType>
void BenchKernelVBroadcast() {
  using T = typename KernelTuple::data_type;
459
  for (int64_t w : {1, 16, 64, 100, 256}) {
460
    phi::DenseTensor x;
461 462 463
    x.Resize({w});
    RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
    const T* x_data = x.data<T>();
464
    for (int h : TestSizes()) {
465
      phi::DenseTensor y;
466 467
      y.Resize({h * w});
      T* y_data = y.mutable_data<T>(PlaceType());
468 469
      BenchAllImpls<KernelTuple, PlaceType>(
          w, x_data, y_data, static_cast<int64_t>(h), w);
470 471 472 473
    }
  }
}

// Map each kernel name onto the benchmark driver that matches its call
// signature, so BENCH_FP32_CPU(name) can expand to BenchKernel##name.

// Kernels benchmarked with BenchKernelXYZN.
#define BenchKernelVMul BenchKernelXYZN
#define BenchKernelVAdd BenchKernelXYZN
#define BenchKernelVAddRelu BenchKernelXYZN
#define BenchKernelVSub BenchKernelXYZN

// Kernels benchmarked with BenchKernelAXYN.
#define BenchKernelVScal BenchKernelAXYN
#define BenchKernelVAddBias BenchKernelAXYN

// Kernels benchmarked with BenchKernelXYN.
#define BenchKernelVRelu BenchKernelXYN
#define BenchKernelVIdentity BenchKernelXYN
#define BenchKernelVSquare BenchKernelXYN
#define BenchKernelVExp BenchKernelXYN
#define BenchKernelVSigmoid BenchKernelXYN
#define BenchKernelVTanh BenchKernelXYN
#define BenchKernelVCopy BenchKernelXYN

// Kernels benchmarked with BenchKernelLSTM.
#define BenchKernelLSTMCtHt BenchKernelLSTM
#define BenchKernelLSTMC1H1 BenchKernelLSTM

// Kernels benchmarked with BenchKernelGRU.
#define BenchKernelGRUH1 BenchKernelGRU
#define BenchKernelGRUHtPart1 BenchKernelGRU
#define BenchKernelGRUHtPart2 BenchKernelGRU

497
using CPUPlace = phi::CPUPlace;
498

// Registers a benchmark for kernel `name` with dtype label FP32 and place
// label CPU. Its Run() body calls BenchKernel<name> instantiated with
// jit::<name>Tuple<float> and CPUPlace.
#define BENCH_FP32_CPU(name)                                \
  BENCH_JITKERNEL(name, FP32, CPU) {                        \
    BenchKernel##name<jit::name##Tuple<float>, CPUPlace>(); \
  }

// xyzn
BENCH_FP32_CPU(VMul);
BENCH_FP32_CPU(VAdd);
BENCH_FP32_CPU(VAddRelu);
BENCH_FP32_CPU(VSub);

// axyn
BENCH_FP32_CPU(VScal);
BENCH_FP32_CPU(VAddBias);

// xyn
BENCH_FP32_CPU(VRelu);
BENCH_FP32_CPU(VIdentity);
BENCH_FP32_CPU(VSquare);
BENCH_FP32_CPU(VExp);
BENCH_FP32_CPU(VSigmoid);
BENCH_FP32_CPU(VTanh);
BENCH_FP32_CPU(VCopy);

// LSTM
BENCH_FP32_CPU(LSTMCtHt);
BENCH_FP32_CPU(LSTMC1H1);

// GRU
BENCH_FP32_CPU(GRUH1);
BENCH_FP32_CPU(GRUHtPart1);
BENCH_FP32_CPU(GRUHtPart2);

BENCH_FP32_CPU(LayerNorm);
BENCH_FP32_CPU(CRFDecoding);

BENCH_FP32_CPU(SeqPool);
BENCH_FP32_CPU(EmbSeqPool);
BENCH_FP32_CPU(MatMul);
BENCH_FP32_CPU(Sgd);
BENCH_FP32_CPU(VBroadcast);

541 542 543 544 545 546
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
//     --burning: the burning time before count
//     --repeat: the repeat times
//     --max_size: the max size would be tested
547
//     --filter: the bench name would be run
548
int main(int argc, char* argv[]) {
549
  ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
550 551 552
  google::InitGoogleLogging(argv[0]);
  LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat
            << " times.";
T
tensor-tang 已提交
553

554
  RUN_ALL_BENCHMARK();
555
}