math_function_test.cu 17.1 KB
Newer Older
1
//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
D
dzhwinter 已提交
2 3 4 5 6 7 8 9 10 11 12 13
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
Q
qijun 已提交
14
#include "gtest/gtest.h"
Y
Yu Yang 已提交
15
#include "paddle/fluid/operators/math/blas.h"
Y
Yi Wang 已提交
16
#include "paddle/fluid/operators/math/math_function.h"
Y
Yu Yang 已提交
17
#include "paddle/fluid/platform/device_context.h"
Q
qijun 已提交
18

19 20
// Converts every float in `data` to paddle::platform::float16 and stores it
// at the same index of `in_ptr`.
//
// in_ptr: destination buffer; written for indices [0, data.size()).
// size:   number of elements the caller expects to fill; enforced to equal
//         data.size().
// data:   source values.
void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
                    const std::vector<float>& data) {
  PADDLE_ENFORCE_EQ(
      size, data.size(),
      paddle::platform::errors::InvalidArgument(
          "The size of argument data should"
          " be equal to the argument size. Expected %d, but received %d.",
          size, data.size()));
  size_t slot = 0;
  for (const float value : data) {
    in_ptr[slot] = paddle::platform::float16(value);
    ++slot;
  }
}

Y
Yu Yang 已提交
32 33 34 35 36 37 38
// Shorthand used by the tests below: returns the BlasT wrapper for element
// type T bound to the given CUDA device context.
template <typename T>
inline paddle::operators::math::BlasT<paddle::platform::CUDADeviceContext, T>
GetBlas(const paddle::platform::CUDADeviceContext& context) {
  using DeviceContext = paddle::platform::CUDADeviceContext;
  return paddle::operators::math::GetBlas<DeviceContext, T>(context);
}

39
// MatMul(A, no-trans, A, trans) in fp32 for the 2x3 input holding 0..5.
// Both GPU operands are copies of the same host tensor, so the 2x2 result
// checked on the host is A * A^T.
TEST(math_function, notrans_mul_trans_fp32) {
  using paddle::framework::Tensor;
  using paddle::framework::TensorCopySync;

  Tensor input1;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor out_gpu;
  Tensor out;

  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);

  // Stage the host input and upload it twice (left and right operand).
  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float host_values[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, host_values, sizeof(host_values));

  TensorCopySync(input1, gpu_place, &input1_gpu);
  TensorCopySync(input1, gpu_place, &input2_gpu);

  out_gpu.mutable_data<float>({2, 2}, gpu_place);
  GetBlas<float>(context).MatMul(input1_gpu, false, input2_gpu, true, 1,
                                 &out_gpu, 0);

  // Bring the product back to the host and compare element by element.
  TensorCopySync(out_gpu, cpu_place, &out);

  float* out_ptr = out.data<float>();
  context.Wait();

  const float expected[4] = {5, 14, 14, 50};
  for (int i = 0; i < 4; ++i) {
    EXPECT_EQ(out_ptr[i], expected[i]);
  }
}

71
// float16 variant of MatMul(A, no-trans, A, trans) for the 2x3 input holding
// 0..5; the 2x2 result is converted to float on the host before comparison.
TEST(math_function, notrans_mul_trans_fp16) {
  using paddle::framework::Tensor;
  using paddle::framework::TensorCopySync;
  using paddle::platform::float16;

  Tensor input1;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor out_gpu;
  Tensor out;

  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);

  // cublas fp16 GEMM needs GPU compute capability >= 53; on older devices
  // the test body is skipped (and therefore reported as passing).
  if (context.GetComputeCapability() < 53) {
    return;
  }

  // Stage the host input and upload it twice (left and right operand).
  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});

  TensorCopySync(input1, gpu_place, &input1_gpu);
  TensorCopySync(input1, gpu_place, &input2_gpu);

  out_gpu.mutable_data<float16>({2, 2}, gpu_place);

  GetBlas<float16>(context).MatMul(input1_gpu, false, input2_gpu, true,
                                   float16(1), &out_gpu, float16(0));

  TensorCopySync(out_gpu, cpu_place, &out);

  float16* out_ptr = out.data<float16>();
  context.Wait();

  const float expected[4] = {5, 14, 14, 50};
  for (int i = 0; i < 4; ++i) {
    EXPECT_EQ(static_cast<float>(out_ptr[i]), expected[i]);
  }
}

// MatMul(A, trans, A, no-trans) in fp32 for the 2x3 input holding 0..5.
// Both GPU operands are copies of the same host tensor, so the 3x3 result
// checked on the host is A^T * A.
TEST(math_function, trans_mul_notrans_fp32) {
  using paddle::framework::Tensor;
  using paddle::framework::TensorCopySync;

  Tensor input1;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor out_gpu;
  Tensor out;

  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);

  // Stage the host input and upload it twice (left and right operand).
  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float host_values[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, host_values, sizeof(host_values));

  TensorCopySync(input1, gpu_place, &input1_gpu);
  TensorCopySync(input1, gpu_place, &input2_gpu);

  out_gpu.mutable_data<float>({3, 3}, gpu_place);

  GetBlas<float>(context).MatMul(input1_gpu, true, input2_gpu, false, 1,
                                 &out_gpu, 0);

  // Bring the product back to the host and compare element by element.
  TensorCopySync(out_gpu, cpu_place, &out);

  float* out_ptr = out.data<float>();
  context.Wait();

  const float expected[9] = {9, 12, 15, 12, 17, 22, 15, 22, 29};
  for (int i = 0; i < 9; ++i) {
    EXPECT_EQ(out_ptr[i], expected[i]);
  }
}

148
// float16 variant of MatMul(A, trans, A, no-trans) for the 2x3 input holding
// 0..5; the 3x3 result is converted to float on the host before comparison.
TEST(math_function, trans_mul_notrans_fp16) {
  using paddle::framework::Tensor;
  using paddle::framework::TensorCopySync;
  using paddle::platform::float16;

  Tensor input1;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor out_gpu;
  Tensor out;

  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);

  // cublas fp16 GEMM needs GPU compute capability >= 53; on older devices
  // the test body is skipped (and therefore reported as passing).
  if (context.GetComputeCapability() < 53) {
    return;
  }

  // Stage the host input and upload it twice (left and right operand).
  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});

  TensorCopySync(input1, gpu_place, &input1_gpu);
  TensorCopySync(input1, gpu_place, &input2_gpu);

  out_gpu.mutable_data<float16>({3, 3}, gpu_place);

  GetBlas<float16>(context).MatMul(input1_gpu, true, input2_gpu, false,
                                   float16(1), &out_gpu, float16(0));

  TensorCopySync(out_gpu, cpu_place, &out);

  float16* out_ptr = out.data<float16>();
  context.Wait();

  const float expected[9] = {9, 12, 15, 12, 17, 22, 15, 22, 29};
  for (int i = 0; i < 9; ++i) {
    EXPECT_EQ(static_cast<float>(out_ptr[i]), expected[i]);
  }
}

// fp32 GEMM with explicit leading dimensions and offset operands: the B and
// C pointers are advanced by one element so that only columns 1..3 of the
// 3x4 and 2x4 buffers take part, with alpha = beta = 1.
//
// Equivalent numpy code:
//   a = np.arange(6).reshape(2, 3)
//   b = np.arange(12).reshape(3, 4)[:, 1:]
//   c = np.arange(8).reshape(2, 4)[:, 1:]
//   out = np.arange(8).reshape(2, 4)
//   out[:, 1:] = np.dot(a, b) + c
TEST(math_function, gemm_notrans_cublas_fp32) {
  using paddle::framework::Tensor;
  using paddle::framework::TensorCopySync;

  Tensor input1;
  Tensor input2;
  Tensor input3;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor input3_gpu;

  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);

  int m = 2;
  int n = 3;
  int k = 3;

  // Host operands: A is 2x3, the B buffer is 3x4, the C buffer is 2x4.
  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float a_values[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, a_values, sizeof(a_values));

  float* input2_ptr = input2.mutable_data<float>({3, 4}, cpu_place);
  float b_values[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  memcpy(input2_ptr, b_values, sizeof(b_values));

  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
  float c_values[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  memcpy(input3_ptr, c_values, sizeof(c_values));

  TensorCopySync(input1, gpu_place, &input1_gpu);
  TensorCopySync(input2, gpu_place, &input2_gpu);
  TensorCopySync(input3, gpu_place, &input3_gpu);

  float* dev_a = input1_gpu.data<float>();
  float* dev_b = input2_gpu.data<float>();
  float* dev_c = input3_gpu.mutable_data<float>(gpu_place);

  // lda = 3, ldb = 4, ldc = 4; B and C start at column 1.
  GetBlas<float>(context).GEMM(false, false, m, n, k, 1, dev_a, 3, dev_b + 1,
                               4, 1, dev_c + 1, 4);

  // The result overwrites the host copy of C that input3_ptr refers to.
  TensorCopySync(input3_gpu, cpu_place, &input3);

  context.Wait();

  // Column 0 of C is outside the GEMM and must keep its initial values.
  const float expected[8] = {0, 24, 28, 32, 4, 73, 86, 99};
  for (int i = 0; i < 8; ++i) {
    EXPECT_EQ(input3_ptr[i], expected[i]);
  }
}

246
// float16 GEMM with explicit leading dimensions and offset operands: the B
// and C pointers are advanced by one element so that only columns 1..3 of
// the 3x4 and 2x4 buffers take part, with alpha = beta = 1.
//
// Equivalent numpy code:
//   a = np.arange(6).reshape(2, 3)
//   b = np.arange(12).reshape(3, 4)[:, 1:]
//   c = np.arange(8).reshape(2, 4)[:, 1:]
//   out = np.arange(8).reshape(2, 4)
//   out[:, 1:] = np.dot(a, b) + c
TEST(math_function, gemm_notrans_cublas_fp16) {
  using paddle::framework::Tensor;
  using paddle::framework::TensorCopySync;
  using paddle::platform::float16;

  Tensor input1;
  Tensor input2;
  Tensor input3;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor input3_gpu;

  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);

  // cublas fp16 GEMM needs GPU compute capability >= 53; on older devices
  // the test body is skipped (and therefore reported as passing).
  if (context.GetComputeCapability() < 53) {
    return;
  }

  int m = 2;
  int n = 3;
  int k = 3;

  // Host operands: A is 2x3, the B buffer is 3x4, the C buffer is 2x4.
  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});

  float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
  fill_fp16_data(input2_ptr, input2.numel(),
                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});

  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
  fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});

  TensorCopySync(input1, gpu_place, &input1_gpu);
  TensorCopySync(input2, gpu_place, &input2_gpu);
  TensorCopySync(input3, gpu_place, &input3_gpu);

  float16* dev_a = input1_gpu.data<float16>();
  float16* dev_b = input2_gpu.data<float16>();
  float16* dev_c = input3_gpu.mutable_data<float16>(gpu_place);

  // lda = 3, ldb = 4, ldc = 4; B and C start at column 1.
  GetBlas<float16>(context).GEMM(false, false, m, n, k,
                                 static_cast<float16>(1), dev_a, 3, dev_b + 1,
                                 4, static_cast<float16>(1), dev_c + 1, 4);

  // The result overwrites the host copy of C that input3_ptr refers to.
  TensorCopySync(input3_gpu, cpu_place, &input3);

  context.Wait();

  // Column 0 of C is outside the GEMM and must keep its initial values.
  const float expected[8] = {0, 24, 28, 32, 4, 73, 86, 99};
  for (int i = 0; i < 8; ++i) {
    EXPECT_EQ(static_cast<float>(input3_ptr[i]), expected[i]);
  }
}

// fp32 GEMM with the right operand transposed. input2 stores
// np.arange(12).reshape(3, 4) transposed, i.e. as a 4x3 buffer; starting at
// b + 3 skips its first row, and C starts at column 1 of the 2x4 buffer.
// With alpha = beta = 1 the expected values match the non-transposed case:
//   out[:, 1:] = np.dot(a, arange(12).reshape(3, 4)[:, 1:]) + c[:, 1:]
TEST(math_function, gemm_trans_cublas_fp32) {
  using paddle::framework::Tensor;
  using paddle::framework::TensorCopySync;

  Tensor input1;
  Tensor input2;
  Tensor input3;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor input3_gpu;

  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);

  int m = 2;
  int n = 3;
  int k = 3;

  // Host operands: A is 2x3, the B buffer is 4x3, the C buffer is 2x4.
  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float a_values[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, a_values, sizeof(a_values));

  float* input2_ptr = input2.mutable_data<float>({4, 3}, cpu_place);
  float b_values[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
  memcpy(input2_ptr, b_values, sizeof(b_values));

  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
  float c_values[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  memcpy(input3_ptr, c_values, sizeof(c_values));

  TensorCopySync(input1, gpu_place, &input1_gpu);
  TensorCopySync(input2, gpu_place, &input2_gpu);
  TensorCopySync(input3, gpu_place, &input3_gpu);

  float* dev_a = input1_gpu.data<float>();
  float* dev_b = input2_gpu.data<float>();
  float* dev_c = input3_gpu.mutable_data<float>(gpu_place);

  // lda = 3, ldb = 3, ldc = 4; B starts at its second row, C at column 1.
  GetBlas<float>(context).GEMM(false, true, m, n, k, 1, dev_a, 3, dev_b + 3, 3,
                               1, dev_c + 1, 4);

  // The result overwrites the host copy of C that input3_ptr refers to.
  TensorCopySync(input3_gpu, cpu_place, &input3);

  context.Wait();

  // Column 0 of C is outside the GEMM and must keep its initial values.
  const float expected[8] = {0, 24, 28, 32, 4, 73, 86, 99};
  for (int i = 0; i < 8; ++i) {
    EXPECT_EQ(input3_ptr[i], expected[i]);
  }
}

// float16 GEMM with the right operand transposed. input2 stores
// np.arange(12).reshape(3, 4) transposed, i.e. as a 4x3 buffer; starting at
// b + 3 skips its first row, and C starts at column 1 of the 2x4 buffer.
// With alpha = beta = 1 the expected values match the non-transposed case:
//   out[:, 1:] = np.dot(a, arange(12).reshape(3, 4)[:, 1:]) + c[:, 1:]
TEST(math_function, gemm_trans_cublas_fp16) {
  using paddle::framework::Tensor;
  using paddle::framework::TensorCopySync;
  using paddle::platform::float16;

  Tensor input1;
  Tensor input2;
  Tensor input3;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor input3_gpu;

  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);

  // cublas fp16 GEMM needs GPU compute capability >= 53; on older devices
  // the test body is skipped (and therefore reported as passing).
  if (context.GetComputeCapability() < 53) {
    return;
  }

  int m = 2;
  int n = 3;
  int k = 3;

  // Host operands: A is 2x3, the B buffer is 4x3, the C buffer is 2x4.
  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});

  float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
  fill_fp16_data(input2_ptr, input2.numel(),
                 {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});

  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
  fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});

  TensorCopySync(input1, gpu_place, &input1_gpu);
  TensorCopySync(input2, gpu_place, &input2_gpu);
  TensorCopySync(input3, gpu_place, &input3_gpu);

  float16* dev_a = input1_gpu.data<float16>();
  float16* dev_b = input2_gpu.data<float16>();
  float16* dev_c = input3_gpu.mutable_data<float16>(gpu_place);

  // lda = 3, ldb = 3, ldc = 4; B starts at its second row, C at column 1.
  GetBlas<float16>(context).GEMM(false, true, m, n, k,
                                 static_cast<float16>(1), dev_a, 3, dev_b + 3,
                                 3, static_cast<float16>(1), dev_c + 1, 4);

  // The result overwrites the host copy of C that input3_ptr refers to.
  TensorCopySync(input3_gpu, cpu_place, &input3);

  context.Wait();

  // Column 0 of C is outside the GEMM and must keep its initial values.
  const float expected[8] = {0, 24, 28, 32, 4, 73, 86, 99};
  for (int i = 0; i < 8; ++i) {
    EXPECT_EQ(static_cast<float>(input3_ptr[i]), expected[i]);
  }
}
411 412 413

template <typename T>
void GemvTest(int m, int n, bool trans) {
414 415 416
  paddle::framework::Tensor mat_a;
  paddle::framework::Tensor vec_b;
  paddle::framework::Tensor vec_c;
417

418 419 420
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
421 422 423 424 425

  T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
  T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
  T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);

426 427 428
  paddle::framework::Tensor g_mat_a;
  paddle::framework::Tensor g_vec_b;
  paddle::framework::Tensor g_vec_c;
429 430 431
  T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
  T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
  T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
432 433 434 435 436 437 438 439

  for (int i = 0; i < mat_a.numel(); ++i) {
    data_a[i] = static_cast<T>(i);
  }
  for (int i = 0; i < vec_b.numel(); ++i) {
    data_b[i] = static_cast<T>(i);
  }

440 441
  paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
  paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
442

Y
Yu Yang 已提交
443 444
  GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n), 1.,
                           g_data_a, g_data_b, 0., g_data_c);
445

446
  paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472

  if (!trans) {
    for (int i = 0; i < m; ++i) {
      T sum = 0.0;
      for (int j = 0; j < n; ++j) {
        sum += data_a[i * n + j] * data_b[j];
      }
      ASSERT_FLOAT_EQ(data_c[i], sum);
    }
  } else {
    for (int i = 0; i < n; ++i) {
      T sum = 0.0;
      for (int j = 0; j < m; ++j) {
        sum += data_a[j * n + i] * data_b[j];
      }
      ASSERT_FLOAT_EQ(data_c[i], sum);
    }
  }
}

// Exercises GemvTest on a 3x13 matrix for float and double, first without
// and then with the transpose flag.
TEST(math_function, gemv) {
  const int rows = 3;
  const int cols = 13;
  const bool trans_flags[2] = {false, true};
  for (int idx = 0; idx < 2; ++idx) {
    GemvTest<float>(rows, cols, trans_flags[idx]);
    GemvTest<double>(rows, cols, trans_flags[idx]);
  }
}