Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
d1bfb4b7
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d1bfb4b7
编写于
2月 10, 2023
作者:
H
Huang Jiyi
提交者:
GitHub
2月 10, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[phi decoupling] remove AllocatorFacade in phi (#50380)
* remove AllocatorFacade in phi * fix include * fix bugs
上级
13f57ec0
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
108 addition
and
153 deletion
+108
-153
paddle/phi/tests/common/test_scalar.cu
paddle/phi/tests/common/test_scalar.cu
+31
-48
paddle/phi/tests/kernels/strided_memcpy_test.cc
paddle/phi/tests/kernels/strided_memcpy_test.cc
+18
-20
paddle/phi/tests/kernels/test_math_function.cu
paddle/phi/tests/kernels/test_math_function.cu
+59
-85
未找到文件。
paddle/phi/tests/common/test_scalar.cu
浏览文件 @
d1bfb4b7
...
...
@@ -15,9 +15,9 @@ limitations under the License. */
#include <map> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
...
...
@@ -44,12 +44,10 @@ TEST(Scalar, ConstructFromDenseTensor1) {
alloc
.
get
(),
phi
::
DenseTensorMeta
(
phi
::
DataType
::
FLOAT16
,
phi
::
make_ddim
({
1
}),
phi
::
DataLayout
::
NCHW
));
phi
::
CPUContext
dev_ctx
;
dev_ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
phi
::
CPUPlace
())
.
get
());
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
reinterpret_cast
<
phi
::
CPUContext
*>
(
pool
.
Get
(
phi
::
CPUPlace
()));
auto
*
dense_x_data
=
dev_ctx
.
Alloc
<
float16
>
(
&
dense_x
);
auto
*
dense_x_data
=
dev_ctx
->
Alloc
<
float16
>
(
&
dense_x
);
dense_x_data
[
0
]
=
1
;
phi
::
Scalar
scalar_test
(
dense_x
);
ASSERT_NEAR
(
1
,
scalar_test
.
to
<
float16
>
(),
1e-6
);
...
...
@@ -63,12 +61,10 @@ TEST(Scalar, ConstructFromDenseTensor2) {
alloc
.
get
(),
phi
::
DenseTensorMeta
(
phi
::
DataType
::
INT16
,
phi
::
make_ddim
({
1
}),
phi
::
DataLayout
::
NCHW
));
phi
::
CPUContext
dev_ctx
;
dev_ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
phi
::
CPUPlace
())
.
get
());
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
reinterpret_cast
<
phi
::
CPUContext
*>
(
pool
.
Get
(
phi
::
CPUPlace
()));
auto
*
dense_x_data
=
dev_ctx
.
Alloc
<
int16_t
>
(
&
dense_x
);
auto
*
dense_x_data
=
dev_ctx
->
Alloc
<
int16_t
>
(
&
dense_x
);
dense_x_data
[
0
]
=
1
;
phi
::
Scalar
scalar_test
(
dense_x
);
ASSERT_EQ
(
1
,
scalar_test
.
to
<
int16_t
>
());
...
...
@@ -82,12 +78,10 @@ TEST(Scalar, ConstructFromDenseTensor3) {
alloc
.
get
(),
phi
::
DenseTensorMeta
(
phi
::
DataType
::
INT8
,
phi
::
make_ddim
({
1
}),
phi
::
DataLayout
::
NCHW
));
phi
::
CPUContext
dev_ctx
;
dev_ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
phi
::
CPUPlace
())
.
get
());
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
reinterpret_cast
<
phi
::
CPUContext
*>
(
pool
.
Get
(
phi
::
CPUPlace
()));
auto
*
dense_x_data
=
dev_ctx
.
Alloc
<
int8_t
>
(
&
dense_x
);
auto
*
dense_x_data
=
dev_ctx
->
Alloc
<
int8_t
>
(
&
dense_x
);
dense_x_data
[
0
]
=
1
;
phi
::
Scalar
scalar_test
(
dense_x
);
ASSERT_EQ
(
1
,
scalar_test
.
to
<
int8_t
>
());
...
...
@@ -101,12 +95,10 @@ TEST(Scalar, ConstructFromDenseTensor4) {
alloc
.
get
(),
phi
::
DenseTensorMeta
(
phi
::
DataType
::
BOOL
,
phi
::
make_ddim
({
1
}),
phi
::
DataLayout
::
NCHW
));
phi
::
CPUContext
dev_ctx
;
dev_ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
phi
::
CPUPlace
())
.
get
());
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
reinterpret_cast
<
phi
::
CPUContext
*>
(
pool
.
Get
(
phi
::
CPUPlace
()));
auto
*
dense_x_data
=
dev_ctx
.
Alloc
<
bool
>
(
&
dense_x
);
auto
*
dense_x_data
=
dev_ctx
->
Alloc
<
bool
>
(
&
dense_x
);
dense_x_data
[
0
]
=
true
;
phi
::
Scalar
scalar_test
(
dense_x
);
ASSERT_EQ
(
true
,
scalar_test
.
to
<
bool
>
());
...
...
@@ -120,12 +112,10 @@ TEST(Scalar, ConstructFromDenseTensor5) {
phi
::
DenseTensorMeta
(
phi
::
DataType
::
COMPLEX64
,
phi
::
make_ddim
({
1
}),
phi
::
DataLayout
::
NCHW
));
phi
::
CPUContext
dev_ctx
;
dev_ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
phi
::
CPUPlace
())
.
get
());
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
reinterpret_cast
<
phi
::
CPUContext
*>
(
pool
.
Get
(
phi
::
CPUPlace
()));
auto
*
dense_x_data
=
dev_ctx
.
Alloc
<
complex64
>
(
&
dense_x
);
auto
*
dense_x_data
=
dev_ctx
->
Alloc
<
complex64
>
(
&
dense_x
);
dense_x_data
[
0
]
=
1
;
phi
::
Scalar
scalar_test
(
dense_x
);
complex64
expected_value
(
1
,
0
);
...
...
@@ -140,12 +130,10 @@ TEST(Scalar, ConstructFromDenseTensor6) {
phi
::
DenseTensorMeta
(
phi
::
DataType
::
COMPLEX128
,
phi
::
make_ddim
({
1
}),
phi
::
DataLayout
::
NCHW
));
phi
::
CPUContext
dev_ctx
;
dev_ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
phi
::
CPUPlace
())
.
get
());
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
reinterpret_cast
<
phi
::
CPUContext
*>
(
pool
.
Get
(
phi
::
CPUPlace
()));
auto
*
dense_x_data
=
dev_ctx
.
Alloc
<
complex128
>
(
&
dense_x
);
auto
*
dense_x_data
=
dev_ctx
->
Alloc
<
complex128
>
(
&
dense_x
);
dense_x_data
[
0
]
=
1
;
phi
::
Scalar
scalar_test
(
dense_x
);
complex128
expected_value
(
1
,
0
);
...
...
@@ -160,15 +148,12 @@ TEST(Scalar, ConstructFromDenseTensor7) {
alloc
.
get
(),
phi
::
DenseTensorMeta
(
phi
::
DataType
::
FLOAT32
,
phi
::
make_ddim
({
1
}),
phi
::
DataLayout
::
NCHW
));
phi
::
GPUContext
dev_ctx
{
phi
::
GPUPlace
()};
dev_ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
phi
::
GPUPlace
())
.
get
());
dev_ctx
.
Init
();
auto
*
dense_x_data
=
dev_ctx
.
Alloc
<
float
>
(
&
dense_x
);
FillTensor
<<<
1
,
1
,
0
,
dev_ctx
.
stream
()
>>>
(
dense_x_data
);
dev_ctx
.
Wait
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
auto
*
dense_x_data
=
dev_ctx
->
Alloc
<
float
>
(
&
dense_x
);
FillTensor
<<<
1
,
1
,
0
,
dev_ctx
->
stream
()
>>>
(
dense_x_data
);
dev_ctx
->
Wait
();
phi
::
Scalar
scalar_test
(
dense_x
);
ASSERT_NEAR
(
1
,
scalar_test
.
to
<
float
>
(),
1e-6
);
}
...
...
@@ -182,14 +167,12 @@ TEST(Scalar, ConstructFromTensor) {
phi
::
DenseTensorMeta
(
phi
::
DataType
::
FLOAT32
,
phi
::
make_ddim
({
1
}),
phi
::
DataLayout
::
NCHW
));
phi
::
GPUContext
dev_ctx
{
phi
::
GPUPlace
()};
dev_ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
phi
::
GPUPlace
())
.
get
());
dev_ctx
.
Init
();
auto
*
dense_x_data
=
dev_ctx
.
Alloc
<
float
>
(
dense_x
.
get
());
FillTensor
<<<
1
,
1
,
0
,
dev_ctx
.
stream
()
>>>
(
dense_x_data
);
dev_ctx
.
Wait
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
auto
*
dense_x_data
=
dev_ctx
->
Alloc
<
float
>
(
dense_x
.
get
());
FillTensor
<<<
1
,
1
,
0
,
dev_ctx
->
stream
()
>>>
(
dense_x_data
);
dev_ctx
->
Wait
();
paddle
::
experimental
::
Tensor
x
(
dense_x
);
paddle
::
experimental
::
Scalar
scalar_test
(
x
);
ASSERT_NEAR
(
1
,
scalar_test
.
to
<
float
>
(),
1e-6
);
...
...
paddle/phi/tests/kernels/strided_memcpy_test.cc
浏览文件 @
d1bfb4b7
...
...
@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
namespace
phi
{
namespace
tests
{
...
...
@@ -89,16 +91,13 @@ TEST(StridedMemcpy, GPUCrop) {
phi
::
GPUPlace
gpu0
(
0
);
phi
::
CPUPlace
cpu
;
phi
::
GPUContext
ctx
(
gpu0
);
ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu0
,
ctx
.
stream
())
.
get
());
ctx
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
ctx
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
auto
src_allocation
=
paddle
::
memory
::
Alloc
(
gpu0
,
sizeof
(
src
));
int
*
gpu_src
=
reinterpret_cast
<
int
*>
(
src_allocation
->
ptr
());
paddle
::
memory
::
Copy
(
gpu0
,
gpu_src
,
cpu
,
src
,
sizeof
(
src
),
ctx
.
stream
());
paddle
::
memory
::
Copy
(
gpu0
,
gpu_src
,
cpu
,
src
,
sizeof
(
src
),
ctx
->
stream
());
phi
::
DDim
src_stride
({
5
,
1
});
...
...
@@ -110,10 +109,10 @@ TEST(StridedMemcpy, GPUCrop) {
phi
::
DDim
dst_stride
({
2
,
1
});
phi
::
funcs
::
StridedMemcpy
<
int
>
(
ctx
,
gpu_src
+
1
,
src_stride
,
dst_dim
,
dst_stride
,
gpu_dst
);
*
ctx
,
gpu_src
+
1
,
src_stride
,
dst_dim
,
dst_stride
,
gpu_dst
);
paddle
::
memory
::
Copy
(
cpu
,
dst
,
gpu0
,
gpu_dst
,
sizeof
(
dst
),
ctx
.
stream
());
ctx
.
Wait
();
paddle
::
memory
::
Copy
(
cpu
,
dst
,
gpu0
,
gpu_dst
,
sizeof
(
dst
),
ctx
->
stream
());
ctx
->
Wait
();
ASSERT_EQ
(
1
,
dst
[
0
]);
ASSERT_EQ
(
2
,
dst
[
1
]);
...
...
@@ -131,14 +130,13 @@ TEST(StridedMemcpy, GPUConcat) {
phi
::
GPUPlace
gpu0
(
0
);
phi
::
CPUPlace
cpu
;
phi
::
GPUContext
ctx
(
gpu0
);
ctx
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu0
,
ctx
.
stream
())
.
get
());
ctx
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
ctx
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
auto
gpu_src_allocation
=
paddle
::
memory
::
Alloc
(
gpu0
,
sizeof
(
src
));
int
*
gpu_src
=
reinterpret_cast
<
int
*>
(
gpu_src_allocation
->
ptr
());
paddle
::
memory
::
Copy
(
gpu0
,
gpu_src
,
cpu
,
src
,
sizeof
(
src
),
ctx
.
stream
());
paddle
::
memory
::
Copy
(
gpu0
,
gpu_src
,
cpu
,
src
,
sizeof
(
src
),
ctx
->
stream
());
int
dst
[
8
];
auto
gpu_dst_allocation
=
paddle
::
memory
::
Alloc
(
gpu0
,
sizeof
(
dst
));
...
...
@@ -149,12 +147,12 @@ TEST(StridedMemcpy, GPUConcat) {
phi
::
DDim
dst_stride
({
4
,
1
});
phi
::
funcs
::
StridedMemcpy
<
int
>
(
ctx
,
gpu_src
,
src_stride
,
dst_dim
,
dst_stride
,
gpu_dst
);
*
ctx
,
gpu_src
,
src_stride
,
dst_dim
,
dst_stride
,
gpu_dst
);
phi
::
funcs
::
StridedMemcpy
<
int
>
(
ctx
,
gpu_src
,
src_stride
,
dst_dim
,
dst_stride
,
gpu_dst
+
2
);
*
ctx
,
gpu_src
,
src_stride
,
dst_dim
,
dst_stride
,
gpu_dst
+
2
);
paddle
::
memory
::
Copy
(
cpu
,
dst
,
gpu0
,
gpu_dst
,
sizeof
(
dst
),
ctx
.
stream
());
ctx
.
Wait
();
paddle
::
memory
::
Copy
(
cpu
,
dst
,
gpu0
,
gpu_dst
,
sizeof
(
dst
),
ctx
->
stream
());
ctx
->
Wait
();
// clang-format off
int
expect_dst
[]
=
{
...
...
paddle/phi/tests/kernels/test_math_function.cu
浏览文件 @
d1bfb4b7
...
...
@@ -14,6 +14,7 @@
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
...
...
@@ -51,11 +52,8 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
float
*
input1_ptr
=
input1
.
mutable_data
<
float
>
({
2
,
3
},
cpu_place
);
float
arr
[
6
]
=
{
0
,
1
,
2
,
3
,
4
,
5
};
...
...
@@ -65,13 +63,13 @@ TEST(math_function, notrans_mul_trans_fp32) {
paddle
::
framework
::
TensorCopySync
(
input1
,
gpu_place
,
&
input2_gpu
);
out_gpu
.
mutable_data
<
float
>
({
2
,
2
},
gpu_place
);
GetBlas
<
float
>
(
context
).
MatMul
(
GetBlas
<
float
>
(
*
context
).
MatMul
(
input1_gpu
,
false
,
input2_gpu
,
true
,
1
,
&
out_gpu
,
0
);
paddle
::
framework
::
TensorCopySync
(
out_gpu
,
cpu_place
,
&
out
);
float
*
out_ptr
=
out
.
data
<
float
>
();
context
.
Wait
();
context
->
Wait
();
EXPECT_EQ
(
out_ptr
[
0
],
5
);
EXPECT_EQ
(
out_ptr
[
1
],
14
);
EXPECT_EQ
(
out_ptr
[
2
],
14
);
...
...
@@ -87,14 +85,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
if
(
context
->
GetComputeCapability
()
<
53
)
{
return
;
}
...
...
@@ -107,18 +102,18 @@ TEST(math_function, notrans_mul_trans_fp16) {
out_gpu
.
mutable_data
<
phi
::
dtype
::
float16
>
({
2
,
2
},
gpu_place
);
GetBlas
<
phi
::
dtype
::
float16
>
(
context
).
MatMul
(
input1_gpu
,
false
,
input2_gpu
,
true
,
phi
::
dtype
::
float16
(
1
),
&
out_gpu
,
phi
::
dtype
::
float16
(
0
));
GetBlas
<
phi
::
dtype
::
float16
>
(
*
context
).
MatMul
(
input1_gpu
,
false
,
input2_gpu
,
true
,
phi
::
dtype
::
float16
(
1
),
&
out_gpu
,
phi
::
dtype
::
float16
(
0
));
paddle
::
framework
::
TensorCopySync
(
out_gpu
,
cpu_place
,
&
out
);
phi
::
dtype
::
float16
*
out_ptr
=
out
.
data
<
phi
::
dtype
::
float16
>
();
context
.
Wait
();
context
->
Wait
();
EXPECT_EQ
(
static_cast
<
float
>
(
out_ptr
[
0
]),
5
);
EXPECT_EQ
(
static_cast
<
float
>
(
out_ptr
[
1
]),
14
);
EXPECT_EQ
(
static_cast
<
float
>
(
out_ptr
[
2
]),
14
);
...
...
@@ -134,11 +129,8 @@ TEST(math_function, trans_mul_notrans_fp32) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
float
*
input1_ptr
=
input1
.
mutable_data
<
float
>
({
2
,
3
},
cpu_place
);
float
arr
[
6
]
=
{
0
,
1
,
2
,
3
,
4
,
5
};
...
...
@@ -149,13 +141,13 @@ TEST(math_function, trans_mul_notrans_fp32) {
out_gpu
.
mutable_data
<
float
>
({
3
,
3
},
gpu_place
);
GetBlas
<
float
>
(
context
).
MatMul
(
GetBlas
<
float
>
(
*
context
).
MatMul
(
input1_gpu
,
true
,
input2_gpu
,
false
,
1
,
&
out_gpu
,
0
);
paddle
::
framework
::
TensorCopySync
(
out_gpu
,
cpu_place
,
&
out
);
float
*
out_ptr
=
out
.
data
<
float
>
();
context
.
Wait
();
context
->
Wait
();
EXPECT_EQ
(
out_ptr
[
0
],
9
);
EXPECT_EQ
(
out_ptr
[
1
],
12
);
EXPECT_EQ
(
out_ptr
[
2
],
15
);
...
...
@@ -176,14 +168,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
if
(
context
->
GetComputeCapability
()
<
53
)
{
return
;
}
...
...
@@ -196,18 +185,18 @@ TEST(math_function, trans_mul_notrans_fp16) {
out_gpu
.
mutable_data
<
phi
::
dtype
::
float16
>
({
3
,
3
},
gpu_place
);
GetBlas
<
phi
::
dtype
::
float16
>
(
context
).
MatMul
(
input1_gpu
,
true
,
input2_gpu
,
false
,
phi
::
dtype
::
float16
(
1
),
&
out_gpu
,
phi
::
dtype
::
float16
(
0
));
GetBlas
<
phi
::
dtype
::
float16
>
(
*
context
).
MatMul
(
input1_gpu
,
true
,
input2_gpu
,
false
,
phi
::
dtype
::
float16
(
1
),
&
out_gpu
,
phi
::
dtype
::
float16
(
0
));
paddle
::
framework
::
TensorCopySync
(
out_gpu
,
cpu_place
,
&
out
);
phi
::
dtype
::
float16
*
out_ptr
=
out
.
data
<
phi
::
dtype
::
float16
>
();
context
.
Wait
();
context
->
Wait
();
EXPECT_EQ
(
static_cast
<
float
>
(
out_ptr
[
0
]),
9
);
EXPECT_EQ
(
static_cast
<
float
>
(
out_ptr
[
1
]),
12
);
EXPECT_EQ
(
static_cast
<
float
>
(
out_ptr
[
2
]),
15
);
...
...
@@ -229,11 +218,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
int
m
=
2
;
int
n
=
3
;
...
...
@@ -255,7 +241,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
float
*
b
=
input2_gpu
.
data
<
float
>
();
float
*
c
=
input3_gpu
.
mutable_data
<
float
>
(
gpu_place
);
GetBlas
<
float
>
(
context
).
GEMM
(
GetBlas
<
float
>
(
*
context
).
GEMM
(
false
,
false
,
m
,
n
,
k
,
1
,
a
,
3
,
b
+
1
,
4
,
1
,
c
+
1
,
4
);
paddle
::
framework
::
TensorCopySync
(
input3_gpu
,
cpu_place
,
&
input3
);
...
...
@@ -266,7 +252,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
// c = np.arange(8).reshape(2, 4)[:, 1:]
// out = np.arange(8).reshape(2, 4)
// out[:, 1:] = np.dot(a, b) + c
context
.
Wait
();
context
->
Wait
();
EXPECT_EQ
(
input3_ptr
[
0
],
0
);
EXPECT_EQ
(
input3_ptr
[
1
],
24
);
EXPECT_EQ
(
input3_ptr
[
2
],
28
);
...
...
@@ -287,14 +273,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
if
(
context
->
GetComputeCapability
()
<
53
)
{
return
;
}
...
...
@@ -320,7 +303,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
phi
::
dtype
::
float16
*
c
=
input3_gpu
.
mutable_data
<
phi
::
dtype
::
float16
>
(
gpu_place
);
GetBlas
<
phi
::
dtype
::
float16
>
(
context
).
GEMM
(
GetBlas
<
phi
::
dtype
::
float16
>
(
*
context
).
GEMM
(
false
,
false
,
m
,
...
...
@@ -343,7 +326,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
// c = np.arange(8).reshape(2, 4)[:, 1:]
// out = np.arange(8).reshape(2, 4)
// out[:, 1:] = np.dot(a, b) + c
context
.
Wait
();
context
->
Wait
();
EXPECT_EQ
(
static_cast
<
float
>
(
input3_ptr
[
0
]),
0
);
EXPECT_EQ
(
static_cast
<
float
>
(
input3_ptr
[
1
]),
24
);
EXPECT_EQ
(
static_cast
<
float
>
(
input3_ptr
[
2
]),
28
);
...
...
@@ -364,11 +347,8 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
int
m
=
2
;
int
n
=
3
;
...
...
@@ -390,12 +370,12 @@ TEST(math_function, gemm_trans_cublas_fp32) {
float
*
b
=
input2_gpu
.
data
<
float
>
();
float
*
c
=
input3_gpu
.
mutable_data
<
float
>
(
gpu_place
);
GetBlas
<
float
>
(
context
).
GEMM
(
GetBlas
<
float
>
(
*
context
).
GEMM
(
false
,
true
,
m
,
n
,
k
,
1
,
a
,
3
,
b
+
3
,
3
,
1
,
c
+
1
,
4
);
paddle
::
framework
::
TensorCopySync
(
input3_gpu
,
cpu_place
,
&
input3
);
context
.
Wait
();
context
->
Wait
();
EXPECT_EQ
(
input3_ptr
[
0
],
0
);
EXPECT_EQ
(
input3_ptr
[
1
],
24
);
EXPECT_EQ
(
input3_ptr
[
2
],
28
);
...
...
@@ -416,14 +396,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
// fp16 GEMM in cublas requires GPU compute capability >= 53
if
(
context
.
GetComputeCapability
()
<
53
)
{
if
(
context
->
GetComputeCapability
()
<
53
)
{
return
;
}
...
...
@@ -449,7 +426,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
phi
::
dtype
::
float16
*
c
=
input3_gpu
.
mutable_data
<
phi
::
dtype
::
float16
>
(
gpu_place
);
GetBlas
<
phi
::
dtype
::
float16
>
(
context
).
GEMM
(
GetBlas
<
phi
::
dtype
::
float16
>
(
*
context
).
GEMM
(
false
,
true
,
m
,
...
...
@@ -466,7 +443,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
paddle
::
framework
::
TensorCopySync
(
input3_gpu
,
cpu_place
,
&
input3
);
context
.
Wait
();
context
->
Wait
();
EXPECT_EQ
(
static_cast
<
float
>
(
input3_ptr
[
0
]),
0
);
EXPECT_EQ
(
static_cast
<
float
>
(
input3_ptr
[
1
]),
24
);
EXPECT_EQ
(
static_cast
<
float
>
(
input3_ptr
[
2
]),
28
);
...
...
@@ -485,11 +462,8 @@ void GemvTest(int m, int n, bool trans) {
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CUDAPlace
gpu_place
(
0
);
phi
::
GPUContext
context
(
gpu_place
);
context
.
SetAllocator
(
paddle
::
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
gpu_place
,
context
.
stream
())
.
get
());
context
.
PartialInitWithAllocator
();
phi
::
DeviceContextPool
&
pool
=
phi
::
DeviceContextPool
::
Instance
();
auto
*
context
=
reinterpret_cast
<
phi
::
GPUContext
*>
(
pool
.
Get
(
phi
::
GPUPlace
()));
T
*
data_a
=
mat_a
.
mutable_data
<
T
>
({
m
,
n
},
cpu_place
);
T
*
data_b
=
vec_b
.
mutable_data
<
T
>
({
trans
?
m
:
n
},
cpu_place
);
...
...
@@ -512,14 +486,14 @@ void GemvTest(int m, int n, bool trans) {
paddle
::
framework
::
TensorCopySync
(
mat_a
,
gpu_place
,
&
g_mat_a
);
paddle
::
framework
::
TensorCopySync
(
vec_b
,
gpu_place
,
&
g_vec_b
);
GetBlas
<
T
>
(
context
).
GEMV
(
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
g_data_a
,
g_data_b
,
0.
,
g_data_c
);
GetBlas
<
T
>
(
*
context
).
GEMV
(
trans
,
static_cast
<
int
>
(
m
),
static_cast
<
int
>
(
n
),
1.
,
g_data_a
,
g_data_b
,
0.
,
g_data_c
);
paddle
::
framework
::
TensorCopySync
(
g_vec_c
,
cpu_place
,
&
vec_c
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录