Unverified commit 813f61d2
Authored Mar 15, 2022 by zhouweiwei2014; committed by GitHub on Mar 15, 2022
change CUDA implementation of randperm OP (#40464)
Parent: 6b7d4845
Showing 2 changed files with 219 additions and 18 deletions:

  paddle/phi/kernels/gpu/randperm_kernel.cu                 +142  -18
  python/paddle/fluid/tests/unittests/test_randperm_op.py    +77    -0
paddle/phi/kernels/gpu/randperm_kernel.cu
@@ -14,37 +14,161 @@
#include "paddle/phi/kernels/randperm_kernel.h"
#ifdef __NVCC__
#include <curand_kernel.h>
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hiprand_kernel.h>
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/randint_kernel.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
DECLARE_bool(use_curand);

namespace phi {
// After sorting by random keys, payloads that share an equal key are still in
// a deterministic relative order. One thread per run of equal keys re-shuffles
// those payloads with a per-thread Philox generator.
template <typename T>
__global__ void SwapRepeatKernel(
    int* key, T* data, int n, uint64_t seed, uint64_t offset) {
  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
  if (idx >= n - 1) return;

  // Only the first element of a run of equal keys performs the shuffle.
  bool first_repeat = false;
  if (key[idx] == key[idx + 1]) {
    if (idx == 0) {
      first_repeat = true;
    } else if (key[idx] != key[idx - 1]) {
      first_repeat = true;
    }
  }
  if (!first_repeat) return;

  // Length of the run of equal keys starting at idx.
  int repeat_size = 1;
  for (int i = idx; i < n - 1; ++i) {
    if (key[i] == key[i + 1]) {
      ++repeat_size;
    } else {
      break;
    }
  }

  // Fisher-Yates shuffle of data[idx .. idx + repeat_size - 1].
#ifdef __NVCC__
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx, offset, &state);
  for (int i = repeat_size - 1; i > 0; i--) {
    uint32_t r = curand(&state) % (i + 1);
#elif __HIPCC__
  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seed, idx, offset, &state);
  for (int i = repeat_size - 1; i > 0; i--) {
    uint32_t r = hiprand(&state) % (i + 1);
#endif
    if (r != i) {
      T tmp = data[idx + i];
      data[idx + i] = data[idx + r];
      data[idx + r] = tmp;
    }
  }
}
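SwapRepeatKernel exists because radix-sorting by random int32 keys is only a uniform shuffle when all keys are distinct: whenever keys collide, the relative order of their payloads is fixed by the sort, so each run of equal keys must be re-shuffled. As a plain illustration of that run-detection plus Fisher-Yates step, here is a minimal single-threaded host sketch; the name ShuffleRepeatedKeyRuns, the std::mt19937 generator, and the int payload type are illustrative stand-ins, not Paddle APIs (the kernel itself uses one Philox stream per thread via curand_init(seed, idx, offset, ...)).

#include <cstdio>
#include <random>
#include <vector>

// Host-side sketch: within each run of equal (sorted) keys, Fisher-Yates
// shuffle the payloads, mirroring SwapRepeatKernel's one-thread-per-run logic.
void ShuffleRepeatedKeyRuns(const std::vector<int>& key,
                            std::vector<int>* data,
                            std::mt19937* rng) {
  const int n = static_cast<int>(key.size());
  for (int run_start = 0; run_start < n;) {
    int run_end = run_start + 1;
    while (run_end < n && key[run_end] == key[run_start]) ++run_end;
    // Fisher-Yates over data[run_start .. run_end - 1].
    for (int i = run_end - run_start - 1; i > 0; --i) {
      std::uniform_int_distribution<int> pick(0, i);
      std::swap((*data)[run_start + i], (*data)[run_start + pick(*rng)]);
    }
    run_start = run_end;
  }
}

int main() {
  std::vector<int> key = {3, 7, 7, 7, 9};   // sorted keys with a run of 7s
  std::vector<int> data = {0, 1, 2, 3, 4};  // payloads carried by the sort
  std::mt19937 rng(2021);
  ShuffleRepeatedKeyRuns(key, &data, &rng);
  for (int v : data) std::printf("%d ", v);  // only positions 1..3 may move
  std::printf("\n");
  return 0;
}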
Note: the 18 removed lines in this hunk were the previous implementation, an unconditional host-side std::shuffle; it survives below as the fallback branch taken when FLAGS_use_curand is off.

template <typename T, typename Context>
void RandpermRawKernel(const Context& dev_ctx,
                       int n,
                       DataType dtype,
                       int seed,
                       DenseTensor* out) {
  if (FLAGS_use_curand) {
    // Draw a random int32 key per element, then radix-sort the pairs
    // (key, 0..n-1); the sorted payloads form the random permutation.
    DenseTensor key;
    RandintKernel<int, Context>(dev_ctx,
                                std::numeric_limits<int>::min(),
                                std::numeric_limits<int>::max(),
                                ScalarArray({n}),
                                phi::DataType::INT32,
                                &key);
    DenseTensor key_out = Empty<int, Context>(dev_ctx, ScalarArray({n}));

    DenseTensor range = Empty<T, Context>(dev_ctx, ScalarArray({n}));
    T* range_data = range.data<T>();
    funcs::ForRange<Context> for_range(dev_ctx, n);
    for_range([range_data] __device__(size_t idx) {
      range_data[idx] = static_cast<T>(idx);
    });

    out->Resize(phi::make_ddim({n}));
    T* out_data = dev_ctx.template Alloc<T>(out);

    // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to
    // improve performance of radix sort.
    double n_d = static_cast<double>(n);
    int begin_bit = 0;
    int end_bit =
        std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9))));

    // First call sizes the temporary storage; second call sorts.
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs<int, T>(nullptr,
                                            temp_storage_bytes,
                                            key.data<int>(),
                                            key_out.data<int>(),
                                            range.data<T>(),
                                            out_data,
                                            n,
                                            begin_bit,
                                            end_bit < 32 ? end_bit : 32,
                                            dev_ctx.stream());
    auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs<int, T>(d_temp_storage->ptr(),
                                            temp_storage_bytes,
                                            key.data<int>(),
                                            key_out.data<int>(),
                                            range.data<T>(),
                                            out_data,
                                            n,
                                            begin_bit,
                                            end_bit < 32 ? end_bit : 32,
                                            dev_ctx.stream());

    // Repair runs of equal keys so the permutation stays unbiased.
    auto gen_cuda = dev_ctx.GetGenerator();
    auto seed_offset = gen_cuda->IncrementOffset(n);
    uint64_t seed = seed_offset.first;
    uint64_t offset = seed_offset.second;

    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
    SwapRepeatKernel<T><<<config.block_per_grid.x,
                          config.thread_per_block.x,
                          0,
                          dev_ctx.stream()>>>(
        key_out.data<int>(), out_data, n, seed, offset);
  } else {
    DenseTensor tmp;
    tmp.Resize(phi::make_ddim({n}));
    T* tmp_data = dev_ctx.template HostAlloc<T>(&tmp);

    std::shared_ptr<std::mt19937_64> engine;
    if (seed) {
      engine = std::make_shared<std::mt19937_64>();
      engine->seed(seed);
    } else {
      engine = dev_ctx.GetHostGenerator()->GetCPUEngine();
    }

    for (int i = 0; i < n; ++i) {
      tmp_data[i] = static_cast<T>(i);
    }
    std::shuffle(tmp_data, tmp_data + n, *engine);

    T* out_data = dev_ctx.template Alloc<T>(out);
    auto size = out->numel() * paddle::experimental::SizeOf(out->dtype());
    paddle::memory::Copy<phi::GPUPlace, phi::Place>(
        out->place(), out_data, tmp.place(), tmp_data, size, 0);
  }
}
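The begin_bit/end_bit arguments carry the optimization the osf.io note describes: SortPairs only inspects the low end_bit bits of each key, and end_bit is chosen, via the birthday bound, as the fewest bits for which n keys stay collision-free with probability at least 0.9 (residual collisions are exactly what SwapRepeatKernel then repairs). A standalone sketch of that computation, with EndBit as a hypothetical helper name for illustration:

#include <cmath>
#include <cstdio>

// Fewest key bits to sort so that, by the birthday bound, n random keys
// collide with probability at most 1 - 0.9; clamped to the full 32 bits,
// matching the end_bit < 32 ? end_bit : 32 clamp in the SortPairs calls.
int EndBit(int n) {
  double n_d = static_cast<double>(n);
  int end_bit = static_cast<int>(
      std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))));
  return end_bit < 32 ? end_bit : 32;
}

int main() {
  for (int n : {100, 1000, 30000, 1000000}) {
    std::printf("n = %7d -> sort %2d of 32 key bits\n", n, EndBit(n));
  }
  return 0;
}

For n = 30000 (the size used in the test below) all 32 bits are still needed, but for n = 1000 only 23 bits are sorted and for n = 100 only 16, which lets cub::DeviceRadixSort skip whole radix passes.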
template <typename T, typename Context>
...
python/paddle/fluid/tests/unittests/test_randperm_op.py
@@ -18,6 +18,7 @@ from op_test import OpTest
import paddle
import paddle.fluid.core as core
from paddle.static import program_guard, Program
import os


def check_randperm_out(n, data_np):
...
@@ -129,5 +130,81 @@ class TestRandpermImperative(unittest.TestCase):
        paddle.enable_static()


class TestRandomValue(unittest.TestCase):
    def test_fixed_random_number(self):
        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
        if not paddle.is_compiled_with_cuda():
            return

        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
            return

        print("Test Fixed Random number on GPU------>")
        paddle.disable_static()
        paddle.set_device('gpu')
        paddle.seed(2021)

        x = paddle.randperm(30000, dtype='int32').numpy()
        expect = [
            24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413
        ]
        self.assertTrue(np.array_equal(x[0:10], expect))
        expect = [
            29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096
        ]
        self.assertTrue(np.array_equal(x[10000:10010], expect))
        expect = [
            298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343
        ]
        self.assertTrue(np.array_equal(x[20000:20010], expect))

        x = paddle.randperm(30000, dtype='int64').numpy()
        expect = [
            6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171
        ]
        self.assertTrue(np.array_equal(x[0:10], expect))
        expect = [
            23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818
        ]
        self.assertTrue(np.array_equal(x[10000:10010], expect))
        expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839]
        self.assertTrue(np.array_equal(x[20000:20010], expect))

        x = paddle.randperm(30000, dtype='float32').numpy()
        expect = [
            5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144.,
            22906., 10705.
        ]
        self.assertTrue(np.array_equal(x[0:10], expect))
        expect = [
            1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002.,
            4564., 26991.
        ]
        self.assertTrue(np.array_equal(x[10000:10010], expect))
        expect = [
            25580., 12606., 553., 16387., 29536., 4241., 20946., 16899.,
            16339., 4662.
        ]
        self.assertTrue(np.array_equal(x[20000:20010], expect))

        x = paddle.randperm(30000, dtype='float64').numpy()
        expect = [
            19051., 2449., 21940., 11121., 282., 7330., 13747., 24321.,
            21147., 9163.
        ]
        self.assertTrue(np.array_equal(x[0:10], expect))
        expect = [
            15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823.,
            14945., 17624.
        ]
        self.assertTrue(np.array_equal(x[10000:10010], expect))
        expect = [
            10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202.,
            21404.
        ]
        self.assertTrue(np.array_equal(x[20000:20010], expect))
        paddle.enable_static()


if __name__ == "__main__":
    unittest.main()