BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit bbbd75e4 (unverified)
Authored by zhouweiwei2014 on Jan 18, 2022; committed via GitHub on Jan 18, 2022
change CUDA implementation of uniform/gaussian OP (#38611)

* change CUDA implementation of uniform/gaussian OP
* fix unittest
Parent: a998c077
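The commit gates the new cuRAND/hipRAND Philox path behind the FLAGS_use_curand flag it adds. A minimal sketch of exercising the new path from Python, assuming a CUDA build of Paddle (paddle.set_flags is the usual interface for exported flags, and seed 2021 matches the new unit tests):

import paddle

# A sketch, not part of the commit: opt in to the new cuRAND-based
# kernels (the flag defaults to False) and draw seeded GPU samples.
paddle.set_flags({'FLAGS_use_curand': True})
paddle.set_device('gpu')
paddle.seed(2021)

u = paddle.rand([4])   # handled by GPUUniformRandomKernel below
g = paddle.randn([4])  # handled by GPUGaussianRandomKernel below
print(u.numpy(), g.numpy())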
Showing 10 changed files with 182 additions and 20 deletions (+182, -20)
paddle/fluid/operators/distribution_helper.h                       +42   -2
paddle/fluid/operators/gaussian_random_op.cu                       +21   -7
paddle/fluid/operators/uniform_random_op.cu                        +23   -8
paddle/fluid/platform/flags.cc                                      +2   -0
paddle/scripts/paddle_build.bat                                     +1   -0
paddle/scripts/paddle_build.sh                                      +2   -0
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py    +1   -1
python/paddle/fluid/tests/unittests/test_adamw_op.py                +2   -2
python/paddle/fluid/tests/unittests/test_gaussian_random_op.py     +44   -0
python/paddle/fluid/tests/unittests/test_uniform_random_op.py      +44   -0
paddle/fluid/operators/distribution_helper.h

@@ -21,17 +21,26 @@ limitations under the License. */
 #include <hiprand_kernel.h>
 #endif
 
+#include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
+#if !defined(_WIN32)
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+#else
+// there is no equivalent intrinsics in msvc.
+#define UNLIKELY(condition) (condition)
+#endif
+
 namespace paddle {
 namespace distribution {
 
 using Tensor = framework::Tensor;
 
 /********************* Transformation Function **********************/
 template <typename T>
 struct exponential_transform {
   explicit exponential_transform(T lambda) : lambda_(lambda) {}
@@ -52,7 +61,37 @@ struct exponential_transform {
   T lambda_;
 };
 
+template <typename T>
+struct uniform_transform {
+  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}
+
+  HOSTDEVICE inline T operator()(T val) const {
+    if (UNLIKELY(val == static_cast<T>(1.0))) {
+      return min_;
+    } else {
+      return val * range_ + min_;
+    }
+  }
+
+ private:
+  T range_;
+  T min_;
+};
+
+template <typename T>
+struct normal_transform {
+  explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}
+
+  HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }
+
+ private:
+  T mean_;
+  T std_;
+};
+
 #if defined(__NVCC__) || defined(__HIPCC__)
 
 /*********************** Distribution Function *************************/
 template <typename T>
 struct uniform_distribution;
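The two new functors are simple affine maps over a raw sample. A Python sketch of the same logic (an illustration, not part of the commit), assuming the raw uniform sample lies in (0, 1] as cuRAND's uniform generators produce:

def uniform_transform(val, lo, hi):
    # cuRAND uniform output lies in (0, 1]; sending val == 1.0 to lo
    # keeps the transformed sample inside the half-open range [lo, hi).
    if val == 1.0:
        return lo
    return val * (hi - lo) + lo

def normal_transform(val, mean, std):
    # scale and shift a standard-normal sample
    return val * std + mean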
@@ -132,6 +171,7 @@ struct normal_distribution<double> {
 };
 #endif
 
 /******** Launch GPU function of distribution and transformation *********/
 template <typename T, typename DistOp, typename TransformOp>
 __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
                                    DistOp dist, TransformOp trans,
@@ -151,8 +191,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
   for (size_t j = 0; j < returns_count; j++) {
     size_t index = i + j * total_thread;
     if (index < size) {
-      auto random = static_cast<T>((&random_tuple.x)[j]);
-      out_data[index] = trans(random);
+      auto random = (&random_tuple.x)[j];
+      out_data[index] = static_cast<T>(trans(random));
     }
   }
 }
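This kernel change reorders the narrowing cast: the transform now runs in the generator's native (wider) type and only the final value is cast to T. A NumPy sketch of one consequence for low-precision outputs (illustrative values, not from the commit):

import numpy as np

# Raw Philox sample in float32, strictly less than 1.0.
val = np.float32(0.99998)

# Old order: cast to float16 first -- the sample collapses to exactly 1.0,
# so a transform onto [lo, hi) could return hi, outside the promised range.
print(np.float16(val) == np.float16(1.0))  # True

# New order: transform in float32 (where val != 1.0) and cast the result
# last, so the [lo, hi) guarantee of uniform_transform is preserved.
print(val == np.float32(1.0))  # False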
paddle/fluid/operators/gaussian_random_op.cu

@@ -19,8 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/fill_constant_op.h"
 
+DECLARE_bool(use_curand);
 
 namespace paddle {
 namespace operators {
@@ -65,7 +68,10 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     thrust::counting_iterator<int64_t> index_sequence_begin(0);
     auto shape = GetShape(context);
     tensor->Resize(shape);
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
     int64_t size = tensor->numel();
@@ -73,12 +79,20 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::normal_distribution<MT> dist;
+        distribution::normal_transform<MT> trans(mean, std);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            GaussianGenerator<T>(mean, std, seed_offset.first, gen_offset));
+      }
     } else {
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
                         thrust::device_ptr<T>(data),
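In the new branch, MPTypeTrait picks a wider compute type for low-precision tensors (float for float16), so sampling and the normal transform happen in full precision before the result narrows to T. A rough Python analogue (the function name and dtype mapping are illustrative assumptions):

import numpy as np

def gaussian_fill(shape, mean, std, out_dtype=np.float16, seed=2021):
    # Analogue of MPTypeTrait<T>::Type: compute in float32 for float16 outputs.
    mt = np.float32 if out_dtype == np.float16 else out_dtype
    rng = np.random.default_rng(seed)
    samples = rng.standard_normal(shape).astype(mt)         # normal_distribution<MT>
    return (samples * mt(std) + mt(mean)).astype(out_dtype) # normal_transform<MT>, cast to T

x = gaussian_fill((2, 3), mean=0.0, std=1.0)
print(x.dtype, x)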
paddle/fluid/operators/uniform_random_op.cu

@@ -18,7 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/distribution_helper.h"
 #include "paddle/fluid/operators/uniform_random_op.h"
 
+DECLARE_bool(use_curand);
 
 namespace paddle {
 namespace operators {
@@ -123,7 +128,9 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
           "unsupport type: %s.", framework::ToTypeName(out_var->Type())));
     }
-    T* data = tensor->mutable_data<T>(context.GetPlace());
+    auto& dev_cxt =
+        context.template device_context<platform::CUDADeviceContext>();
+    T* data = tensor->mutable_data<T>(dev_cxt.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     bool seed_flag = false;
     if (seed == 0) {
@@ -144,13 +151,21 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     int device_id = context.GetPlace().GetDeviceId();
     auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
     if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
-                                    diag_step, diag_val, gen_offset));
+      if (FLAGS_use_curand) {
+        using MT = typename details::MPTypeTrait<T>::Type;
+        distribution::uniform_distribution<MT> dist;
+        distribution::uniform_transform<MT> trans(min, max);
+        distribution::distribution_and_transform<T>(dev_cxt, tensor, dist,
+                                                    trans);
+      } else {
+        auto seed_offset = gen_cuda->IncrementOffset(1);
+        int64_t gen_offset = size * seed_offset.second;
+        thrust::transform(
+            index_sequence_begin, index_sequence_begin + size,
+            thrust::device_ptr<T>(data),
+            UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
+                                      diag_step, diag_val, gen_offset));
+      }
     } else {
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
paddle/fluid/platform/flags.cc

@@ -545,6 +545,8 @@ PADDLE_DEFINE_EXPORTED_double(
  */
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
 
+PADDLE_DEFINE_EXPORTED_bool(use_curand, false, "Random OP use CURAND");
+
 /**
  * Debug related FLAG
  * Name: FLAGS_call_stack_level
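Because the flag is defined with PADDLE_DEFINE_EXPORTED_bool, it can be toggled from outside the binary. A sketch, assuming the usual exported-flag interfaces (the CI scripts changed below use the equivalent shell form):

import os

# Setting the environment variable before paddle is imported lets the
# exported flag be picked up at startup.
os.environ['FLAGS_use_curand'] = 'True'

import paddle

# paddle.get_flags / paddle.set_flags also work for exported flags
# (that this particular flag is exposed through them is an assumption).
print(paddle.get_flags(['FLAGS_use_curand']))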
paddle/scripts/paddle_build.bat

@@ -662,6 +662,7 @@ for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%#
 set start=%start:~4,10%
 
 set FLAGS_call_stack_level=2
+set FLAGS_use_curand=True
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib
 dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin
 dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin
paddle/scripts/paddle_build.sh

@@ -61,6 +61,8 @@ function init() {
     # NOTE(chenweihang): For easy debugging, CI displays the C++ error stacktrace by default
     export FLAGS_call_stack_level=2
+    export FLAGS_use_curand=True
+
     # set CI_SKIP_CPP_TEST if only *.py changed
     # In order to avoid using in some CI(such as daily performance), the current
     # branch must not be `${BRANCH}` which is usually develop.
python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py

@@ -274,7 +274,7 @@ class TestDistTraning(unittest.TestCase):
         seq_length = 16
         class_size_per_card = 2
         vocab_size = class_size_per_card * self.model_parallel_size
-        seed = 1025
+        seed = 100
 
         set_random_seed(seed)
         rank_id = dist.get_rank()
python/paddle/fluid/tests/unittests/test_adamw_op.py

@@ -333,7 +333,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
             lr_ratio=simple_lr_fun)
 
-        loss_ref = np.array(
-            [4.8383293, 3.084947, 1.3323904, -0.41943002, -2.1710064])
+        loss_ref = np.array(
+            [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625])
         for i in range(5):
             a1 = linear1(a)
             out = linear2(a1)

@@ -379,7 +379,7 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
         exe.run(startup)
-        loss_ref = np.array(
-            [0.36120513, 0.2720821, 0.67208904, 0.14607805, 0.24098626])
+        loss_ref = np.array(
+            [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702])
         for i in range(5):
             inputs = np.random.random(size=[8, 10]).astype('float32')
             outputs = np.random.random(size=[8, 1]).astype('float32')
python/paddle/fluid/tests/unittests/test_gaussian_random_op.py

@@ -287,5 +287,49 @@ class TestStandardNormalDtype(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Note(zhouwei): The Number of threads is determined by
+        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
+        # GPU have different number of threads, which result in different
+        # random value. Only test on V100 GPU here.
+        if not "V100" in paddle.device.cuda.get_device_name():
+            return
+
+        def _check_random_value(dtype, expect, expect_mean, expect_std):
+            x = paddle.randn([32, 3, 1024, 1024], dtype=dtype)
+            actual = x.numpy()
+            self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect))
+            self.assertTrue(np.mean(actual), expect_mean)
+            self.assertTrue(np.std(actual), expect_std)
+
+        print("Test Fixed Random number on V100 GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(2021)
+        expect = [
+            -0.79037829, -0.54411126, -0.32266671, 0.35791815, 1.44169267,
+            -0.87785644, -1.23909874, -2.18194139, 0.49489656, 0.40703062
+        ]
+        expect_mean = -0.0000053026194133403266873214888799115129813799285329878330230713
+        expect_std = 0.99999191058126390974081232343451119959354400634765625
+        _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean,
+                            expect_std)
+
+        expect = [
+            -0.7988942, 1.8644791, 0.02782744, 1.3692524, 0.6419724,
+            0.12436751, 0.12058455, -1.9984808, 1.5635862, 0.18506318
+        ]
+        expect_mean = -0.00004762359094456769526004791259765625
+        expect_std = 0.999975681304931640625
+        _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean,
+                            expect_std)
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
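Note that the mean/std checks above call assertTrue(np.mean(actual), expect_mean), which treats the second argument as the failure message and never compares the two values; the uniform test below uses assertEqual instead. A stricter form of those two checks (a suggestion, not part of the commit) would be:

# Suggested tightening of the mean/std assertions above (not in the commit):
self.assertTrue(np.allclose(np.mean(actual), expect_mean))
self.assertTrue(np.allclose(np.std(actual), expect_std))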
python/paddle/fluid/tests/unittests/test_uniform_random_op.py

@@ -562,5 +562,49 @@ class TestUniformDtype(unittest.TestCase):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Note(zhouwei): The Number of threads is determined by
+        # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different
+        # GPU have different number of threads, which result in different
+        # random value. Only test on V100 GPU here.
+        if not "V100" in paddle.device.cuda.get_device_name():
+            return
+
+        def _check_random_value(dtype, expect, expect_mean, expect_std):
+            x = paddle.rand([32, 3, 1024, 1024], dtype=dtype)
+            actual = x.numpy()
+            self.assertTrue(np.allclose(actual[2, 1, 512, 1000:1010], expect))
+            self.assertEqual(np.mean(actual), expect_mean)
+            self.assertEqual(np.std(actual), expect_std)
+
+        print("Test Fixed Random number on V100 GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(2021)
+        expect = [
+            0.55298901, 0.65184678, 0.49375412, 0.57943639, 0.16459608,
+            0.67181056, 0.03021481, 0.0238559, 0.07742096, 0.55972187
+        ]
+        expect_mean = 0.50000454338820143895816272561205551028251647949218750
+        expect_std = 0.28867379167297479991560749112977646291255950927734375
+        _check_random_value(core.VarDesc.VarType.FP64, expect, expect_mean,
+                            expect_std)
+
+        expect = [
+            0.45320973, 0.17582087, 0.725341, 0.30849215, 0.622257,
+            0.46352342, 0.97228295, 0.12771158, 0.286525, 0.9810645
+        ]
+        expect_mean = 0.50002604722976684570312500
+        expect_std = 0.2886914908885955810546875
+        _check_random_value(core.VarDesc.VarType.FP32, expect, expect_mean,
+                            expect_std)
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()