Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
75d15719
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
75d15719
编写于
9月 03, 2019
作者:
T
Tao Luo
提交者:
GitHub
9月 03, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refine PADDLE_ENFORCE codes for unify PADDLE_ASSERT_MSG (#19603)
test=develop
上级
1c2aae56
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
103 addition
and
91 deletion
+103
-91
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+2
-2
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+3
-3
paddle/fluid/framework/tensor_util.h
paddle/fluid/framework/tensor_util.h
+1
-1
paddle/fluid/operators/detail/safe_ref.h
paddle/fluid/operators/detail/safe_ref.h
+1
-1
paddle/fluid/operators/dgc_op.h
paddle/fluid/operators/dgc_op.h
+4
-3
paddle/fluid/operators/dropout_op.cu
paddle/fluid/operators/dropout_op.cu
+4
-3
paddle/fluid/operators/math/im2col.cu
paddle/fluid/operators/math/im2col.cu
+8
-8
paddle/fluid/operators/math/sample_prob.cu
paddle/fluid/operators/math/sample_prob.cu
+3
-3
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+5
-5
paddle/fluid/operators/math/vol2col.cu
paddle/fluid/operators/math/vol2col.cu
+4
-4
paddle/fluid/operators/optimizers/lars_momentum_op.h
paddle/fluid/operators/optimizers/lars_momentum_op.h
+1
-1
paddle/fluid/operators/sample_logits_op.h
paddle/fluid/operators/sample_logits_op.h
+12
-10
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+3
-3
paddle/fluid/operators/sync_batch_norm_op.cu
paddle/fluid/operators/sync_batch_norm_op.cu
+2
-2
paddle/fluid/platform/cuda_helper.h
paddle/fluid/platform/cuda_helper.h
+6
-4
paddle/fluid/platform/cudnn_helper.h
paddle/fluid/platform/cudnn_helper.h
+26
-20
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+2
-2
paddle/fluid/platform/nccl_helper.h
paddle/fluid/platform/nccl_helper.h
+12
-12
paddle/fluid/platform/profiler.cu
paddle/fluid/platform/profiler.cu
+4
-4
未找到文件。
paddle/fluid/framework/ir/node.h
浏览文件 @
75d15719
...
...
@@ -66,12 +66,12 @@ class Node {
std
::
string
Name
()
const
{
return
name_
;
}
VarDesc
*
Var
()
const
{
PADDLE_ENFORCE
(
IsVar
()
);
PADDLE_ENFORCE
_EQ
(
IsVar
(),
true
);
return
var_desc_
.
get
();
}
OpDesc
*
Op
()
const
{
PADDLE_ENFORCE
(
IsOp
()
);
PADDLE_ENFORCE
_EQ
(
IsOp
(),
true
);
return
op_desc_
.
get
();
}
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
75d15719
...
...
@@ -53,7 +53,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
);
auto
ctx_place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx_place
)
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_gpu_place
(
ctx_place
),
true
);
auto
ctx_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx_place
);
PADDLE_ENFORCE_EQ
(
src_gpu_place
,
ctx_gpu_place
);
auto
stream
=
...
...
@@ -64,7 +64,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto
src_cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
auto
ctx_place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx_place
)
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_gpu_place
(
ctx_place
),
true
);
auto
ctx_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx_place
);
PADDLE_ENFORCE_EQ
(
dst_gpu_place
,
ctx_gpu_place
);
auto
stream
=
...
...
@@ -75,7 +75,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
auto
ctx_place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx_place
)
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_gpu_place
(
ctx_place
),
true
);
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
if
(
platform
::
is_same_place
(
src_place
,
dst_place
))
{
...
...
paddle/fluid/framework/tensor_util.h
浏览文件 @
75d15719
...
...
@@ -146,7 +146,7 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
dst
->
resize
(
src
.
numel
());
auto
dst_ptr
=
static_cast
<
void
*>
(
dst
->
data
());
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
src
.
place
())
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_cpu_place
(
src
.
place
()),
true
);
memory
::
Copy
(
dst_place
,
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src
.
place
()),
src_ptr
,
size
);
...
...
paddle/fluid/operators/detail/safe_ref.h
浏览文件 @
75d15719
...
...
@@ -25,7 +25,7 @@ namespace detail {
*/
template
<
typename
T
,
typename
...
ARGS
>
inline
T
&
Ref
(
T
*
ptr
,
ARGS
&&
...
args
)
{
PADDLE_ENFORCE
(
ptr
!=
null
ptr
,
::
paddle
::
string
::
Sprintf
(
args
...));
PADDLE_ENFORCE
_NOT_NULL
(
ptr
,
::
paddle
::
string
::
Sprintf
(
args
...));
return
*
ptr
;
}
...
...
paddle/fluid/operators/dgc_op.h
浏览文件 @
75d15719
...
...
@@ -23,14 +23,14 @@ namespace operators {
inline
float
get_period_sparcity
(
const
std
::
vector
<
float
>&
sparsity
,
float
cur_step
,
float
rampup_steps
)
{
PADDLE_ENFORCE
(
static_cast
<
int
>
(
cur_step
)
>=
0
);
PADDLE_ENFORCE
_GE
(
static_cast
<
int
>
(
cur_step
),
0
);
size_t
idx
=
static_cast
<
int
>
(
cur_step
*
sparsity
.
size
()
/
rampup_steps
);
if
(
idx
>=
sparsity
.
size
())
{
return
0.999
;
}
PADDLE_ENFORCE
(
idx
<
sparsity
.
size
());
PADDLE_ENFORCE
_LT
(
idx
,
sparsity
.
size
());
return
sparsity
[
idx
];
}
...
...
@@ -63,7 +63,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
float
ratio
=
1
-
get_period_sparcity
(
sparsity
,
static_cast
<
float
>
(
*
current_step
),
rampup_step
);
PADDLE_ENFORCE
(
ratio
>
0.0
&&
ratio
<
1.0
);
PADDLE_ENFORCE_GE
(
ratio
,
0.0
);
PADDLE_ENFORCE_LT
(
ratio
,
1.0
);
int
k
=
static_cast
<
int
>
(
g
->
numel
()
*
ratio
);
VLOG
(
10
)
<<
"m:"
<<
m
<<
", use_nesterov:"
<<
use_nesterov
...
...
paddle/fluid/operators/dropout_op.cu
浏览文件 @
75d15719
...
...
@@ -86,9 +86,10 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
auto
*
x_data
=
x
->
data
<
T
>
();
auto
*
y_data
=
y
->
mutable_data
<
T
>
(
context
.
GetPlace
());
if
(
dropout_prob
==
1.0
f
)
{
PADDLE_ENFORCE
(
cudaMemsetAsync
(
y_data
,
0
,
x_numel
*
sizeof
(
T
),
stream
));
PADDLE_ENFORCE
(
cudaMemsetAsync
(
mask_data
,
0
,
x_numel
*
sizeof
(
*
mask_data
),
stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaMemsetAsync
(
y_data
,
0
,
x_numel
*
sizeof
(
T
),
stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaMemsetAsync
(
mask_data
,
0
,
x_numel
*
sizeof
(
*
mask_data
),
stream
));
return
;
}
...
...
paddle/fluid/operators/math/im2col.cu
浏览文件 @
75d15719
...
...
@@ -66,8 +66,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
const
framework
::
Tensor
&
im
,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
framework
::
Tensor
*
col
)
{
PADDLE_ENFORCE
(
im
.
dims
().
size
()
==
3
);
PADDLE_ENFORCE
(
col
->
dims
().
size
()
==
5
);
PADDLE_ENFORCE
_EQ
(
im
.
dims
().
size
(),
3
);
PADDLE_ENFORCE
_EQ
(
col
->
dims
().
size
(),
5
);
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
...
...
@@ -152,8 +152,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
framework
::
Tensor
*
im
)
{
PADDLE_ENFORCE
(
im
->
dims
().
size
()
==
3
);
PADDLE_ENFORCE
(
col
.
dims
().
size
()
==
5
);
PADDLE_ENFORCE
_EQ
(
im
->
dims
().
size
(),
3
);
PADDLE_ENFORCE
_EQ
(
col
.
dims
().
size
(),
5
);
int
im_channels
=
im
->
dims
()[
0
];
int
im_height
=
im
->
dims
()[
1
];
...
...
@@ -249,8 +249,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
const
framework
::
Tensor
&
im
,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
framework
::
Tensor
*
col
)
{
PADDLE_ENFORCE
(
im
.
dims
().
size
()
==
3
);
PADDLE_ENFORCE
(
col
->
dims
().
size
()
==
5
);
PADDLE_ENFORCE
_EQ
(
im
.
dims
().
size
(),
3
);
PADDLE_ENFORCE
_EQ
(
col
->
dims
().
size
(),
5
);
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
...
...
@@ -331,8 +331,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
framework
::
Tensor
*
im
)
{
PADDLE_ENFORCE
(
im
->
dims
().
size
()
==
3
);
PADDLE_ENFORCE
(
col
.
dims
().
size
()
==
5
);
PADDLE_ENFORCE
_EQ
(
im
->
dims
().
size
(),
3
);
PADDLE_ENFORCE
_EQ
(
col
.
dims
().
size
(),
5
);
int
im_channels
=
im
->
dims
()[
0
];
int
im_height
=
im
->
dims
()[
1
];
int
im_width
=
im
->
dims
()[
2
];
...
...
paddle/fluid/operators/math/sample_prob.cu
浏览文件 @
75d15719
...
...
@@ -142,9 +142,9 @@ void GPUSampleWithProb<T>::operator()(
int
num_tries
=
UniqSampler
<
T
>
(
sampler
,
num_samples
,
s_data
);
VLOG
(
1
)
<<
"num_tries: "
<<
num_tries
;
PADDLE_ENFORCE
(
cudaMemcpy
(
samples_data
+
num_true
,
s_data
,
sizeof
(
int64_t
)
*
num_samples
,
cudaMemcpyHostToDevice
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
cudaMemcpy
(
samples_data
+
num_true
,
s_data
,
sizeof
(
int64_t
)
*
num_samples
,
cudaMemcpyHostToDevice
));
int
threads
=
512
;
const
size_t
size
=
batch_size
*
num_sampled_classes
;
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
75d15719
...
...
@@ -55,11 +55,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto
*
in1_data
=
in1_value
.
data
<
T
>
();
auto
in1_place
=
input1
.
place
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
in1_place
)
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_gpu_place
(
in1_place
),
true
);
auto
in2_place
=
input2
.
place
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
in2_place
)
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_gpu_place
(
in2_place
),
true
);
auto
out_place
=
context
.
GetPlace
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
out_place
)
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_gpu_place
(
out_place
),
true
);
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
out_place
),
out_data
,
boost
::
get
<
platform
::
CUDAPlace
>
(
in1_place
),
in1_data
,
...
...
@@ -162,9 +162,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
}
auto
in1_place
=
input1
.
place
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
in1_place
)
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_gpu_place
(
in1_place
),
true
);
auto
in2_place
=
input2
->
place
();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
in2_place
)
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_gpu_place
(
in2_place
),
true
);
auto
*
in1_data
=
in1_value
.
data
<
T
>
();
auto
*
in2_data
=
in2_value
->
data
<
T
>
();
...
...
paddle/fluid/operators/math/vol2col.cu
浏览文件 @
75d15719
...
...
@@ -78,8 +78,8 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
framework
::
Tensor
*
col
)
const
{
PADDLE_ENFORCE
(
vol
.
dims
().
size
()
==
4
);
PADDLE_ENFORCE
(
col
->
dims
().
size
()
==
7
);
PADDLE_ENFORCE
_EQ
(
vol
.
dims
().
size
(),
4
);
PADDLE_ENFORCE
_EQ
(
col
->
dims
().
size
(),
7
);
int
input_channels
=
vol
.
dims
()[
0
];
int
input_depth
=
vol
.
dims
()[
1
];
...
...
@@ -204,8 +204,8 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
framework
::
Tensor
*
vol
)
const
{
PADDLE_ENFORCE
(
vol
->
dims
().
size
()
==
4
);
PADDLE_ENFORCE
(
col
.
dims
().
size
()
==
7
);
PADDLE_ENFORCE
_EQ
(
vol
->
dims
().
size
(),
4
);
PADDLE_ENFORCE
_EQ
(
col
.
dims
().
size
(),
7
);
int
input_channels
=
vol
->
dims
()[
0
];
int
input_depth
=
vol
->
dims
()[
1
];
...
...
paddle/fluid/operators/optimizers/lars_momentum_op.h
浏览文件 @
75d15719
...
...
@@ -30,7 +30,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
auto
learning_rate
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"LearningRate"
);
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
// only support dense for now.
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
()
);
PADDLE_ENFORCE
_EQ
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
true
);
auto
grad
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Grad"
);
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
...
paddle/fluid/operators/sample_logits_op.h
浏览文件 @
75d15719
...
...
@@ -49,11 +49,12 @@ static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
const
framework
::
Tensor
&
array
,
const
framework
::
Tensor
&
index
,
framework
::
Tensor
*
value
)
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()),
true
);
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
2
&&
array
.
dims
().
size
()
==
2
&&
index
.
dims
()[
0
]
==
array
.
dims
()[
0
]
&&
index
.
dims
()
==
value
->
dims
());
PADDLE_ENFORCE_EQ
(
index
.
dims
().
size
(),
2
);
PADDLE_ENFORCE_EQ
(
array
.
dims
().
size
(),
2
);
PADDLE_ENFORCE_EQ
(
index
.
dims
()[
0
],
array
.
dims
()[
0
]);
PADDLE_ENFORCE_EQ
(
index
.
dims
(),
value
->
dims
());
const
auto
batch_size
=
index
.
dims
()[
0
];
const
auto
num_take
=
index
.
dims
()[
1
];
...
...
@@ -88,11 +89,12 @@ static void CPUPutAlongD1(const platform::DeviceContext& ctx,
framework
::
Tensor
*
array
,
const
framework
::
Tensor
&
index
,
const
framework
::
Tensor
&
value
)
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()),
true
);
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
2
&&
array
->
dims
().
size
()
==
2
&&
index
.
dims
()[
0
]
==
array
->
dims
()[
0
]
&&
index
.
dims
()
==
value
.
dims
());
PADDLE_ENFORCE_EQ
(
index
.
dims
().
size
(),
2
);
PADDLE_ENFORCE_EQ
(
array
->
dims
().
size
(),
2
);
PADDLE_ENFORCE_EQ
(
index
.
dims
()[
0
],
array
->
dims
()[
0
]);
PADDLE_ENFORCE_EQ
(
index
.
dims
(),
value
.
dims
());
const
auto
batch_size
=
index
.
dims
()[
0
];
const
auto
num_put
=
index
.
dims
()[
1
];
auto
array_dims
=
array
->
dims
();
...
...
@@ -147,8 +149,8 @@ class SampleLogitsKernel : public framework::OpKernel<T> {
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
context
.
GetPlace
())
,
"This kernel only runs on CPU."
);
PADDLE_ENFORCE
_EQ
(
platform
::
is_cpu_place
(
context
.
GetPlace
()),
true
,
"This kernel only runs on CPU."
);
VLOG
(
3
)
<<
"Enter SampleLogitsKernel"
;
// get necessary inputs
const
Tensor
*
logits
=
context
.
Input
<
Tensor
>
(
"Logits"
);
...
...
paddle/fluid/operators/sum_op.h
浏览文件 @
75d15719
...
...
@@ -92,8 +92,8 @@ void LodTensorArrayCompute(const framework::ExecutionContext &context) {
bool
in_place
=
out_var
==
in_vars
[
0
];
auto
&
out_array
=
*
out_var
->
GetMutable
<
framework
::
LoDTensorArray
>
();
for
(
size_t
i
=
in_place
?
1
:
0
;
i
<
in_vars
.
size
();
++
i
)
{
PADDLE_ENFORCE
(
in_vars
[
i
]
->
IsType
<
framework
::
LoDTensorArray
>
()
,
"Only support all inputs are TensorArray"
);
PADDLE_ENFORCE
_EQ
(
in_vars
[
i
]
->
IsType
<
framework
::
LoDTensorArray
>
(),
true
,
"Only support all inputs are TensorArray"
);
auto
&
in_array
=
in_vars
[
i
]
->
Get
<
framework
::
LoDTensorArray
>
();
for
(
size_t
i
=
0
;
i
<
in_array
.
size
();
++
i
)
{
...
...
@@ -106,7 +106,7 @@ void LodTensorArrayCompute(const framework::ExecutionContext &context) {
context
.
device_context
(),
&
out_array
[
i
]);
out_array
[
i
].
set_lod
(
in_array
[
i
].
lod
());
}
else
{
PADDLE_ENFORCE
(
out_array
[
i
].
lod
()
==
in_array
[
i
].
lod
());
PADDLE_ENFORCE
_EQ
(
out_array
[
i
].
lod
(),
in_array
[
i
].
lod
());
auto
in
=
EigenVector
<
T
>::
Flatten
(
in_array
[
i
]);
auto
result
=
EigenVector
<
T
>::
Flatten
(
out_array
[
i
]);
result
.
device
(
*
context
.
template
device_context
<
DeviceContext
>()
...
...
paddle/fluid/operators/sync_batch_norm_op.cu
浏览文件 @
75d15719
...
...
@@ -178,7 +178,7 @@ class SyncBatchNormKernel : public framework::OpKernel<T> {
int
dtype
=
platform
::
ToNCCLDataType
(
x
->
type
());
// In-place operation
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
stats
,
stats
,
2
*
C
+
1
,
static_cast
<
ncclDataType_t
>
(
dtype
),
ncclSum
,
comm
,
stream
));
...
...
@@ -398,7 +398,7 @@ class SyncBatchNormGradKernel : public framework::OpKernel<T> {
}
int
dtype
=
platform
::
ToNCCLDataType
(
x
->
type
());
// In-place operation
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
stats
,
stats
,
2
*
C
+
1
,
static_cast
<
ncclDataType_t
>
(
dtype
),
ncclSum
,
comm
,
stream
));
...
...
paddle/fluid/platform/cuda_helper.h
浏览文件 @
75d15719
...
...
@@ -29,17 +29,19 @@ namespace platform {
class
CublasHandleHolder
{
public:
CublasHandleHolder
(
cudaStream_t
stream
,
cublasMath_t
math_type
)
{
PADDLE_ENFORCE
(
dynload
::
cublasCreate
(
&
handle_
));
PADDLE_ENFORCE
(
dynload
::
cublasSetStream
(
handle_
,
stream
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cublasCreate
(
&
handle_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cublasSetStream
(
handle_
,
stream
));
#if CUDA_VERSION >= 9000
if
(
math_type
==
CUBLAS_TENSOR_OP_MATH
)
{
PADDLE_ENFORCE
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cublasSetMathMode
(
handle_
,
CUBLAS_TENSOR_OP_MATH
));
}
#endif
}
~
CublasHandleHolder
()
{
PADDLE_ENFORCE
(
dynload
::
cublasDestroy
(
handle_
));
}
~
CublasHandleHolder
()
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cublasDestroy
(
handle_
));
}
template
<
typename
Callback
>
inline
void
Call
(
Callback
&&
callback
)
const
{
...
...
paddle/fluid/platform/cudnn_helper.h
浏览文件 @
75d15719
...
...
@@ -221,10 +221,10 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
class
ScopedTensorDescriptor
{
public:
ScopedTensorDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnCreateTensorDescriptor
(
&
desc_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnCreateTensorDescriptor
(
&
desc_
));
}
~
ScopedTensorDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroyTensorDescriptor
(
desc_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnDestroyTensorDescriptor
(
desc_
));
}
inline
cudnnTensorDescriptor_t
descriptor
(
const
cudnnTensorFormat_t
format
,
...
...
@@ -243,7 +243,7 @@ class ScopedTensorDescriptor {
if
(
groups
>
1
)
{
dims_with_group
[
1
]
=
dims_with_group
[
1
]
/
groups
;
}
PADDLE_ENFORCE
(
dynload
::
cudnnSetTensorNdDescriptor
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnSetTensorNdDescriptor
(
desc_
,
type
,
dims_with_group
.
size
(),
dims_with_group
.
data
(),
strides
.
data
()));
return
desc_
;
...
...
@@ -265,10 +265,10 @@ class ScopedTensorDescriptor {
class
ScopedFilterDescriptor
{
public:
ScopedFilterDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnCreateFilterDescriptor
(
&
desc_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnCreateFilterDescriptor
(
&
desc_
));
}
~
ScopedFilterDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroyFilterDescriptor
(
desc_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnDestroyFilterDescriptor
(
desc_
));
}
inline
cudnnFilterDescriptor_t
descriptor
(
const
cudnnTensorFormat_t
format
,
...
...
@@ -284,7 +284,7 @@ class ScopedFilterDescriptor {
kernel_with_group
[
0
]
/=
groups
;
// NOTE: input filter(C) of the filter is already asserted to be C/groups.
}
PADDLE_ENFORCE
(
dynload
::
cudnnSetFilterNdDescriptor
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnSetFilterNdDescriptor
(
desc_
,
type
,
format
,
kernel_with_group
.
size
(),
kernel_with_group
.
data
()));
return
desc_
;
...
...
@@ -306,10 +306,12 @@ class ScopedFilterDescriptor {
class
ScopedConvolutionDescriptor
{
public:
ScopedConvolutionDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnCreateConvolutionDescriptor
(
&
desc_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnCreateConvolutionDescriptor
(
&
desc_
));
}
~
ScopedConvolutionDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroyConvolutionDescriptor
(
desc_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyConvolutionDescriptor
(
desc_
));
}
inline
cudnnConvolutionDescriptor_t
descriptor
(
...
...
@@ -332,7 +334,7 @@ class ScopedConvolutionDescriptor {
cudnnDataType_t
compute_type
=
(
type
==
CUDNN_DATA_DOUBLE
)
?
CUDNN_DATA_DOUBLE
:
CUDNN_DATA_FLOAT
;
PADDLE_ENFORCE
(
dynload
::
cudnnSetConvolutionNdDescriptor
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnSetConvolutionNdDescriptor
(
desc_
,
pads
.
size
(),
pads
.
data
(),
strides
.
data
(),
dilations
.
data
(),
CUDNN_CROSS_CORRELATION
,
compute_type
));
return
desc_
;
...
...
@@ -353,10 +355,10 @@ class ScopedConvolutionDescriptor {
class
ScopedPoolingDescriptor
{
public:
ScopedPoolingDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnCreatePoolingDescriptor
(
&
desc_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnCreatePoolingDescriptor
(
&
desc_
));
}
~
ScopedPoolingDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroyPoolingDescriptor
(
desc_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnDestroyPoolingDescriptor
(
desc_
));
}
inline
cudnnPoolingDescriptor_t
descriptor
(
const
PoolingMode
&
mode
,
...
...
@@ -365,7 +367,7 @@ class ScopedPoolingDescriptor {
const
std
::
vector
<
int
>&
strides
)
{
PADDLE_ENFORCE_EQ
(
kernel
.
size
(),
pads
.
size
());
PADDLE_ENFORCE_EQ
(
kernel
.
size
(),
strides
.
size
());
PADDLE_ENFORCE
(
dynload
::
cudnnSetPoolingNdDescriptor
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnSetPoolingNdDescriptor
(
desc_
,
(
GetPoolingMode
(
mode
)),
CUDNN_PROPAGATE_NAN
,
// Always propagate nans.
kernel
.
size
(),
kernel
.
data
(),
pads
.
data
(),
strides
.
data
()));
...
...
@@ -380,16 +382,18 @@ class ScopedPoolingDescriptor {
class
ScopedSpatialTransformerDescriptor
{
public:
ScopedSpatialTransformerDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnCreateSpatialTransformerDescriptor
(
&
desc_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnCreateSpatialTransformerDescriptor
(
&
desc_
));
}
~
ScopedSpatialTransformerDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroySpatialTransformerDescriptor
(
desc_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroySpatialTransformerDescriptor
(
desc_
));
}
template
<
typename
T
>
inline
cudnnSpatialTransformerDescriptor_t
descriptor
(
const
int
nbDims
,
const
int
dimA
[])
{
PADDLE_ENFORCE
(
dynload
::
cudnnSetSpatialTransformerNdDescriptor
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnSetSpatialTransformerNdDescriptor
(
desc_
,
CUDNN_SAMPLER_BILINEAR
,
CudnnDataType
<
T
>::
type
,
nbDims
,
dimA
));
return
desc_
;
}
...
...
@@ -402,10 +406,12 @@ class ScopedSpatialTransformerDescriptor {
class
ScopedActivationDescriptor
{
public:
ScopedActivationDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnCreateActivationDescriptor
(
&
desc_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnCreateActivationDescriptor
(
&
desc_
));
}
~
ScopedActivationDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroyActivationDescriptor
(
desc_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyActivationDescriptor
(
desc_
));
}
template
<
typename
T
>
...
...
@@ -467,15 +473,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
class
ScopedCTCLossDescriptor
{
public:
ScopedCTCLossDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnCreateCTCLossDescriptor
(
&
desc_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnCreateCTCLossDescriptor
(
&
desc_
));
}
~
ScopedCTCLossDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroyCTCLossDescriptor
(
desc_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnDestroyCTCLossDescriptor
(
desc_
));
}
template
<
typename
T
>
inline
cudnnCTCLossDescriptor_t
descriptor
()
{
PADDLE_ENFORCE
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
cudnnSetCTCLossDescriptor
(
desc_
,
CudnnDataType
<
T
>::
type
));
return
desc_
;
}
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
75d15719
...
...
@@ -167,7 +167,7 @@ class CudnnHolder {
inline
void
ResetWorkspace
()
{
if
(
workspace_
)
{
// Maybe someone is using the current workspace
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
*
stream_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
cudaStreamSynchronize
(
*
stream_
));
workspace_
=
nullptr
;
}
}
...
...
@@ -306,7 +306,7 @@ class CUDADeviceContext : public DeviceContext {
template
<
typename
Callback
>
void
RecordEvent
(
cudaEvent_t
ev
,
Callback
callback
)
{
callback
();
PADDLE_ENFORCE
(
cudaEventRecord
(
ev
,
stream_
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
cudaEventRecord
(
ev
,
stream_
));
}
template
<
typename
Callback
>
...
...
paddle/fluid/platform/nccl_helper.h
浏览文件 @
75d15719
...
...
@@ -63,11 +63,11 @@ class NCCLGroupGuard {
inline
NCCLGroupGuard
()
{
NCCLMutex
().
lock
();
PADDLE_ENFORCE
(
dynload
::
ncclGroupStart
());
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
ncclGroupStart
());
}
inline
~
NCCLGroupGuard
()
{
PADDLE_ENFORCE
(
dynload
::
ncclGroupEnd
());
PADDLE_ENFORCE
_CUDA_SUCCESS
(
dynload
::
ncclGroupEnd
());
NCCLMutex
().
unlock
();
}
};
...
...
@@ -94,7 +94,7 @@ struct NCCLContextMap {
explicit
NCCLContextMap
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
ncclUniqueId
*
nccl_id
=
nullptr
,
size_t
num_trainers
=
1
,
size_t
trainer_id
=
0
)
{
PADDLE_ENFORCE
(
!
places
.
empty
()
);
PADDLE_ENFORCE
_EQ
(
!
places
.
empty
(),
true
);
order_
.
reserve
(
places
.
size
());
for
(
auto
&
p
:
places
)
{
int
dev_id
=
boost
::
get
<
CUDAPlace
>
(
p
).
device
;
...
...
@@ -109,7 +109,7 @@ struct NCCLContextMap {
// if num_trainers == 1, should create a new nccl id for local comms.
if
(
num_trainers
==
1
&&
nccl_id
==
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
NCCLGroupGuard
::
NCCLMutex
());
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclCommInitAll
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
platform
::
dynload
::
ncclCommInitAll
(
comms
.
get
(),
static_cast
<
int
>
(
order_
.
size
()),
order_
.
data
()));
}
else
{
PADDLE_ENFORCE_NOT_NULL
(
nccl_id
);
...
...
@@ -126,8 +126,8 @@ struct NCCLContextMap {
}
VLOG
(
1
)
<<
"init nccl rank:"
<<
rank
<<
", nranks:"
<<
nranks
<<
", gpu_id:"
<<
gpu_id
<<
", dev_id:"
<<
order_
[
i
];
PADDLE_ENFORCE
(
cudaSetDevice
(
gpu_id
));
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclCommInitRank
(
PADDLE_ENFORCE
_CUDA_SUCCESS
(
cudaSetDevice
(
gpu_id
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
platform
::
dynload
::
ncclCommInitRank
(
comms
.
get
()
+
i
,
nranks
,
*
nccl_id
,
rank
));
}
}
...
...
@@ -249,13 +249,13 @@ class NCCLCommunicator {
size_t
trainers_num
,
size_t
trainer_id
,
size_t
inter_trainers_num
,
size_t
exter_trainers_num
)
{
PADDLE_ENFORCE
(
trainers_num
==
inter_trainers_num
*
exter_trainers_num
,
"trainers_num:%llu != inter_trainers_num:%llu * "
"exter_trainers_num:%llu"
,
trainers_num
,
inter_trainers_num
,
exter_trainers_num
);
PADDLE_ENFORCE
_EQ
(
trainers_num
,
inter_trainers_num
*
exter_trainers_num
,
"trainers_num:%llu != inter_trainers_num:%llu * "
"exter_trainers_num:%llu"
,
trainers_num
,
inter_trainers_num
,
exter_trainers_num
);
PADDLE_ENFORCE
(
inter_trainers_num
>
1
,
"inter_trainers_num:%llu must > 1"
,
inter_trainers_num
);
PADDLE_ENFORCE
_GT
(
inter_trainers_num
,
1
,
"inter_trainers_num:%llu must > 1"
,
inter_trainers_num
);
int
inter_trainer_id
=
trainer_id
%
inter_trainers_num
;
for
(
size_t
i
=
0
;
i
<
inter_nccl_ids
.
size
();
i
++
)
{
...
...
paddle/fluid/platform/profiler.cu
浏览文件 @
75d15719
...
...
@@ -35,13 +35,13 @@ void DummyKernelAndEvent() {
ForEachDevice
([](
int
d
)
{
platform
::
SetDeviceId
(
d
);
cudaStream_t
stream
;
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
cudaStreamCreate
(
&
stream
));
Mark
(
"_cuda_startup_"
);
int
*
ptr
;
PADDLE_ENFORCE
(
cudaMalloc
(
&
ptr
,
sizeof
(
int
)));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
cudaMalloc
(
&
ptr
,
sizeof
(
int
)));
DummyKernel
<<<
1
,
1
,
0
,
stream
>>>
(
ptr
);
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
));
PADDLE_ENFORCE
(
cudaFree
(
ptr
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
cudaStreamSynchronize
(
stream
));
PADDLE_ENFORCE
_CUDA_SUCCESS
(
cudaFree
(
ptr
));
});
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录