Project: 机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit 772be4f5 (unverified)
Authored on Feb 09, 2022 by niuliling123; committed via GitHub on Feb 09, 2022.

Replace EigenBroadcast with ElementwiseBroadcast in ReduceGrad (#39255)

Parent commit: b3e049f8
Showing 7 changed files, with 114 additions and 19 deletions (+114, -19):

  paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu    +2   -8
  paddle/fluid/operators/reduce_ops/reduce_op.h               +55  -4
  paddle/fluid/operators/reduce_ops/reduce_sum_op.cc          +1   -1
  paddle/fluid/operators/reduce_ops/reduce_sum_op.h           +1   -1
  paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu     +1   -2
  paddle/pten/kernels/gpu/elementwise.h                       +10  -3
  paddle/pten/kernels/gpu/reduce_grad.h  (new file)           +44  -0
paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu

@@ -17,15 +17,9 @@
 template <typename T>
 using CUDAReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
-                          ops::MeanGradFunctor, true>;
-
-using FP16CUDAReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                          paddle::platform::float16, ops::FP16MeanGradFunctor,
-                          true>;
+    ops::ReduceCudaGradKernel<T, kps::DivideFunctor>;
 
 REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<bool>,
-                        FP16CUDAReduceMeanGradKernel,
+                        CUDAReduceMeanGradKernel<paddle::platform::float16>,
                         CUDAReduceMeanGradKernel<float>,
                         CUDAReduceMeanGradKernel<double>);
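kps::DivideFunctor here (and kps::IdentityFunctor in reduce_sum_op.part.cu below) is the TransformOp applied while d_out is broadcast back to d_x's shape; each is constructed with reduce_num. A hypothetical host-side sketch of their roles — the real functors are device-side kernel primitives, and the names and members below are illustrative assumptions:

    #include <cassert>

    // Stand-in for kps::IdentityFunctor: sum's gradient copies d_out.
    template <typename T, typename MPType>
    struct IdentityLike {
      explicit IdentityLike(int /*reduce_num*/) {}  // reduce_num unused
      T operator()(const T& x) const { return x; }
    };

    // Stand-in for kps::DivideFunctor: mean's gradient divides d_out by
    // the number of reduced elements, computed in MPType precision.
    template <typename T, typename MPType>
    struct DivideLike {
      explicit DivideLike(int reduce_num)
          : n_inv_(static_cast<MPType>(1) / static_cast<MPType>(reduce_num)) {}
      T operator()(const T& x) const {
        return static_cast<T>(static_cast<MPType>(x) * n_inv_);
      }
      MPType n_inv_;
    };

    int main() {
      DivideLike<float, float> mean_grad(4);   // 4 elements were averaged
      assert(mean_grad(2.0f) == 0.5f);
      IdentityLike<float, float> sum_grad(4);
      assert(sum_grad(2.0f) == 2.0f);
    }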
paddle/fluid/operators/reduce_ops/reduce_op.h

@@ -30,6 +30,7 @@ limitations under the License. */
 #if defined(__HIPCC__) || defined(__NVCC__)
 #include "paddle/pten/kernels/gpu/reduce.h"
+#include "paddle/pten/kernels/gpu/reduce_grad.h"
 #endif
 
 namespace paddle {

@@ -620,11 +621,12 @@ class ReduceGradOp : public framework::OperatorWithKernel {
  protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
-    int in_dtype = ctx.Attr<int>("in_dtype");
+    int out_dtype = ctx.Attr<int>("out_dtype");
    auto input_data_type =
-        (in_dtype >= 0) ? static_cast<framework::proto::VarType::Type>(in_dtype)
-                        : OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Out"));
+        (out_dtype >= 0)
+            ? static_cast<framework::proto::VarType::Type>(out_dtype)
+            : OperatorWithKernel::IndicateVarDataType(
+                  ctx, framework::GradVarName("Out"));
 #ifdef PADDLE_WITH_MKLDNN
    auto CanMKLDNNReduceGradBeUsed = [&]() {
      auto dx_dims = ctx.Input<Tensor>("X")->dims();

@@ -730,6 +732,55 @@ class ReduceCudaKernel : public framework::OpKernel<T> {
                    dev_ctx, *input, reduce_all, dims_int64, false,
                    pt_out_dtype, output);
   }
 };
+
+template <typename T, template <typename, typename> class TransformOp>
+class ReduceCudaGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
+    auto* in_x = context.Input<Tensor>("X");
+    auto* d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto out_dtype = context.Attr<int>("in_dtype");
+    // get reduce_dim and reduce_num for reduce_mean_grad
+    int dim_size = in_x->dims().size();
+    std::vector<int> reduce_dims = GetReduceDim(dims, dim_size, reduce_all);
+    auto update_dims = vectorize(d_x->dims());
+    int reduce_num = 1;
+    for (auto i : reduce_dims) {
+      reduce_num *= (in_x->dims())[i];
+      update_dims[i] = 1;
+    }
+    // make new tensor
+    framework::Tensor new_d_out(d_out->type());
+    new_d_out.ShareDataWith(*d_out);
+    new_d_out.Resize(paddle::framework::make_ddim(update_dims));
+    auto& dev_ctx = context.cuda_device_context();
+    if (out_dtype > 0) {
+      d_x->mutable_data(
+          dev_ctx.GetPlace(),
+          static_cast<framework::proto::VarType::Type>(out_dtype));
+    } else {
+      d_x->mutable_data(
+          dev_ctx.GetPlace(),
+          static_cast<framework::proto::VarType::Type>(d_out->type()));
+    }
+    auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out);
+    auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x);
+    auto pt_out_dtype = pten::TransToPtenDataType(
+        static_cast<framework::proto::VarType::Type>(out_dtype));
+    if (out_dtype <= 0) {
+      pt_out_dtype = pten::TransToPtenDataType(
+          static_cast<framework::proto::VarType::Type>(d_out->type()));
+    }
+    using MPType = typename kps::details::MPTypeTrait<T>::Type;
+    pten::ReduceGrad<T, TransformOp<T, MPType>>(
+        dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype,
+        TransformOp<T, MPType>(reduce_num));
+  }
+};
 #endif
 
 }  // namespace operators
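A standalone walk-through of the shape bookkeeping in ReduceCudaGradKernel::Compute (plain C++; the concrete shapes are made up for illustration): reducing X of shape {2, 3, 4} over dim = {1} yields reduce_num = 3 and update_dims = {2, 1, 4}, the keep-dim shape new_d_out is resized to so that it broadcasts against d_x.

    #include <iostream>
    #include <vector>

    int main() {
      std::vector<int> x_dims = {2, 3, 4};   // shape of X (and of d_x)
      std::vector<int> reduce_dims = {1};    // axes reduced in the forward pass

      std::vector<int> update_dims = x_dims; // start from d_x's shape
      int reduce_num = 1;
      for (int i : reduce_dims) {
        reduce_num *= x_dims[i];             // elements folded into each output
        update_dims[i] = 1;                  // reduced axes collapse to size 1
      }

      std::cout << "reduce_num = " << reduce_num << "\n";  // reduce_num = 3
      for (int d : update_dims) std::cout << d << " ";     // 2 1 4
    }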
paddle/fluid/operators/reduce_ops/reduce_sum_op.cc

@@ -50,7 +50,7 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const {
-    int in_dtype = ctx.Attr<int>("in_dtype");
+    int in_dtype = ctx.Attr<int>("out_dtype");
     if (in_dtype >= 0) {
       return framework::OpKernelType(
           static_cast<framework::proto::VarType::Type>(in_dtype),
paddle/fluid/operators/reduce_ops/reduce_sum_op.h

@@ -74,7 +74,7 @@ class ReduceSumGradKernel : public framework::OpKernel<T> {
     auto dims = context.Attr<std::vector<int>>("dim");
     if (context.GetPlace().GetType() == platform::CPUPlace().GetType() &&
         dims.size() == 1) {
-      int in_dtype = context.Attr<int>("in_dtype");
+      int in_dtype = context.Attr<int>("out_dtype");
       if (in_dtype >= 0) {
         Tensor tmp_tensor;
paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu

@@ -17,8 +17,7 @@
 template <typename T>
 using CUDAReduceSumGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
-                          ops::SumGradFunctor, true>;
+    ops::ReduceCudaGradKernel<T, kps::IdentityFunctor>;
 
 REGISTER_OP_CUDA_KERNEL(
     reduce_sum_grad, CUDAReduceSumGradKernel<bool>,
paddle/pten/kernels/gpu/elementwise.h

@@ -134,12 +134,19 @@ struct DimensionsTransform {
   explicit DimensionsTransform(const std::vector<const DenseTensor *> &ins,
                                const pten::framework::DDim &dims,
                                int axis) {
-    const int N = ins.size();
+    const int N = max(static_cast<int>(ins.size()), 2);
     dim_size = dims.size();
     out_dims = pten::framework::vectorize<int64_t>(dims);
     in_dims.resize(N);
-    for (int j = 0; j < N; ++j) {
-      in_dims[j] = pten::framework::vectorize<int64_t>(ins[j]->dims());
+    if (ins.size() == 1) {
+      // when ins.size() = 1, broadcast input to output
+      in_dims[0] = pten::framework::vectorize<int64_t>(ins[0]->dims());
+      in_dims[1] = out_dims;
+      // Add out_dims to in_dims to avoid errors in dims merging
+    } else {
+      for (int j = 0; j < N; ++j) {
+        in_dims[j] = pten::framework::vectorize<int64_t>(ins[j]->dims());
+      }
     }
     InputDimensionsExtend(N, axis);
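The new branch covers the unary case that ReduceGrad introduces: with a single input, in_dims is padded with out_dims so the subsequent dims-merging logic always operates on at least two shapes. A minimal standalone sketch of just that padding rule (plain C++; the vectors stand in for Paddle's DDim machinery):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      std::vector<int64_t> out_dims = {2, 3, 4};
      // One broadcast input, e.g. d_out reshaped to the keep-dim shape.
      std::vector<std::vector<int64_t>> ins = {{2, 1, 4}};

      const int N = std::max(static_cast<int>(ins.size()), 2);
      std::vector<std::vector<int64_t>> in_dims(N);
      if (ins.size() == 1) {
        in_dims[0] = ins[0];
        in_dims[1] = out_dims;  // pad with out_dims to keep merging valid
      } else {
        for (int j = 0; j < N; ++j) in_dims[j] = ins[j];
      }
      std::cout << "shapes to merge: " << in_dims.size() << "\n";  // 2
    }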
paddle/pten/kernels/gpu/reduce_grad.h  (new file, mode 100644)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

// CUDA and HIP use same api
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>
#include "paddle/pten/kernels/gpu/elementwise.h"

namespace pten {

template <typename InT, typename Functor>
void ReduceGrad(const GPUContext& dev_ctx,
                DenseTensor* d_out,
                DenseTensor* d_x,
                DataType out_dtype,
                Functor functor) {
  std::vector<const DenseTensor*> inputs = {d_out};
  std::vector<DenseTensor*> outputs = {d_x};
  PD_VISIT_ALL_TYPES(
      out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] {
        LaunchBroadcastElementwiseCudaKernel<pten::ElementwiseType::kUnary,
                                             InT,
                                             data_t>(
            dev_ctx, inputs, &outputs, 0, functor);
      }));
}

}  // namespace pten
#endif
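PD_VISIT_ALL_TYPES dispatches on the runtime out_dtype and exposes the selected C++ type as data_t inside the lambda, which is how the broadcast kernel gets launched with the correct output type. A hypothetical standalone illustration of that visit-by-dtype pattern (the enum, helper, and tag-passing style are assumptions for the sketch, not Paddle's actual macro):

    #include <iostream>
    #include <stdexcept>

    enum class DataType { FLOAT32, FLOAT64 };

    // Switch over the runtime dtype and call the visitor with a value of
    // the matching static type, so the lambda can recover it via decltype.
    template <typename Visitor>
    void VisitAllTypes(DataType dtype, Visitor&& visit) {
      switch (dtype) {
        case DataType::FLOAT32: visit(float{}); break;
        case DataType::FLOAT64: visit(double{}); break;
        default: throw std::runtime_error("unsupported dtype");
      }
    }

    int main() {
      VisitAllTypes(DataType::FLOAT32, [](auto tag) {
        using data_t = decltype(tag);  // plays the role of data_t in the macro
        std::cout << "launch with sizeof(data_t) = " << sizeof(data_t) << "\n";
      });
    }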