Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
72d99c5d
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
72d99c5d
编写于
3月 02, 2021
作者:
Q
Qi Li
提交者:
GitHub
3月 02, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] update fluid operators for rocm (part4), test=develop (#31225)
上级
91635de3
变更
16
展开全部
隐藏空白更改
内联
并排
Showing
16 changed file
with
1006 addition
and
124 deletion
+1006
-124
paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
+4
-4
paddle/fluid/operators/conv_cudnn_op.cu
paddle/fluid/operators/conv_cudnn_op.cu
+212
-15
paddle/fluid/operators/conv_cudnn_op_cache.h
paddle/fluid/operators/conv_cudnn_op_cache.h
+9
-2
paddle/fluid/operators/conv_miopen_helper.h
paddle/fluid/operators/conv_miopen_helper.h
+325
-0
paddle/fluid/operators/conv_op.cc
paddle/fluid/operators/conv_op.cc
+8
-4
paddle/fluid/operators/conv_transpose_cudnn_op.cu
paddle/fluid/operators/conv_transpose_cudnn_op.cu
+173
-4
paddle/fluid/operators/conv_transpose_op.cc
paddle/fluid/operators/conv_transpose_op.cc
+3
-3
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+6
-5
paddle/fluid/operators/math/concat_test.cc
paddle/fluid/operators/math/concat_test.cc
+1
-1
paddle/fluid/operators/math/pooling.cc
paddle/fluid/operators/math/pooling.cc
+18
-16
paddle/fluid/operators/math/pooling.cu
paddle/fluid/operators/math/pooling.cu
+36
-30
paddle/fluid/operators/math/pooling.h
paddle/fluid/operators/math/pooling.h
+24
-20
paddle/fluid/operators/pool_cudnn_op.cu.cc
paddle/fluid/operators/pool_cudnn_op.cu.cc
+162
-3
paddle/fluid/operators/pool_op.cc
paddle/fluid/operators/pool_op.cc
+5
-2
paddle/fluid/operators/pool_op.h
paddle/fluid/operators/pool_op.h
+15
-10
paddle/fluid/operators/spp_op.h
paddle/fluid/operators/spp_op.h
+5
-5
未找到文件。
paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
浏览文件 @
72d99c5d
...
...
@@ -59,14 +59,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs,
paddle
::
operators
::
math
::
MaxPool
<
float
>
,
float
>
pool2d_forward
;
pool2d_forward
(
idata
,
input_shape
,
output_shape
,
ksize_
,
strides_
,
paddings_
,
pool_process
,
true
,
adaptive_
,
odatas
[
0
],
stream
);
paddings_
,
true
,
adaptive_
,
odatas
[
0
],
stream
,
pool_process
);
}
else
if
(
pool_type_
==
PoolType
::
avg
)
{
paddle
::
operators
::
math
::
AvgPool
<
float
>
pool_process
;
paddle
::
operators
::
math
::
Pool2dDirectCUDAFunctor
<
paddle
::
operators
::
math
::
AvgPool
<
float
>
,
float
>
pool2d_forward
;
pool2d_forward
(
idata
,
input_shape
,
output_shape
,
ksize_
,
strides_
,
paddings_
,
pool_process
,
true
,
adaptive_
,
odatas
[
0
],
stream
);
paddings_
,
true
,
adaptive_
,
odatas
[
0
],
stream
,
pool_process
);
}
return
cudaGetLastError
()
!=
cudaSuccess
;
...
...
@@ -224,14 +224,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
paddle
::
operators
::
math
::
MaxPool
<
float
>
,
float
>
pool2d_forward
;
pool2d_forward
(
input
,
input_shape
,
output_shape
,
ksize
,
strides_
,
paddings
,
pool_process
,
true
,
adaptive_
,
output
,
stream
);
true
,
adaptive_
,
output
,
stream
,
pool_process
);
}
else
if
(
pool_type_
==
"avg"
)
{
paddle
::
operators
::
math
::
AvgPool
<
float
>
pool_process
;
paddle
::
operators
::
math
::
Pool2dDirectCUDAFunctor
<
paddle
::
operators
::
math
::
AvgPool
<
float
>
,
float
>
pool2d_forward
;
pool2d_forward
(
input
,
input_shape
,
output_shape
,
ksize
,
strides_
,
paddings
,
pool_process
,
true
,
adaptive_
,
output
,
stream
);
true
,
adaptive_
,
output
,
stream
,
pool_process
);
}
return
cudaGetLastError
()
!=
cudaSuccess
;
...
...
paddle/fluid/operators/conv_cudnn_op.cu
浏览文件 @
72d99c5d
此差异已折叠。
点击以展开。
paddle/fluid/operators/conv_cudnn_op_cache.h
浏览文件 @
72d99c5d
...
...
@@ -18,7 +18,11 @@ limitations under the License. */
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#else
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
DECLARE_uint64
(
conv_workspace_size_limit
);
DECLARE_bool
(
cudnn_exhaustive_search
);
...
...
@@ -26,8 +30,11 @@ DECLARE_int64(cudnn_exhaustive_search_times);
namespace
paddle
{
namespace
operators
{
#if CUDNN_VERSION_MIN(6, 0, 5)
#ifdef PADDLE_WITH_HIP
static
constexpr
size_t
kNUM_CUDNN_FWD_ALGS
=
1
;
static
constexpr
size_t
kNUM_CUDNN_BWD_FILTER_ALGS
=
1
;
static
constexpr
size_t
kNUM_CUDNN_BWD_DATA_ALGS
=
1
;
#elif CUDNN_VERSION_MIN(6, 0, 5)
static
constexpr
size_t
kNUM_CUDNN_FWD_ALGS
=
CUDNN_CONVOLUTION_FWD_ALGO_COUNT
;
static
constexpr
size_t
kNUM_CUDNN_BWD_FILTER_ALGS
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT
;
...
...
paddle/fluid/operators/conv_miopen_helper.h
0 → 100644
浏览文件 @
72d99c5d
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <array>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/miopen_desc.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
DataLayout
=
platform
::
DataLayout
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
using
framework
::
AlgorithmsCache
;
static
inline
void
GetNCDHW
(
const
framework
::
DDim
&
dims
,
const
DataLayout
&
layout
,
int
*
N
,
int
*
C
,
int
*
D
,
int
*
H
,
int
*
W
)
{
*
N
=
dims
[
0
];
*
C
=
layout
==
DataLayout
::
kNCHW
?
dims
[
1
]
:
dims
[
dims
.
size
()
-
1
];
int
i
=
layout
==
DataLayout
::
kNCHW
?
0
:
1
;
if
(
dims
.
size
()
==
5
)
{
*
D
=
dims
[
2
-
i
];
*
H
=
dims
[
3
-
i
];
*
W
=
dims
[
4
-
i
];
}
else
{
*
D
=
1
;
*
H
=
dims
[
2
-
i
];
*
W
=
dims
[
3
-
i
];
}
}
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
static
void
RemovePaddingSlice
(
const
framework
::
ExecutionContext
&
context
,
const
Tensor
*
input
,
Tensor
*
out
,
const
std
::
vector
<
int
>&
starts
,
const
std
::
vector
<
int
>&
axes
)
{
auto
&
place
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
in_dims
=
input
->
dims
();
auto
new_out_dims
=
out
->
dims
();
auto
offsets
=
Eigen
::
array
<
int
,
D
>
();
auto
extents
=
Eigen
::
array
<
int
,
D
>
();
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
offsets
[
i
]
=
0
;
extents
[
i
]
=
new_out_dims
[
i
];
}
int
start
;
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
start
=
starts
[
i
];
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
start
=
std
::
max
(
start
,
0
);
offsets
[
axes
[
i
]]
=
start
;
}
auto
in_t
=
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
input
);
auto
out_t
=
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
out
,
new_out_dims
);
out_t
.
device
(
place
)
=
in_t
.
slice
(
offsets
,
extents
);
}
template
<
typename
T
>
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
const
std
::
vector
<
T
>&
v
)
{
out
<<
"["
;
for
(
auto
const
&
tmp
:
v
)
out
<<
tmp
<<
","
;
out
<<
"]"
;
return
out
;
}
using
framework
::
ConvSearchCache
;
struct
ConvArgs
{
miopenHandle_t
handle
;
platform
::
TensorDescriptor
idesc
,
odesc
;
platform
::
FilterDescriptor
wdesc
;
platform
::
ConvolutionDescriptor
cdesc
;
const
framework
::
Tensor
*
x
,
*
w
,
*
o
;
miopenDataType_t
cudnn_dtype
;
// strides
std
::
vector
<
int
>
s
;
// paddings
std
::
vector
<
int
>
p
;
// dilations
std
::
vector
<
int
>
d
;
ConvArgs
(
const
framework
::
Tensor
*
x
,
const
framework
::
Tensor
*
w
,
const
framework
::
Tensor
*
o
,
const
std
::
vector
<
int
>
s
,
const
std
::
vector
<
int
>
p
,
const
std
::
vector
<
int
>
d
,
miopenDataType_t
dtype
)
:
x
(
x
),
w
(
w
),
o
(
o
),
s
(
s
),
p
(
p
),
d
(
d
),
cudnn_dtype
(
dtype
)
{}
};
template
<
typename
algo_t
>
struct
SearchAlgorithm
{};
template
<
>
struct
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
{
using
perf_t
=
miopenConvAlgoPerf_t
;
using
algo_t
=
miopenConvFwdAlgorithm_t
;
template
<
typename
T
>
static
algo_t
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
bool
deterministic
,
const
framework
::
ExecutionContext
&
ctx
)
{
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
bool
has_got_workspace_size
=
true
;
size_t
workspace_size_limit
=
FLAGS_conv_workspace_size_limit
*
1024
*
1024
;
size_t
workspace_size
=
0
;
algo_t
algo
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
auto
&
temp
=
ctx
.
cuda_device_context
();
AlgorithmsCache
<
algo_t
>&
algo_cache
=
*
(
framework
::
ConvSearchCache
::
Instance
().
GetForward
());
auto
x_dims
=
framework
::
vectorize
(
args
.
x
->
dims
());
auto
w_dims
=
framework
::
vectorize
(
args
.
w
->
dims
());
VLOG
(
10
)
<<
"miopenConvolutionFwdAlgoPerf_t:"
<<
", x_dims:"
<<
x_dims
<<
", w_dims:"
<<
w_dims
<<
", args.s"
<<
args
.
s
<<
", args.p"
<<
args
.
p
<<
", args.d"
<<
args
.
d
;
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
w_dims
,
args
.
s
,
args
.
p
,
args
.
d
,
0
,
static_cast
<
int64_t
>
(
args
.
cudnn_dtype
),
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
perf_t
,
kNUM_CUDNN_FWD_ALGS
>
perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenFindConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
args
.
wdesc
.
desc
(),
args
.
w
->
data
<
T
>
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
const_cast
<
T
*>
(
args
.
o
->
data
<
T
>
()),
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
,
false
));
};
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"FwdAlgo Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
fwd_algo
;
}
return
perf_stat
[
0
].
fwd_algo
;
});
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
return
algo
;
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
algo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionForwardGetWorkSpaceSize
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
&
workspace_size
));
return
workspace_size
;
}
};
template
<
>
struct
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
{
using
perf_t
=
miopenConvAlgoPerf_t
;
using
algo_t
=
miopenConvBwdDataAlgorithm_t
;
template
<
typename
T
>
static
algo_t
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
bool
deterministic
,
const
framework
::
ExecutionContext
&
ctx
)
{
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
size_t
workspace_size_limit
=
FLAGS_conv_workspace_size_limit
*
1024
*
1024
;
size_t
workspace_size
=
0
;
bool
has_got_workspace_size
=
true
;
algo_t
algo
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
AlgorithmsCache
<
algo_t
>&
algo_cache
=
*
(
framework
::
ConvSearchCache
::
Instance
().
GetBackwardData
());
auto
x_dims
=
framework
::
vectorize
(
args
.
x
->
dims
());
auto
w_dims
=
framework
::
vectorize
(
args
.
w
->
dims
());
VLOG
(
10
)
<<
"miopenConvolutionFwdAlgoPerf_t"
<<
", x_dims:"
<<
x_dims
<<
", w_dims:"
<<
w_dims
<<
", args.s"
<<
args
.
s
<<
", args.p"
<<
args
.
p
<<
", args.d"
<<
args
.
d
;
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
w_dims
,
args
.
s
,
args
.
p
,
args
.
d
,
0
,
static_cast
<
int64_t
>
(
args
.
cudnn_dtype
),
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
perf_t
,
kNUM_CUDNN_FWD_ALGS
>
perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenFindConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
args
.
wdesc
.
desc
(),
args
.
w
->
data
<
T
>
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
const_cast
<
T
*>
(
args
.
x
->
data
<
T
>
()),
kNUM_CUDNN_BWD_DATA_ALGS
,
&
returned_algo_count
,
perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
,
false
));
};
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"BwdDataAlgo Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
bwd_data_algo
;
}
return
perf_stat
[
0
].
bwd_data_algo
;
});
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
return
algo
;
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
algo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionBackwardDataGetWorkSpaceSize
(
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
&
workspace_size
));
return
workspace_size
;
}
};
template
<
>
struct
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
{
using
perf_t
=
miopenConvAlgoPerf_t
;
using
algo_t
=
miopenConvBwdWeightsAlgorithm_t
;
template
<
typename
T
>
static
algo_t
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
bool
deterministic
,
const
framework
::
ExecutionContext
&
ctx
)
{
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
size_t
workspace_size_limit
=
FLAGS_conv_workspace_size_limit
*
1024
*
1024
;
size_t
workspace_size
=
0
;
bool
has_got_workspace_size
=
true
;
algo_t
algo
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
AlgorithmsCache
<
algo_t
>&
algo_cache
=
*
(
framework
::
ConvSearchCache
::
Instance
().
GetBackwardFilter
());
auto
x_dims
=
framework
::
vectorize
(
args
.
x
->
dims
());
auto
w_dims
=
framework
::
vectorize
(
args
.
w
->
dims
());
VLOG
(
10
)
<<
"miopenConvolutionFwdAlgoPerf_t:"
<<
", x_dims:"
<<
x_dims
<<
", w_dims:"
<<
w_dims
<<
", args.s"
<<
args
.
s
<<
", args.p"
<<
args
.
p
<<
", args.d"
<<
args
.
d
;
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
w_dims
,
args
.
s
,
args
.
p
,
args
.
d
,
0
,
static_cast
<
int64_t
>
(
args
.
cudnn_dtype
),
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
perf_t
,
kNUM_CUDNN_FWD_ALGS
>
perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenFindConvolutionBackwardWeightsAlgorithm
(
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
const_cast
<
T
*>
(
args
.
w
->
data
<
T
>
()),
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
returned_algo_count
,
perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
,
false
));
};
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"BwdFilterAlgo Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
bwd_weights_algo
;
}
return
perf_stat
[
0
].
bwd_weights_algo
;
});
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
return
algo
;
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
algo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionBackwardWeightsGetWorkSpaceSize
(
args
.
handle
,
args
.
odesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
&
workspace_size
));
return
workspace_size
;
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/conv_op.cc
浏览文件 @
72d99c5d
...
...
@@ -21,9 +21,13 @@ limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
...
...
@@ -149,7 +153,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
"AnyLayout"
;
// todo enable data layout when it's ready
framework
::
DataLayout
layout
=
framework
::
StringToDataLayout
(
data_format
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
CanCUDNNBeUsed
(
ctx
))
{
library
=
framework
::
LibraryType
::
kCUDNN
;
}
...
...
@@ -559,7 +563,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
auto
data_type
=
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"Input"
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
CanCUDNNBeUsed
(
ctx
))
{
library_
=
framework
::
LibraryType
::
kCUDNN
;
}
...
...
@@ -744,7 +748,7 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType(
std
::
string
data_format
=
"AnyLayout"
;
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
CanCUDNNBeUsed
(
ctx
))
{
library_
=
framework
::
LibraryType
::
kCUDNN
;
}
...
...
paddle/fluid/operators/conv_transpose_cudnn_op.cu
浏览文件 @
72d99c5d
...
...
@@ -15,11 +15,14 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/padding.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -212,7 +215,11 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
}
size_t
workspace_size
=
0
;
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t
algo
{};
#else
cudnnConvolutionBwdDataAlgo_t
algo
{};
#endif
// ------------------- cudnn conv algorithm ---------------------
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
...
...
@@ -235,7 +242,12 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
args
.
cdesc
.
set
(
dtype
,
padding_common
,
strides
,
dilations
,
platform
::
AllowTF32Cudnn
(),
c_groups
);
#ifdef PADDLE_WITH_HIP
using
search
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
#else
using
search
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
#endif
algo
=
search
::
Find
<
T
>
(
args
,
false
,
deterministic
,
ctx
);
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
,
algo
));
...
...
@@ -250,6 +262,17 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
ScalingParamType
<
T
>
beta
=
0.0
f
;
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
#ifdef PADDLE_WITH_HIP
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionBackwardData
(
handle
,
&
alpha
,
args
.
odesc
.
desc
(),
input_data
+
input_offset
*
g
,
args
.
wdesc
.
desc
(),
filter_data
+
filter_offset
*
g
,
args
.
cdesc
.
desc
(),
algo
,
&
beta
,
args
.
idesc
.
desc
(),
transformed_output_data
+
output_offset
*
g
,
cudnn_workspace
,
workspace_size
));
};
#else // PADDLE_WITH_HIP
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
...
...
@@ -259,6 +282,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
cudnn_workspace
,
workspace_size
,
&
beta
,
args
.
idesc
.
desc
(),
transformed_output_data
+
output_offset
*
g
));
};
#endif // PADDLE_WITH_HIP
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size
);
}
if
(
!
is_sys_pad
&&
strides
.
size
()
==
2U
)
{
...
...
@@ -449,8 +473,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
padding_common
,
dilations
,
dtype
};
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t
data_algo
{};
miopenConvBwdWeightsAlgorithm_t
filter_algo
{};
#else
cudnnConvolutionFwdAlgo_t
data_algo
{};
cudnnConvolutionBwdFilterAlgo_t
filter_algo
{};
#endif
auto
layout_tensor
=
GetCudnnTensorFormat
(
layout
);
size_t
workspace_size
=
0
;
...
...
@@ -472,7 +502,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
args1
.
odesc
.
set
(
input_transpose
,
iwo_groups
);
args1
.
cdesc
.
set
(
dtype
,
padding_common
,
strides
,
dilations
,
platform
::
AllowTF32Cudnn
(),
c_groups
);
#ifdef PADDLE_WITH_HIP
using
search1
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
#else
using
search1
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
#endif
data_algo
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
ctx
);
workspace_size
=
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
,
data_algo
));
...
...
@@ -486,7 +520,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
args2
.
odesc
.
set
(
input_transpose
,
iwo_groups
);
args2
.
cdesc
.
set
(
dtype
,
padding_common
,
strides
,
dilations
,
platform
::
AllowTF32Cudnn
(),
c_groups
);
#ifdef PADDLE_WITH_HIP
using
search2
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
#else
using
search2
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
#endif
filter_algo
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
ctx
);
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
filter_algo
));
...
...
@@ -504,6 +542,18 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
if
(
input_grad
)
{
// Because beta is zero, it is unnecessary to reset input_grad.
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
#ifdef PADDLE_WITH_HIP
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionForward
(
handle
,
&
alpha
,
args1
.
idesc
.
desc
(),
output_grad_data
+
output_grad_offset
*
g
,
args1
.
wdesc
.
desc
(),
filter_data
+
filter_offset
*
g
,
args1
.
cdesc
.
desc
(),
data_algo
,
&
beta
,
args1
.
odesc
.
desc
(),
input_grad_data
+
input_offset
*
g
,
cudnn_workspace
,
workspace_size
));
};
#else // PADDLE_WITH_HIP
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnConvolutionForward
(
...
...
@@ -513,6 +563,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
data_algo
,
cudnn_workspace
,
workspace_size
,
&
beta
,
args1
.
odesc
.
desc
(),
input_grad_data
+
input_offset
*
g
));
};
#endif // PADDLE_WITH_HIP
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size
);
}
...
...
@@ -540,6 +591,18 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
#ifdef PADDLE_WITH_HIP
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionBackwardWeights
(
handle
,
&
alpha
,
args2
.
odesc
.
desc
(),
input_data
+
input_offset
*
g
,
args2
.
idesc
.
desc
(),
output_grad_data
+
output_grad_offset
*
g
,
args2
.
cdesc
.
desc
(),
filter_algo
,
&
beta
,
args2
.
wdesc
.
desc
(),
filter_grad_data
+
filter_offset
*
g
,
cudnn_workspace
,
workspace_size
));
};
#else // PADDLE_WITH_HIP
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
...
...
@@ -549,6 +612,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
filter_algo
,
cudnn_workspace
,
workspace_size
,
&
beta
,
args2
.
wdesc
.
desc
(),
filter_grad_data
+
filter_offset
*
g
));
};
#endif // PADDLE_WITH_HIP
workspace_handle
.
RunFunc
(
cudnn_func
,
workspace_size
);
}
}
...
...
@@ -840,7 +904,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
ConvArgs
args4
{
&
transformed_dO
,
ddW
,
&
transformed_dX_channel
,
strides
,
padding_common
,
dilations
,
dtype
};
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t
bwd_algo1
=
static_cast
<
miopenConvBwdDataAlgorithm_t
>
(
0
);
miopenConvBwdDataAlgorithm_t
bwd_algo2
=
static_cast
<
miopenConvBwdDataAlgorithm_t
>
(
0
);
miopenConvFwdAlgorithm_t
data_algo
=
static_cast
<
miopenConvFwdAlgorithm_t
>
(
0
);
miopenConvBwdWeightsAlgorithm_t
filter_algo
=
static_cast
<
miopenConvBwdWeightsAlgorithm_t
>
(
0
);
#else
cudnnConvolutionBwdDataAlgo_t
bwd_algo1
=
static_cast
<
cudnnConvolutionBwdDataAlgo_t
>
(
0
);
cudnnConvolutionBwdDataAlgo_t
bwd_algo2
=
...
...
@@ -849,6 +922,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
0
);
cudnnConvolutionBwdFilterAlgo_t
filter_algo
=
static_cast
<
cudnnConvolutionBwdFilterAlgo_t
>
(
0
);
#endif
auto
layout
=
GetCudnnTensorFormat
(
platform
::
DataLayout
::
kNCHW
);
...
...
@@ -866,7 +940,11 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
args1
.
wdesc
.
set
(
*
W
,
layout
,
iwo_group
);
args1
.
odesc
.
set
(
transformed_ddX
,
iwo_group
);
args1
.
cdesc
.
set
(
dtype
,
padding_common
,
strides
,
dilations
,
c_group
);
#ifdef PADDLE_WITH_HIP
using
search1
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
#else
using
search1
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
#endif
bwd_algo1
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
ctx
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
bwd_algo1
);
}
...
...
@@ -878,7 +956,11 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
args2
.
wdesc
.
set
(
*
ddW
,
layout
,
iwo_group
);
args2
.
odesc
.
set
(
transformed_X
,
iwo_group
);
args2
.
cdesc
.
set
(
dtype
,
padding_common
,
strides
,
dilations
,
c_group
);
#ifdef PADDLE_WITH_HIP
using
search2
=
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
#else
using
search2
=
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
#endif
bwd_algo2
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
ctx
);
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
bwd_algo2
));
...
...
@@ -894,8 +976,11 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
args3
.
odesc
.
set
(
transformed_ddX_channel
,
iwo_group
);
args3
.
cdesc
.
set
(
dtype
,
padding_common
,
strides
,
dilations
,
c_group
);
#ifdef PADDLE_WITH_HIP
using
search3
=
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
#else
using
search3
=
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
#endif
filter_algo
=
search3
::
Find
<
T
>
(
args3
,
false
,
deterministic
,
ctx
);
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
,
filter_algo
));
...
...
@@ -909,8 +994,11 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
args4
.
wdesc
.
set
(
*
ddW
,
layout
,
iwo_group
);
args4
.
odesc
.
set
(
transformed_dX_channel
,
iwo_group
);
args4
.
cdesc
.
set
(
dtype
,
padding_common
,
strides
,
dilations
,
c_group
);
#ifdef PADDLE_WITH_HIP
using
search4
=
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
#else
using
search4
=
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
#endif
data_algo
=
search4
::
Find
<
T
>
(
args4
,
false
,
deterministic
,
ctx
);
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
,
data_algo
));
...
...
@@ -939,6 +1027,20 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
if
(
ddX
)
{
ddx
=
transformed_ddX
.
data
<
T
>
();
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
#ifdef PADDLE_WITH_HIP
wkspace_handle
.
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionBackwardData
(
handle
,
&
alpha
,
args1
.
odesc
.
desc
(),
ddx
+
i
*
group_offset_in
,
args1
.
wdesc
.
desc
(),
w
+
i
*
group_offset_filter
,
args1
.
cdesc
.
desc
(),
bwd_algo1
,
&
beta
,
args1
.
idesc
.
desc
(),
transformed_ddy_channel
+
i
*
group_offset_out
,
workspace_ptr
,
workspace_size
));
},
workspace_size
);
#else // PADDLE_WITH_HIP
wkspace_handle
.
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
@@ -951,10 +1053,25 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
transformed_ddy_channel
+
i
*
group_offset_out
));
},
workspace_size
);
#endif // PADDLE_WITH_HIP
}
}
if
(
ddW
)
{
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
#ifdef PADDLE_WITH_HIP
wkspace_handle
.
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionBackwardData
(
handle
,
&
alpha
,
args2
.
odesc
.
desc
(),
x
+
i
*
group_offset_in
,
args2
.
wdesc
.
desc
(),
ddw
+
i
*
group_offset_filter
,
args2
.
cdesc
.
desc
(),
bwd_algo2
,
&
alpha
,
args2
.
idesc
.
desc
(),
transformed_ddy_channel
+
i
*
group_offset_out
,
workspace_ptr
,
workspace_size
));
},
workspace_size
);
#else // PADDLE_WITH_HIP
wkspace_handle
.
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
@@ -967,6 +1084,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
transformed_ddy_channel
+
i
*
group_offset_out
));
},
workspace_size
);
#endif // PADDLE_WITH_HIP
}
}
if
((
!
is_sys_pad
)
&&
(
!
channel_last
))
{
...
...
@@ -997,6 +1115,20 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
if
(
dW
&&
ddX
)
{
ddx
=
transformed_ddX_channel
.
data
<
T
>
();
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
#ifdef PADDLE_WITH_HIP
wkspace_handle
.
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionBackwardWeights
(
handle
,
&
alpha
,
args3
.
odesc
.
desc
(),
ddx
+
i
*
group_offset_in
,
args3
.
idesc
.
desc
(),
transformed_dy_channel
+
i
*
group_offset_out
,
args3
.
cdesc
.
desc
(),
filter_algo
,
&
beta
,
args3
.
wdesc
.
desc
(),
dw
+
i
*
group_offset_filter
,
workspace_ptr
,
workspace_size
));
},
workspace_size
);
#else // PADDLE_WITH_HIP
wkspace_handle
.
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
@@ -1009,12 +1141,27 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
dw
+
i
*
group_offset_filter
));
},
workspace_size
);
#endif // PADDLE_WITH_HIP
}
}
if
(
dX
&&
ddW
)
{
ddw
=
ddW
->
data
<
T
>
();
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
#ifdef PADDLE_WITH_HIP
wkspace_handle
.
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenConvolutionForward
(
handle
,
&
alpha
,
args4
.
idesc
.
desc
(),
transformed_dy_channel
+
i
*
group_offset_out
,
args4
.
wdesc
.
desc
(),
ddw
+
i
*
group_offset_filter
,
args4
.
cdesc
.
desc
(),
data_algo
,
&
beta
,
args4
.
odesc
.
desc
(),
transformed_dx
+
i
*
group_offset_in
,
workspace_ptr
,
workspace_size
));
},
workspace_size
);
#else // PADDLE_WITH_HIP
wkspace_handle
.
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
@@ -1027,6 +1174,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
transformed_dx
+
i
*
group_offset_in
));
},
workspace_size
);
#endif // PADDLE_WITH_HIP
}
if
(
channel_last
)
{
TransToChannelLast
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
(
...
...
@@ -1042,6 +1190,26 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_KERNEL
(
conv2d_transpose
,
CUDNN
,
::
paddle
::
platform
::
CUDAPlace
,
ops
::
CUDNNConvTransposeOpKernel
<
plat
::
float16
>
,
ops
::
CUDNNConvTransposeOpKernel
<
float
>
);
REGISTER_OP_KERNEL
(
conv2d_transpose_grad
,
CUDNN
,
::
paddle
::
platform
::
CUDAPlace
,
ops
::
CUDNNConvTransposeGradOpKernel
<
plat
::
float16
>
,
ops
::
CUDNNConvTransposeGradOpKernel
<
float
>
);
REGISTER_OP_KERNEL
(
conv2d_transpose_grad_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvTransposeDoubleGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvTransposeDoubleGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d_transpose
,
CUDNN
,
::
paddle
::
platform
::
CUDAPlace
,
ops
::
CUDNNConvTransposeOpKernel
<
plat
::
float16
>
,
ops
::
CUDNNConvTransposeOpKernel
<
float
>
);
REGISTER_OP_KERNEL
(
conv3d_transpose_grad
,
CUDNN
,
::
paddle
::
platform
::
CUDAPlace
,
ops
::
CUDNNConvTransposeGradOpKernel
<
plat
::
float16
>
,
ops
::
CUDNNConvTransposeGradOpKernel
<
float
>
);
#else
REGISTER_OP_KERNEL
(
conv2d_transpose
,
CUDNN
,
::
paddle
::
platform
::
CUDAPlace
,
ops
::
CUDNNConvTransposeOpKernel
<
plat
::
float16
>
,
ops
::
CUDNNConvTransposeOpKernel
<
float
>
,
...
...
@@ -1064,3 +1232,4 @@ REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops
::
CUDNNConvTransposeGradOpKernel
<
plat
::
float16
>
,
ops
::
CUDNNConvTransposeGradOpKernel
<
float
>
,
ops
::
CUDNNConvTransposeGradOpKernel
<
double
>
);
#endif
paddle/fluid/operators/conv_transpose_op.cc
浏览文件 @
72d99c5d
...
...
@@ -183,7 +183,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
bool
use_cudnn
=
ctx
.
Attr
<
bool
>
(
"use_cudnn"
);
use_cudnn
&=
platform
::
is_gpu_place
(
ctx
.
GetPlace
());
auto
data_type
=
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"Input"
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
use_cudnn
&=
dev_ctx
.
cudnn_handle
()
!=
nullptr
;
...
...
@@ -481,7 +481,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
const
framework
::
ExecutionContext
&
ctx
)
const
{
bool
use_cudnn
=
ctx
.
Attr
<
bool
>
(
"use_cudnn"
);
use_cudnn
&=
platform
::
is_gpu_place
(
ctx
.
GetPlace
());
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
use_cudnn
&=
dev_ctx
.
cudnn_handle
()
!=
nullptr
;
...
...
@@ -581,7 +581,7 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType(
const
framework
::
ExecutionContext
&
ctx
)
const
{
bool
use_cudnn
=
ctx
.
Attr
<
bool
>
(
"use_cudnn"
);
use_cudnn
&=
platform
::
is_gpu_place
(
ctx
.
GetPlace
());
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
use_cudnn
&=
dev_ctx
.
cudnn_handle
()
!=
nullptr
;
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
72d99c5d
...
...
@@ -28,15 +28,12 @@ function(math_library TARGET)
if
(
EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
TARGET
}
.cu.cc
)
list
(
APPEND cu_srcs
${
TARGET
}
.cu.cc
)
endif
()
if
(
EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
TARGET
}
.hip.cu
)
list
(
APPEND hip_srcs
${
TARGET
}
.hip.cu
)
endif
()
list
(
LENGTH cc_srcs cc_srcs_len
)
if
(
WITH_GPU
)
nv_library
(
${
TARGET
}
SRCS
${
cc_srcs
}
${
cu_srcs
}
DEPS
${
math_library_DEPS
}
${
math_common_deps
}
)
elseif
(
WITH_ROCM
_PLATFORM
AND
(
${
hip_srcs
}
MATCHES
".*
\\
.hip.cu$"
)
)
hip_library
_ops
(
${
TARGET
}
SRCS
${
cc_srcs
}
${
hip
_srcs
}
DEPS
${
math_library_DEPS
}
${
math_common_deps
}
)
elseif
(
WITH_ROCM
)
hip_library
(
${
TARGET
}
SRCS
${
cc_srcs
}
${
cu
_srcs
}
DEPS
${
math_library_DEPS
}
${
math_common_deps
}
)
elseif
(
${
cc_srcs_len
}
GREATER 0
)
cc_library
(
${
TARGET
}
SRCS
${
cc_srcs
}
DEPS
${
math_library_DEPS
}
${
math_common_deps
}
)
endif
()
...
...
@@ -89,6 +86,10 @@ if(WITH_GPU)
nv_test
(
math_function_gpu_test SRCS math_function_test.cu DEPS math_function
)
nv_test
(
selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function
)
endif
()
if
(
WITH_ROCM
)
hip_test
(
math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor
)
hip_test
(
selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function
)
endif
()
cc_test
(
concat_test SRCS concat_test.cc DEPS concat_and_split
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
if
(
WITH_TESTING AND TEST im2col_test
)
...
...
paddle/fluid/operators/math/concat_test.cc
浏览文件 @
72d99c5d
...
...
@@ -442,7 +442,7 @@ void TestConcatMain() {
TEST
(
math
,
concat
)
{
TestConcatMain
<
paddle
::
platform
::
CPUDeviceContext
,
paddle
::
platform
::
CPUPlace
>
();
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TestConcatMain
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
();
#endif
...
...
paddle/fluid/operators/math/pooling.cc
浏览文件 @
72d99c5d
...
...
@@ -30,8 +30,9 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_process
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_height
=
input
.
dims
()[
2
];
const
int
input_width
=
input
.
dims
()[
3
];
...
...
@@ -104,8 +105,8 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_process
)
{
bool
channel_last
=
(
data_format
==
"NHWC"
);
const
int
batch_size
=
input
.
dims
()[
0
];
...
...
@@ -249,8 +250,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_grad_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_grad_process
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_height
=
input
.
dims
()[
2
];
const
int
input_width
=
input
.
dims
()[
3
];
...
...
@@ -328,8 +329,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_grad_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_grad_process
)
{
bool
channel_last
=
(
data_format
==
"NHWC"
);
const
int
batch_size
=
input
.
dims
()[
0
];
...
...
@@ -678,8 +679,9 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_process
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_depth
=
input
.
dims
()[
2
];
const
int
input_height
=
input
.
dims
()[
3
];
...
...
@@ -773,8 +775,8 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_process
)
{
bool
channel_last
=
(
data_format
==
"NDHWC"
);
const
int
batch_size
=
input
.
dims
()[
0
];
...
...
@@ -970,8 +972,8 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_grad_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_grad_process
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_depth
=
input
.
dims
()[
2
];
const
int
input_height
=
input
.
dims
()[
3
];
...
...
@@ -1071,8 +1073,8 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_grad_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_grad_process
)
{
bool
channel_last
=
(
data_format
==
"NDHWC"
);
const
int
batch_size
=
input
.
dims
()[
0
];
...
...
paddle/fluid/operators/math/pooling.cu
浏览文件 @
72d99c5d
...
...
@@ -237,8 +237,8 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
const
T
*
input
,
const
std
::
vector
<
int
>&
input_shape
,
const
std
::
vector
<
int
>&
output_shape
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
bool
exclusive
,
bool
adaptive
,
T
*
output
,
cudaStream_t
stream
)
{
bool
exclusive
,
bool
adaptive
,
T
*
output
,
gpuStream_t
stream
,
PoolProcess
pool_compute
)
{
const
int
batch_size
=
input_shape
[
0
];
const
int
input_channels
=
input_shape
[
1
];
const
int
input_height
=
input_shape
[
2
];
...
...
@@ -277,8 +277,9 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_process
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_height
=
input
.
dims
()[
2
];
...
...
@@ -311,8 +312,8 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_process
)
{
bool
channel_last
=
(
data_format
==
"NHWC"
);
const
int
batch_size
=
input
.
dims
()[
0
];
...
...
@@ -367,9 +368,9 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_process
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_height
=
input
.
dims
()[
2
];
...
...
@@ -399,13 +400,15 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
ksize_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
pool_process
,
exclusive
,
adaptive
,
input_grad_data
);
}
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_process
)
{
bool
channel_last
=
(
data_format
==
"NHWC"
);
const
int
batch_size
=
input
.
dims
()[
0
];
...
...
@@ -881,8 +884,9 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_process
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_depth
=
input
.
dims
()[
2
];
...
...
@@ -922,8 +926,8 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_process
)
{
bool
channel_last
=
(
data_format
==
"NDHWC"
);
const
int
batch_size
=
input
.
dims
()[
0
];
...
...
@@ -988,9 +992,9 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_process
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_depth
=
input
.
dims
()[
2
];
...
...
@@ -1028,13 +1032,15 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
stride_height
,
stride_width
,
padding_depth
,
padding_height
,
padding_width
,
pool_process
,
exclusive
,
adaptive
,
input_grad_data
);
}
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_process
)
{
bool
channel_last
=
(
data_format
==
"NDHWC"
);
const
int
batch_size
=
input
.
dims
()[
0
];
...
...
paddle/fluid/operators/math/pooling.h
浏览文件 @
72d99c5d
...
...
@@ -97,7 +97,7 @@ HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
* This is different from average pooling. So we rewrite the max_pool_grad:
* MaxPool2dGradFunctor, MaxPool3dGradFunctor.
*/
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template
<
typename
PoolProcess
,
typename
T
>
class
Pool2dDirectCUDAFunctor
{
public:
...
...
@@ -105,9 +105,9 @@ class Pool2dDirectCUDAFunctor {
const
std
::
vector
<
int
>&
output_shape
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_comput
e
,
bool
exclusive
,
bool
adaptive
,
T
*
output
,
cudaStream_t
stream
);
const
std
::
vector
<
int
>&
paddings
,
bool
exclusiv
e
,
bool
adaptive
,
T
*
output
,
gpuStream_t
stream
,
PoolProcess
pool_compute
);
};
#endif
...
...
@@ -117,16 +117,17 @@ class Pool2dFunctor {
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
);
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_compute
);
// overload operator() to support argument data_format
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_comput
e
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
);
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptiv
e
,
framework
::
Tensor
*
output
,
PoolProcess
pool_compute
);
};
template
<
typename
DeviceContext
,
typename
PoolProcess
,
typename
T
>
...
...
@@ -137,8 +138,9 @@ class Pool2dGradFunctor {
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
);
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_compute
);
// overload operator() to support argument data_format
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
...
...
@@ -146,8 +148,8 @@ class Pool2dGradFunctor {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_comput
e
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
);
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptiv
e
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_compute
);
};
template
<
typename
DeviceContext
,
class
T
>
...
...
@@ -176,15 +178,16 @@ class Pool3dFunctor {
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
);
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
,
PoolProcess
pool_compute
);
// overload operator() to support argument data_format
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_comput
e
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
);
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptiv
e
,
framework
::
Tensor
*
output
,
PoolProcess
pool_compute
);
};
template
<
typename
DeviceContext
,
typename
PoolProcess
,
typename
T
>
...
...
@@ -195,8 +198,9 @@ class Pool3dGradFunctor {
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
);
const
std
::
vector
<
int
>&
paddings
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_compute
);
// overload operator() to support argument data_format
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
...
...
@@ -204,8 +208,8 @@ class Pool3dGradFunctor {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
string
data_format
,
PoolProcess
pool_comput
e
,
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
);
const
std
::
string
data_format
,
bool
exclusive
,
bool
adaptiv
e
,
framework
::
Tensor
*
input_grad
,
PoolProcess
pool_compute
);
};
template
<
typename
DeviceContext
,
class
T
>
...
...
paddle/fluid/operators/pool_cudnn_op.cu.cc
浏览文件 @
72d99c5d
...
...
@@ -16,7 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/pool_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
...
...
@@ -122,7 +127,32 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
out_dims_vec
[
3
]
=
output
->
dims
()[
2
];
out_dims_vec
[
4
]
=
output
->
dims
()[
3
];
transformed_output
.
Resize
(
framework
::
make_ddim
(
out_dims_vec
));
#ifdef PADDLE_WITH_HIP
// MIOPEN not support NHWC data layout
}
else
if
(
data_format
==
str_NHWC
)
{
layout
=
DataLayout
::
kNCHW
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
CUDADeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
3
,
1
,
2
};
transformed_input
.
Resize
(
input
->
dims
());
auto
in_dims_vec
=
framework
::
vectorize
(
input
->
dims
());
in_dims_vec
[
1
]
=
input
->
dims
()[
3
];
in_dims_vec
[
2
]
=
input
->
dims
()[
1
];
in_dims_vec
[
3
]
=
input
->
dims
()[
2
];
transformed_input
.
Resize
(
framework
::
make_ddim
(
in_dims_vec
));
transformed_input
.
mutable_data
(
ctx
.
GetPlace
(),
input
->
type
());
math
::
Transpose
<
paddle
::
platform
::
CUDADeviceContext
,
T
,
4
>
trans
;
trans
(
dev_ctx
,
*
input
,
&
transformed_input
,
axis
);
transformed_output
.
Resize
(
output
->
dims
());
auto
out_dims_vec
=
framework
::
vectorize
(
output
->
dims
());
out_dims_vec
[
1
]
=
output
->
dims
()[
3
];
out_dims_vec
[
2
]
=
output
->
dims
()[
1
];
out_dims_vec
[
3
]
=
output
->
dims
()[
2
];
transformed_output
.
Resize
(
framework
::
make_ddim
(
out_dims_vec
));
#endif
}
else
{
layout
=
getLayoutFromStr
(
data_format
);
transformed_input
=
*
input
;
...
...
@@ -138,11 +168,17 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
ScopedTensorDescriptor
output_desc
;
ScopedPoolingDescriptor
pool_desc
;
#ifdef PADDLE_WITH_HIP
miopenTensorDescriptor_t
cudnn_input_desc
=
input_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize
<
int
>
(
transformed_input
.
dims
()));
miopenTensorDescriptor_t
cudnn_output_desc
=
output_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize
<
int
>
(
transformed_output
.
dims
()));
#else
cudnnTensorDescriptor_t
cudnn_input_desc
=
input_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize
<
int
>
(
transformed_input
.
dims
()));
cudnnTensorDescriptor_t
cudnn_output_desc
=
output_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize
<
int
>
(
transformed_output
.
dims
()));
#endif
PoolingMode
pooling_mode
;
if
(
pooling_type
==
"max"
)
{
pooling_mode
=
PoolingMode
::
kMaximum
;
...
...
@@ -151,17 +187,36 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
:
PoolingMode
::
kAverageInclusive
;
}
#ifdef PADDLE_WITH_HIP
miopenPoolingDescriptor_t
cudnn_pool_desc
=
pool_desc
.
descriptor
(
pooling_mode
,
ksize
,
paddings
,
strides
);
#else
cudnnPoolingDescriptor_t
cudnn_pool_desc
=
pool_desc
.
descriptor
(
pooling_mode
,
ksize
,
paddings
,
strides
);
#endif
// ------------------- cudnn pool algorithm ---------------------
auto
handle
=
ctx
.
cuda_device_context
().
cudnn_handle
();
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
#ifdef PADDLE_WITH_HIP
char
*
pool_workspace
;
size_t
pool_worksize
=
0
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenPoolingGetWorkSpaceSizeV2
(
cudnn_pool_desc
,
cudnn_output_desc
,
&
pool_worksize
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipMalloc
(
&
pool_workspace
,
pool_worksize
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenPoolingForward
(
handle
,
cudnn_pool_desc
,
&
alpha
,
cudnn_input_desc
,
tranformed_input_data
,
&
beta
,
cudnn_output_desc
,
tranformed_output_data
,
false
,
pool_workspace
,
pool_worksize
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipFree
(
pool_workspace
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnPoolingForward
(
handle
,
cudnn_pool_desc
,
&
alpha
,
cudnn_input_desc
,
tranformed_input_data
,
&
beta
,
cudnn_output_desc
,
tranformed_output_data
));
#endif
// add
if
(
data_format
==
str_NDHWC
)
{
auto
&
dev_ctx
=
...
...
@@ -170,6 +225,16 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
math
::
Transpose
<
paddle
::
platform
::
CUDADeviceContext
,
T
,
5
>
trans5_v2
;
trans5_v2
(
dev_ctx
,
transformed_output
,
output
,
axis
);
}
#ifdef PADDLE_WITH_HIP
// MIOPEN not support NHWC data layout
if
(
data_format
==
str_NHWC
)
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
CUDADeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
2
,
3
,
1
};
math
::
Transpose
<
paddle
::
platform
::
CUDADeviceContext
,
T
,
4
>
trans
;
trans
(
dev_ctx
,
transformed_output
,
output
,
axis
);
}
#endif
}
};
...
...
@@ -272,6 +337,49 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
// input grad
transformed_input_grad
.
Resize
(
framework
::
make_ddim
(
in_dims_vec
));
#ifdef PADDLE_WITH_HIP
// MIOPEN not support NHWC data layout
}
else
if
(
data_format
==
str_NHWC
)
{
layout
=
DataLayout
::
kNCHW
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
CUDADeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
3
,
1
,
2
};
// input
transformed_input
.
Resize
(
input
->
dims
());
auto
in_dims_vec
=
framework
::
vectorize
(
input
->
dims
());
in_dims_vec
[
1
]
=
input
->
dims
()[
3
];
in_dims_vec
[
2
]
=
input
->
dims
()[
1
];
in_dims_vec
[
3
]
=
input
->
dims
()[
2
];
transformed_input
.
Resize
(
framework
::
make_ddim
(
in_dims_vec
));
transformed_input
.
mutable_data
(
ctx
.
GetPlace
(),
input
->
type
());
math
::
Transpose
<
paddle
::
platform
::
CUDADeviceContext
,
T
,
4
>
trans4
;
trans4
(
dev_ctx
,
*
input
,
&
transformed_input
,
axis
);
// output
transformed_output
.
Resize
(
output
->
dims
());
auto
out_dims_vec
=
framework
::
vectorize
(
output
->
dims
());
out_dims_vec
[
1
]
=
output
->
dims
()[
3
];
out_dims_vec
[
2
]
=
output
->
dims
()[
1
];
out_dims_vec
[
3
]
=
output
->
dims
()[
2
];
transformed_output
.
Resize
(
framework
::
make_ddim
(
out_dims_vec
));
transformed_output
.
mutable_data
(
ctx
.
GetPlace
(),
output
->
type
());
math
::
Transpose
<
paddle
::
platform
::
CUDADeviceContext
,
T
,
4
>
trans4_v2
;
trans4_v2
(
dev_ctx
,
*
output
,
&
transformed_output
,
axis
);
// output grad
transformed_output_grad
.
Resize
(
framework
::
make_ddim
(
out_dims_vec
));
transformed_output_grad
.
mutable_data
(
ctx
.
GetPlace
(),
output_grad
->
type
());
math
::
Transpose
<
paddle
::
platform
::
CUDADeviceContext
,
T
,
4
>
trans4_v3
;
trans4_v3
(
dev_ctx
,
*
output_grad
,
&
transformed_output_grad
,
axis
);
// input grad
transformed_input_grad
.
Resize
(
framework
::
make_ddim
(
in_dims_vec
));
#endif
}
else
{
layout
=
getLayoutFromStr
(
data_format
);
transformed_input
=
*
input
;
...
...
@@ -289,11 +397,17 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
ScopedTensorDescriptor
output_desc
;
ScopedPoolingDescriptor
pool_desc
;
#ifdef PADDLE_WITH_HIP
miopenTensorDescriptor_t
cudnn_input_desc
=
input_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize
<
int
>
(
transformed_input
.
dims
()));
miopenTensorDescriptor_t
cudnn_output_desc
=
output_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize
<
int
>
(
transformed_output
.
dims
()));
#else
cudnnTensorDescriptor_t
cudnn_input_desc
=
input_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize
<
int
>
(
transformed_input
.
dims
()));
cudnnTensorDescriptor_t
cudnn_output_desc
=
output_desc
.
descriptor
<
T
>
(
layout
,
framework
::
vectorize
<
int
>
(
transformed_output
.
dims
()));
#endif
PoolingMode
pooling_mode
;
if
(
pooling_type
==
"max"
)
{
if
(
FLAGS_cudnn_deterministic
)
{
...
...
@@ -306,8 +420,13 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
:
PoolingMode
::
kAverageInclusive
;
}
#ifdef PADDLE_WITH_HIP
miopenPoolingDescriptor_t
cudnn_pool_desc
=
pool_desc
.
descriptor
(
pooling_mode
,
ksize
,
paddings
,
strides
);
#else
cudnnPoolingDescriptor_t
cudnn_pool_desc
=
pool_desc
.
descriptor
(
pooling_mode
,
ksize
,
paddings
,
strides
);
#endif
// ------------------- cudnn pool algorithm ---------------------
auto
handle
=
ctx
.
cuda_device_context
().
cudnn_handle
();
...
...
@@ -315,11 +434,25 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
if
(
input_grad
)
{
T
*
input_grad_data
=
transformed_input_grad
.
mutable_data
<
T
>
(
transformed_input_grad
.
dims
(),
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset input_grad.
// Because beta is zero, it is unnecessary to reset input_grad.
#ifdef PADDLE_WITH_HIP
char
*
pool_workspace
;
size_t
pool_worksize
=
0
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenPoolingGetWorkSpaceSizeV2
(
cudnn_pool_desc
,
cudnn_output_desc
,
&
pool_worksize
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipMalloc
(
&
pool_workspace
,
pool_worksize
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
miopenPoolingBackward
(
handle
,
cudnn_pool_desc
,
&
alpha
,
cudnn_output_desc
,
output_data
,
cudnn_output_desc
,
output_grad_data
,
cudnn_input_desc
,
input_data
,
&
beta
,
cudnn_input_desc
,
input_grad_data
,
pool_workspace
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
hipFree
(
pool_workspace
));
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnPoolingBackward
(
handle
,
cudnn_pool_desc
,
&
alpha
,
cudnn_output_desc
,
output_data
,
cudnn_output_desc
,
output_grad_data
,
cudnn_input_desc
,
input_data
,
&
beta
,
cudnn_input_desc
,
input_grad_data
));
#endif
if
(
data_format
==
str_NDHWC
)
{
auto
&
dev_ctx
=
...
...
@@ -328,6 +461,16 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
math
::
Transpose
<
paddle
::
platform
::
CUDADeviceContext
,
T
,
5
>
trans5_v4
;
trans5_v4
(
dev_ctx
,
transformed_input_grad
,
input_grad
,
axis
);
}
#ifdef PADDLE_WITH_HIP
// MIOPEN not support NHWC data layout
if
(
data_format
==
str_NHWC
)
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
CUDADeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
2
,
3
,
1
};
math
::
Transpose
<
paddle
::
platform
::
CUDADeviceContext
,
T
,
4
>
trans4_v4
;
trans4_v4
(
dev_ctx
,
transformed_input_grad
,
input_grad
,
axis
);
}
#endif
}
}
};
...
...
@@ -338,6 +481,21 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_KERNEL
(
pool2d
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
);
#else
REGISTER_OP_KERNEL
(
pool2d
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
ops
::
PoolCUDNNOpKernel
<
double
>
,
...
...
@@ -354,3 +512,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
REGISTER_OP_KERNEL
(
pool3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
#endif
paddle/fluid/operators/pool_op.cc
浏览文件 @
72d99c5d
...
...
@@ -18,6 +18,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
...
...
@@ -180,7 +183,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
auto
data_type
=
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"X"
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
CanCUDNNBeUsed
(
ctx
))
{
library_
=
framework
::
LibraryType
::
kCUDNN
;
}
...
...
@@ -235,7 +238,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
auto
input_data_type
=
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"X"
);
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
CanCUDNNBeUsed
(
ctx
))
{
library_
=
framework
::
LibraryType
::
kCUDNN
;
}
...
...
paddle/fluid/operators/pool_op.h
浏览文件 @
72d99c5d
...
...
@@ -205,7 +205,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward
;
paddle
::
operators
::
math
::
MaxPool
<
T
>
pool_process
;
pool2d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
data_format
,
pool_process
,
true
,
false
,
out
);
true
,
false
,
out
,
pool_process
);
}
else
if
(
pooling_type
==
"avg"
)
{
std
::
vector
<
int
>
reduce_dim
;
...
...
@@ -213,7 +213,12 @@ class PoolKernel : public framework::OpKernel<T> {
if
(
reduce_num
>
0
&&
adaptive
)
{
// for adaptive_avg_pool2d && output_size == 1
#ifdef __NVCC__
#ifdef __HIPCC__
auto
stream
=
dev_ctx
.
stream
();
TensorReduce
<
T
,
T
,
hipcub
::
Sum
,
DivideFunctor
<
T
>>
(
*
in_x
,
out
,
reduce_dim
,
static_cast
<
T
>
(
0
),
hipcub
::
Sum
(),
DivideFunctor
<
T
>
(
reduce_num
),
stream
);
#elif defined(__NVCC__)
auto
stream
=
dev_ctx
.
stream
();
TensorReduce
<
T
,
T
,
cub
::
Sum
,
DivideFunctor
<
T
>>
(
*
in_x
,
out
,
reduce_dim
,
static_cast
<
T
>
(
0
),
cub
::
Sum
(),
...
...
@@ -224,7 +229,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward
;
paddle
::
operators
::
math
::
AvgPool
<
T
>
pool_process
;
pool2d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
data_format
,
pool_process
,
exclusive
,
adaptive
,
out
);
data_format
,
exclusive
,
adaptive
,
out
,
pool_process
);
#endif
}
else
{
// avgpool_2d or adaptive_avg_pool2d && output_size != 1
paddle
::
operators
::
math
::
Pool2dFunctor
<
...
...
@@ -232,7 +237,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward
;
paddle
::
operators
::
math
::
AvgPool
<
T
>
pool_process
;
pool2d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
data_format
,
pool_process
,
exclusive
,
adaptive
,
out
);
data_format
,
exclusive
,
adaptive
,
out
,
pool_process
);
}
}
}
break
;
...
...
@@ -243,7 +248,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool3d_forward
;
paddle
::
operators
::
math
::
MaxPool
<
T
>
pool_process
;
pool3d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
data_format
,
pool_process
,
true
,
false
,
out
);
true
,
false
,
out
,
pool_process
);
}
else
if
(
pooling_type
==
"avg"
)
{
paddle
::
operators
::
math
::
Pool3dFunctor
<
...
...
@@ -251,7 +256,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool3d_forward
;
paddle
::
operators
::
math
::
AvgPool
<
T
>
pool_process
;
pool3d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
data_format
,
pool_process
,
exclusive
,
adaptive
,
out
);
exclusive
,
adaptive
,
out
,
pool_process
);
}
}
break
;
default:
{
...
...
@@ -324,8 +329,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
pool2d_backward
;
paddle
::
operators
::
math
::
AvgPoolGrad
<
T
>
pool_process
;
pool2d_backward
(
dev_ctx
,
*
in_x
,
*
out
,
*
out_grad
,
ksize
,
strides
,
paddings
,
data_format
,
pool_process
,
exclus
ive
,
adaptive
,
in_x_grad
);
paddings
,
data_format
,
exclusive
,
adapt
ive
,
in_x_grad
,
pool_process
);
}
}
break
;
case
3
:
{
...
...
@@ -340,8 +345,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
pool3d_backward
;
paddle
::
operators
::
math
::
AvgPoolGrad
<
T
>
pool_process
;
pool3d_backward
(
dev_ctx
,
*
in_x
,
*
out
,
*
out_grad
,
ksize
,
strides
,
paddings
,
data_format
,
pool_process
,
exclus
ive
,
adaptive
,
in_x_grad
);
paddings
,
data_format
,
exclusive
,
adapt
ive
,
in_x_grad
,
pool_process
);
}
}
break
;
default:
{
...
...
paddle/fluid/operators/spp_op.h
浏览文件 @
72d99c5d
...
...
@@ -56,14 +56,14 @@ class SppKernel : public framework::OpKernel<T> {
math
::
Pool2dFunctor
<
DeviceContext
,
math
::
MaxPool
<
T
>
,
T
>
pool_forward
;
math
::
MaxPool
<
T
>
max_process
;
pool_forward
(
context
.
template
device_context
<
DeviceContext
>(),
*
in_x
,
kernel_size
,
strides
,
paddings
,
max_process
,
true
,
false
,
&
out_level
);
kernel_size
,
strides
,
paddings
,
true
,
false
,
&
out_level
,
max_process
);
}
else
if
(
pooling_type
==
"avg"
)
{
math
::
Pool2dFunctor
<
DeviceContext
,
math
::
AvgPool
<
T
>
,
T
>
pool_forward
;
math
::
AvgPool
<
T
>
avg_process
;
pool_forward
(
context
.
template
device_context
<
DeviceContext
>(),
*
in_x
,
kernel_size
,
strides
,
paddings
,
avg_process
,
true
,
false
,
&
out_level
);
kernel_size
,
strides
,
paddings
,
true
,
false
,
&
out_level
,
avg_process
);
}
// flatten pooling output shape
int
output_flatten_w
=
in_x
->
dims
()[
1
]
*
bins
*
bins
;
...
...
@@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel<T> {
math
::
AvgPoolGrad
<
T
>
avg_process
;
pool_backward
(
context
.
template
device_context
<
DeviceContext
>(),
*
in_x
,
*&
out_level
,
*&
outgrad_level
,
kernel_size
,
strides
,
paddings
,
avg_process
,
true
,
false
,
in_x_grad
);
paddings
,
true
,
false
,
in_x_grad
,
avg_process
);
}
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录