PaddlePaddle / Paddle
Commit 4d647ec1 (unverified)
Authored by Qi Li on Mar 04, 2021; committed by GitHub on Mar 04, 2021
[ROCM] update fluid platform for rocm (part5), test=develop (#31315)
Parent: 522c91ec
Showing 19 changed files with 207 additions and 78 deletions (+207 −78)
Changed files:

paddle/fluid/operators/split_lod_tensor_op.cc (+1 −1)
paddle/fluid/operators/sync_batch_norm_op.cu (+11 −0)
paddle/fluid/operators/sync_batch_norm_op.cu.h (+10 −3)
paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h (+2 −2)
paddle/fluid/operators/top_k_function_cuda.h (+61 −0)
paddle/fluid/operators/top_k_op.cu (+5 −0)
paddle/fluid/operators/trace_op.h (+2 −2)
paddle/fluid/operators/unique_op.cu (+1 −0)
paddle/fluid/operators/unstack_op.h (+4 −4)
paddle/fluid/operators/warpctc_op.cc (+3 −0)
paddle/fluid/operators/warpctc_op.h (+1 −0)
paddle/fluid/platform/cuda_helper.h (+4 −0)
paddle/fluid/platform/device_context.cc (+6 −0)
paddle/fluid/platform/device_context.h (+5 −1)
paddle/fluid/platform/device_context_test.cu (+4 −0)
paddle/fluid/platform/miopen_desc.h (+84 −45)
paddle/fluid/platform/miopen_helper.h (+0 −17)
paddle/fluid/pybind/imperative.cc (+2 −2)
paddle/fluid/pybind/pybind.cc (+1 −1)
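The diffs below repeat one idiom: every CUDA-only preprocessor guard is widened so the same translation unit builds under both NVCC and ROCm's HIP compiler, either by accepting both macros in a single #if or by branching to backend-specific spellings. A minimal standalone sketch of the idiom (hypothetical demo code, not from the commit; only the macro names mirror the diff):

#include <cstdio>

// Pick the backend's runtime and error-string function at compile time.
// __HIPCC__ is defined by the HIP compiler and __NVCC__ by nvcc, exactly
// as in the guards used throughout this commit.
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
static const char* GpuErrorString(hipError_t e) { return hipGetErrorString(e); }
#define GPU_SUCCESS hipSuccess
#else
#include <cuda_runtime.h>
static const char* GpuErrorString(cudaError_t e) { return cudaGetErrorString(e); }
#define GPU_SUCCESS cudaSuccess
#endif

int main() {
  std::printf("status: %s\n", GpuErrorString(GPU_SUCCESS));
  return 0;
}

Compiled with nvcc the CUDA branch is taken; with hipcc the HIP branch is taken, and no call site changes.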
paddle/fluid/operators/split_lod_tensor_op.cc

@@ -65,7 +65,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
     if (platform::is_cpu_place(mask.place())) {
       cpu_mask->ShareDataWith(mask);
     } else if (platform::is_gpu_place(mask.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
                             cpu_mask.get());
 #else
paddle/fluid/operators/sync_batch_norm_op.cu

@@ -91,6 +91,16 @@ class SyncBatchNormGradKernel<platform::CUDADeviceContext, T>
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+REGISTER_OP_CUDA_KERNEL(
+    sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
+    ops::SyncBatchNormKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    sync_batch_norm_grad,
+    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
+    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
+#else
 REGISTER_OP_CUDA_KERNEL(
     sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
     ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>,

@@ -100,5 +110,6 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
     ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>,
     ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
+#endif
 // clang-format on
paddle/fluid/operators/sync_batch_norm_op.cu.h

@@ -19,12 +19,19 @@ limitations under the License. */
 #include <cmath>
 #include <string>
 #include <vector>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/operators/norm_utils.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/nccl_helper.h"

@@ -186,7 +193,7 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
   auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0);
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   auto *comm = dev_ctx.nccl_comm();
   if (comm) {
     int dtype = platform::ToNCCLDataType(mean_out->type());

@@ -460,7 +467,7 @@ void SyncBatchNormGradFunctor(
         dy_d, x_d, saved_mean, N, fsize, C, stats);
   }
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   auto *comm = dev_ctx.nccl_comm();
   if (comm) {
     int dtype = platform::ToNCCLDataType(scale->type());
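The "namespace cub = hipcub;" line above is the load-bearing trick: hipCUB mirrors CUB's namespace-level API on ROCm, so aliasing the namespace lets every existing cub:: call site compile unchanged. A hedged sketch of how a backend-neutral call site then reads (illustrative only; the reduction shown is not from this file):

#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;  // cub:: now resolves to hipcub::
#else
#include "cub/cub.cuh"
#endif

// One call site serves both backends, e.g. a device-wide sum:
//   cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
// On CUDA this is CUB directly; on ROCm it is hipCUB forwarding to rocPRIM.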
paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h

@@ -91,7 +91,7 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
   int64_t limit = x.numel();

-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
   if (platform::is_gpu_place(place)) {
     auto &cuda_dev_ctx = dynamic_cast<platform::CUDADeviceContext &>(dev_ctx);
     functor(cuda_dev_ctx, &x, out, &ddx, &ddout, dout, dx);

@@ -105,7 +105,7 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
     platform::ForRange<platform::CPUDeviceContext> for_range(cpu_dev_ctx,
                                                              limit);
     for_range(actual_functor);
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
   }
 #endif
paddle/fluid/operators/top_k_function_cuda.h

@@ -16,11 +16,26 @@ limitations under the License. */
 #include <stdio.h>
 #include <cstdio>
 #include <vector>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#endif
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
+#ifdef __HIPCC__
+namespace rocprim {
+namespace detail {
+template <>
+struct radix_key_codec_base<paddle::platform::float16>
+    : radix_key_codec_integral<paddle::platform::float16, uint16_t> {};
+}  // namespace detail
+}  // namespace rocprim
+namespace cub = hipcub;
+#else
 // set cub base traits in order to handle float16
 namespace cub {
 template <>

@@ -28,6 +43,7 @@ struct NumericTraits<paddle::platform::float16>
     : BaseTraits<FLOATING_POINT, true, false, uint16_t,
                  paddle::platform::float16> {};
 }  // namespace cub
+#endif

 namespace paddle {
 namespace operators {

@@ -439,6 +455,16 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
         input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
         num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
         cu_stream);
+#ifdef __HIPCC__
+    if (err != hipSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "hipcub::DeviceSegmentedRadixSort::SortPairsDescending to "
+                    "calculate "
+                    "temp_storage_bytes, status: "
+                 << hipGetErrorString(err);
+      return false;
+    }
+#else
     if (err != cudaSuccess) {
       LOG(ERROR) << "TopKOP failed as could not launch "

@@ -447,12 +473,22 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
                  << cudaGetErrorString(err);
       return false;
     }
+#endif
   } else {
     auto err = cub::DeviceSegmentedRadixSort::SortPairs(
         nullptr, temp_storage_bytes, input, sorted_values_ptr,
         input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
         num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
         cu_stream);
+#ifdef __HIPCC__
+    if (err != hipSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "hipcub::DeviceSegmentedRadixSort::SortPairs to calculate "
+                    "temp_storage_bytes, status: "
+                 << hipGetErrorString(err);
+      return false;
+    }
+#else
     if (err != cudaSuccess) {
       LOG(ERROR) << "TopKOP failed as could not launch "
                     "cub::DeviceSegmentedRadixSort::SortPairs to calculate "

@@ -460,6 +496,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
                  << cudaGetErrorString(err);
       return false;
     }
+#endif
   }
   Tensor temp_storage;
   temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);

@@ -470,6 +507,17 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
         sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
         num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
         0, sizeof(T) * 8, cu_stream);
+#ifdef __HIPCC__
+    if (err != hipSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "hipcub::DeviceSegmentedRadixSort::SortPairsDescending to "
+                    "sort input, "
+                    "temp_storage_bytes: "
+                 << temp_storage_bytes << ", status: "
+                 << hipGetErrorString(err);
+      return false;
+    }
+#else
     if (err != cudaSuccess) {
       LOG(ERROR) << "TopKOP failed as could not launch "
                     "cub::DeviceSegmentedRadixSort::SortPairsDescending to "

@@ -479,12 +527,24 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
                  << ", status: " << cudaGetErrorString(err);
       return false;
     }
+#endif
   } else {
     auto err = cub::DeviceSegmentedRadixSort::SortPairs(
         temp_storage.data<uint8_t>(), temp_storage_bytes, input,
         sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
         num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
         0, sizeof(T) * 8, cu_stream);
+#ifdef __HIPCC__
+    if (err != hipSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "hipcub::DeviceSegmentedRadixSort::SortPairs to "
+                    "sort input, "
+                    "temp_storage_bytes: "
+                 << temp_storage_bytes << ", status: "
+                 << hipGetErrorString(err);
+      return false;
+    }
+#else
     if (err != cudaSuccess) {
       LOG(ERROR) << "TopKOP failed as could not launch "
                     "cub::DeviceSegmentedRadixSort::SortPairs to "

@@ -494,6 +554,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
                  << ", status: " << cudaGetErrorString(err);
       return false;
     }
+#endif
   }
   auto& dev = *ctx.eigen_device();
   if (k < num_cols) {
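The new rocprim::detail::radix_key_codec_base specialization plays the same role for hipCUB that the existing cub::NumericTraits specialization plays for CUB: it tells the radix sort to treat paddle::platform::float16 as a 16-bit pattern (uint16_t) with a floating-point ordering. A host-only sketch of why such a bit mapping works (ToOrderedBits is a hypothetical helper, assuming the IEEE half-precision layout):

#include <cassert>
#include <cstdint>

// Radix sort orders keys by their (transformed) bit patterns, so a custom
// float16 must expose a uint16_t mapping that is monotonic in float order.
// Standard float-to-radix trick: set the sign bit for positives, flip all
// bits for negatives.
static uint16_t ToOrderedBits(uint16_t half_bits) {
  return (half_bits & 0x8000) ? static_cast<uint16_t>(~half_bits)
                              : static_cast<uint16_t>(half_bits | 0x8000);
}

int main() {
  // 1.0 (0x3C00) must order above 0.5 (0x3800) and above -1.0 (0xBC00).
  assert(ToOrderedBits(0x3C00) > ToOrderedBits(0x3800));
  assert(ToOrderedBits(0x3C00) > ToOrderedBits(0xBC00));
  // -0.5 (0xB800) must order above -1.0 (0xBC00).
  assert(ToOrderedBits(0xB800) > ToOrderedBits(0xBC00));
  return 0;
}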
paddle/fluid/operators/top_k_op.cu

@@ -15,7 +15,12 @@ limitations under the License. */
 #pragma once
 #include <cstdio>
 #include <vector>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#endif
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/fluid/operators/top_k_op.h"
paddle/fluid/operators/trace_op.h

@@ -145,7 +145,7 @@ framework::Tensor Diagonal(const framework::ExecutionContext& context,
   int64_t pos = std::abs(offset) * offset_stride;
   int64_t dim_size = ret_strides.size();
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
   thrust::device_vector<int64_t> diag_vec(vectorize(dig_stride));
   const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data());
   thrust::device_vector<int64_t> ret_vec(ret_strides);

@@ -238,7 +238,7 @@ class TraceGradKernel : public framework::OpKernel<T> {
     int64_t diag_size = len2 < len1 ? len2 : len1;
     int64_t pos = std::abs(offset) * offset_stride;
     if (diag_size > 0) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       thrust::device_vector<int64_t> output_vec(vectorize(output_stride));
       const int64_t* output_arr = thrust::raw_pointer_cast(output_vec.data());
       thrust::device_vector<int64_t> input_vec(vectorize(input_stride));
paddle/fluid/operators/unique_op.cu

@@ -16,6 +16,7 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/scatter.h>
+#include <thrust/sequence.h>
 #include <thrust/unique.h>
 #include <iostream>
 #include <vector>
paddle/fluid/operators/unstack_op.h

@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/for_range.h"

-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/device_vector.h>
 #include "paddle/fluid/framework/array.h"
 #endif

@@ -103,7 +103,7 @@ class UnStackGradKernel : public framework::OpKernel<T> {
     for (auto i = 0; i < axis; ++i) pre *= dim[i];
     for (auto i = axis; i < dim.size(); ++i) post *= dim[i];

-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     int total_num = pre * n * post;
     auto &dev_ctx = ctx.template device_context<DeviceContext>();

@@ -156,14 +156,14 @@ class UnStackKernel : public framework::OpKernel<T> {
     int post = total_num / (n * pre);
     auto &dev_ctx = ctx.template device_context<DeviceContext>();
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     thrust::device_vector<T *> device_dx_vec(dx_datas);
     auto dx_data_arr = device_dx_vec.data().get();
 #else
     auto dx_data_arr = dx_datas.data();
 #endif
     StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     // Wait() must be called because device_dx_vec may be destructed before
     // kernel ends
     dev_ctx.Wait();
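The "Wait() must be called" comment in the hunk above captures a real lifetime hazard: thrust::device_vector frees its storage in its destructor, while kernel launches are asynchronous, so without a synchronization the pointer array could be freed while the kernel is still reading it. A hedged sketch of the shape of the hazard and fix (hypothetical kernel and names, not Paddle code):

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// Hypothetical kernel that reads through an array of device pointers.
__global__ void ReadThroughPtrs(int* const* ptrs, int n) {
  int i = threadIdx.x;
  if (i < n) (void)ptrs[i][0];
}

void Launch(const thrust::host_vector<int*>& host_ptrs) {
  thrust::device_vector<int*> device_ptrs(host_ptrs);  // device-side copy
  ReadThroughPtrs<<<1, 32>>>(thrust::raw_pointer_cast(device_ptrs.data()),
                             static_cast<int>(device_ptrs.size()));
  cudaDeviceSynchronize();  // analogous to dev_ctx.Wait(): without it,
                            // ~device_vector may free the array mid-kernel
}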
paddle/fluid/operators/warpctc_op.cc

@@ -16,6 +16,9 @@ limitations under the License. */
 #include <memory>

+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
paddle/fluid/operators/warpctc_op.h

@@ -159,6 +159,7 @@ class WarpCTCFunctor {
     warpctc_version_ = platform::dynload::get_warpctc_version();

     if (platform::is_gpu_place(ctx.GetPlace())) {
+// HIP not support ctcOptions in third-party warpctc
 #ifdef PADDLE_WITH_CUDA
       options_.loc = CTC_GPU;
       options_.stream = reinterpret_cast<const platform::CUDADeviceContext&>(
paddle/fluid/platform/cuda_helper.h

@@ -108,7 +108,11 @@ class CublasHandleHolder {
   }
 #endif

+#ifdef PADDLE_WITH_HIP
+  const rocblas_handle& GetCublasHandle() const { return handle_; }
+#else
   const cublasHandle_t& GetCublasHandle() const { return handle_; }
+#endif

   ~CublasHandleHolder() PADDLE_MAY_THROW {
 #ifdef PADDLE_WITH_HIP
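Note the pattern in this file and the three that follow: the accessor keeps one name (GetCublasHandle / cublas_handle) on both backends, and only the returned handle type switches between rocblas_handle and cublasHandle_t, so call sites stay source-compatible. A minimal standalone sketch of that shape (stub handle types so it compiles without either SDK; all names here are illustrative):

// Stub types standing in for cublasHandle_t / rocblas_handle.
struct CublasContextStub;
struct RocblasHandleStub;

#ifdef PADDLE_WITH_HIP
using BlasHandle = RocblasHandleStub*;  // ROCm spelling
#else
using BlasHandle = CublasContextStub*;  // CUDA spelling
#endif

class CublasHandleHolderSketch {
 public:
  // Same accessor name under both backends; only the type differs, so
  // callers written against GetCublasHandle() need no changes.
  const BlasHandle& GetCublasHandle() const { return handle_; }

 private:
  BlasHandle handle_ = nullptr;
};

int main() {
  CublasHandleHolderSketch holder;
  return holder.GetCublasHandle() == nullptr ? 0 : 1;
}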
paddle/fluid/platform/device_context.cc

@@ -459,9 +459,15 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
   return context()->CudnnHandle();
 }

+#ifdef PADDLE_WITH_HIP
+rocblas_handle CUDADeviceContext::cublas_handle() const {
+  return context()->CublasHandle()->GetCublasHandle();
+}
+#else
 cublasHandle_t CUDADeviceContext::cublas_handle() const {
   return context()->CublasHandle()->GetCublasHandle();
 }
+#endif

 CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
   return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
paddle/fluid/platform/device_context.h

@@ -409,8 +409,12 @@ class CUDADeviceContext : public DeviceContext {
   cudnnHandle_t cudnn_handle() const;
 #endif

-  /*! \brief  Return cublas handle in the device context. */
+  /*! \brief  Return cublas handle in the device context. */
+#ifdef PADDLE_WITH_HIP
+  rocblas_handle cublas_handle() const;
+#else
   cublasHandle_t cublas_handle() const;
+#endif

   /*! \brief  Return a cudnn workspace handle to call multiple cudnn
    *  functions without interrupting by other threads.
paddle/fluid/platform/device_context_test.cu

@@ -47,7 +47,11 @@ TEST(Device, CUDADeviceContext) {
   cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
 #endif
   ASSERT_NE(nullptr, cudnn_handle);
+#ifdef PADDLE_WITH_HIP
+  rocblas_handle cublas_handle = device_context->cublas_handle();
+#else
   cublasHandle_t cublas_handle = device_context->cublas_handle();
+#endif
   ASSERT_NE(nullptr, cublas_handle);
   delete device_context;
 }
paddle/fluid/platform/miopen_desc.h

@@ -37,9 +37,9 @@ namespace platform {
 using framework::Tensor;

 template <typename T>
-inline miopenDataType_t ToMIOpenDataType(const T& t) {
+inline miopenDataType_t ToCudnnDataType(const T& t) {
   auto type = framework::ToDataType(t);
-  return ToMIOpenDataType(type);
+  return ToCudnnDataType(type);
 }

 inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {

@@ -66,7 +66,7 @@ inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
 }

 template <>
-inline miopenDataType_t ToMIOpenDataType(
+inline miopenDataType_t ToCudnnDataType(
     const framework::proto::VarType::Type& t) {
   miopenDataType_t type = miopenFloat;
   switch (t) {

@@ -84,37 +84,54 @@ inline miopenDataType_t ToCudnnDataType(
 class ActivationDescriptor {
  public:
+  using T = miopenActivationDescriptor;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            dynload::miopenDestroyActivationDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
   ActivationDescriptor() {
+    T* raw_ptr;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::miopenCreateActivationDescriptor(&desc_));
-  }
-  ~ActivationDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::miopenDestroyActivationDescriptor(desc_));
+        dynload::miopenCreateActivationDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
   }
   template <typename T>
   void set(miopenActivationMode_t mode, const T& coef) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor(
-        desc_, mode, static_cast<double>(coef), 0.0, 0.0));
+        desc_.get(), mode, static_cast<double>(coef), 0.0, 0.0));
   }

-  miopenActivationDescriptor_t desc() { return desc_; }
-  miopenActivationDescriptor_t desc() const { return desc_; }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }

  private:
-  miopenActivationDescriptor_t desc_;
+  std::unique_ptr<T, Deleter> desc_;
 };

 class TensorDescriptor {
  public:
+  using T = miopenTensorDescriptor;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
   TensorDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
-  }
-  ~TensorDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
+    T* raw_ptr;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::miopenCreateTensorDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
   }
-  miopenTensorDescriptor_t desc() { return desc_; }
-  miopenTensorDescriptor_t desc() const { return desc_; }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
+
   void set(const Tensor& tensor, const int groups = 1) {
     auto dims = framework::vectorize<int>(tensor.dims());

@@ -128,7 +145,7 @@ class TensorDescriptor {
       dims_with_group[1] = dims_with_group[1] / groups;
     }
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
-        desc_, ToMIOpenDataType(tensor.type()),
+        (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()),
         static_cast<int>(dims_with_group.size()),
         const_cast<int*>(dims_with_group.data()),
         const_cast<int*>(strides.data())));

@@ -136,6 +153,9 @@ class TensorDescriptor {
   void set(const Tensor& tensor, const miopenTensorFormat_t format) {
     const int groups = 1;
+    PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
+                      platform::errors::InvalidArgument(
+                          "format should ONLY be NCHW in MIOPEN."));
     auto dims = framework::vectorize<int>(tensor.dims());
     std::vector<int> strides(dims.size());
     strides[dims.size() - 1] = 1;

@@ -147,26 +167,35 @@ class TensorDescriptor {
       dims_with_group[1] = dims_with_group[1] / groups;
     }
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
-        desc_, ToMIOpenDataType(tensor.type()),
+        (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()),
         static_cast<int>(dims_with_group.size()),
         const_cast<int*>(dims_with_group.data()),
         const_cast<int*>(strides.data())));
   }

  private:
-  miopenTensorDescriptor_t desc_;
+  std::unique_ptr<T, Deleter> desc_;
 };

 class FilterDescriptor {
  public:
+  using T = miopenTensorDescriptor;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
   FilterDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
-  }
-  ~FilterDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
+    T* raw_ptr;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::miopenCreateTensorDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
   }
-  miopenTensorDescriptor_t desc() { return desc_; }
-  miopenTensorDescriptor_t desc() const { return desc_; }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
+
   void set(const Tensor& tensor, const miopenTensorFormat_t format,
            const int groups = 1) {

@@ -176,45 +205,55 @@ class FilterDescriptor {
                       platform::errors::InvalidArgument(
                           "format should ONLY be NCHW in MIOPEN."));
     transformed_dims = dims;
-    if (groups > 1) {
-      transformed_dims[1] = transformed_dims[1] / groups;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
-        desc_, ToMIOpenDataType(tensor.type()),
-        static_cast<int>(transformed_dims.size()),
-        const_cast<int*>(transformed_dims.data()), nullptr));
+    // if (groups > 1) {
+    //   transformed_dims[1] = transformed_dims[1] / groups;
+    // }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet4dTensorDescriptor(
+        (miopenTensorDescriptor_t)desc_.get(), ToCudnnDataType(tensor.type()),
+        transformed_dims[0], transformed_dims[1], transformed_dims[2],
+        transformed_dims[3]));
   }

  private:
-  miopenTensorDescriptor_t desc_;
+  std::unique_ptr<T, Deleter> desc_;
 };

 class ConvolutionDescriptor {
  public:
+  using T = miopenConvolutionDescriptor;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            dynload::miopenDestroyConvolutionDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
   ConvolutionDescriptor() {
+    T* raw_ptr;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::miopenCreateConvolutionDescriptor(&desc_));
-  }
-  ~ConvolutionDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::miopenDestroyConvolutionDescriptor(desc_));
+        dynload::miopenCreateConvolutionDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
   }
-  miopenConvolutionDescriptor_t desc() { return desc_; }
-  miopenConvolutionDescriptor_t desc() const { return desc_; }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
+
   void set(miopenDataType_t dtype, const std::vector<int>& pads,
            const std::vector<int>& strides, const std::vector<int>& dilations,
            bool allow_tf32, const int groups = 1) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor(
-        desc_, static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
+        (miopenConvolutionDescriptor_t)desc_.get(),
+        static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
         const_cast<int*>(strides.data()), const_cast<int*>(dilations.data()),
         miopenConvolution));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::miopenSetConvolutionGroupCount(desc_, groups));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::miopenSetConvolutionGroupCount(
+            (miopenConvolutionDescriptor_t)desc_.get(), groups));
   }

  private:
-  miopenConvolutionDescriptor_t desc_;
+  std::unique_ptr<T, Deleter> desc_;
 };

 }  // namespace platform
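The bulk of this file's change converts each descriptor wrapper from a raw handle plus a hand-written destructor to a std::unique_ptr with a custom Deleter; destruction then runs on every exit path and the wrappers become safely movable. A standalone sketch of the same RAII shape (a fake C-style API stands in for the dynload::miopenCreate*/Destroy* calls):

#include <memory>

// Fake C-style descriptor API standing in for the miopen dynload calls.
struct FakeDesc { int id; };
int FakeCreate(FakeDesc** d) { *d = new FakeDesc{42}; return 0; }
int FakeDestroy(FakeDesc* d) { delete d; return 0; }

class DescriptorWrapper {
 public:
  using T = FakeDesc;
  struct Deleter {
    void operator()(T* t) {
      if (t != nullptr) FakeDestroy(t);  // cleanup runs exactly once
    }
  };
  DescriptorWrapper() {
    T* raw_ptr;
    FakeCreate(&raw_ptr);
    desc_.reset(raw_ptr);  // ownership handed to the smart pointer
  }
  T* desc() const { return desc_.get(); }

 private:
  std::unique_ptr<T, Deleter> desc_;
};

int main() {
  DescriptorWrapper w;  // destroyed automatically, even on early returns
  return w.desc()->id == 42 ? 0 : 1;
}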
paddle/fluid/platform/miopen_helper.h

@@ -43,23 +43,6 @@ typedef enum {
   MIOPEN_TENSOR_NHWC = 1,
 } miopenTensorFormat_t;

-// MIOPEN do not support indirect function call defined in cudnnWorkspaceHandle
-struct miopenWorkspace {
-  explicit miopenWorkspace(size_t size) : size(size), data(NULL) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&data, size));
-  }
-  miopenWorkspace(const miopenWorkspace&) = delete;
-  miopenWorkspace(miopenWorkspace&&) = default;
-  miopenWorkspace& operator=(miopenWorkspace&&) = default;
-  ~miopenWorkspace() {
-    if (data) {
-      hipFree(data);
-    }
-  }
-  size_t size;
-  void* data;
-};
-
 inline const char* miopenGetErrorString(miopenStatus_t status) {
   switch (status) {
     case miopenStatusSuccess:
paddle/fluid/pybind/imperative.cc

@@ -984,7 +984,7 @@ void BindImperative(py::module *m_ptr) {
             PADDLE_THROW(platform::errors::Unimplemented(
                 "Imperative allreduce is not supported when paddle is "
                 "not compiled with NCCL."));
-#endif  // PADDLE_WITH_NCCL
+#endif  // PADDLE_WITH_NCCL or PADDLE_WITH_RCCL
           }
         },
         py::call_guard<py::gil_scoped_release>())

@@ -1435,7 +1435,7 @@ void BindImperative(py::module *m_ptr) {
       py::call_guard<py::gil_scoped_release>());
 #endif

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   py::class_<imperative::NCCLParallelContext, imperative::ParallelContext,
              std::shared_ptr<imperative::NCCLParallelContext>>(
       m, "NCCLParallelContext")
paddle/fluid/pybind/pybind.cc

@@ -1125,7 +1125,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("get_fetch_list",
            [](Variable &self) { return self.GetMutable<FetchList>(); },
            py::return_value_policy::reference)
-#if (defined(PADDLE_WITH_NCCL))
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       .def("get_communicator",
            [](Variable &self) -> platform::Communicator * {
              return self.GetMutable<platform::Communicator>();