Commit 4d647ec1 (unverified)

[ROCM] update fluid platform for rocm (part5), test=develop (#31315)

Authored Mar 04, 2021 by Qi Li; committed via GitHub on Mar 04, 2021.
Parent: 522c91ec
Showing 19 changed files with 207 additions and 78 deletions (+207 −78).
paddle/fluid/operators/split_lod_tensor_op.cc               +1   -1
paddle/fluid/operators/sync_batch_norm_op.cu                +11  -0
paddle/fluid/operators/sync_batch_norm_op.cu.h              +10  -3
paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h  +2   -2
paddle/fluid/operators/top_k_function_cuda.h                +61  -0
paddle/fluid/operators/top_k_op.cu                          +5   -0
paddle/fluid/operators/trace_op.h                           +2   -2
paddle/fluid/operators/unique_op.cu                         +1   -0
paddle/fluid/operators/unstack_op.h                         +4   -4
paddle/fluid/operators/warpctc_op.cc                        +3   -0
paddle/fluid/operators/warpctc_op.h                         +1   -0
paddle/fluid/platform/cuda_helper.h                         +4   -0
paddle/fluid/platform/device_context.cc                     +6   -0
paddle/fluid/platform/device_context.h                      +5   -1
paddle/fluid/platform/device_context_test.cu                +4   -0
paddle/fluid/platform/miopen_desc.h                         +84  -45
paddle/fluid/platform/miopen_helper.h                       +0   -17
paddle/fluid/pybind/imperative.cc                           +2   -2
paddle/fluid/pybind/pybind.cc                               +1   -1
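The change is almost entirely mechanical, and the same few guard rewrites repeat across all 19 files, so it is worth stating the pattern once before the per-file diffs. A distilled sketch (the declarations are illustrative, not code from the patch):

```cpp
// Guard families touched by this commit:
//   PADDLE_WITH_CUDA / PADDLE_WITH_HIP   -- build-level backend flags
//   __NVCC__ / __HIPCC__                 -- defined by the device compiler
//   PADDLE_WITH_NCCL / PADDLE_WITH_RCCL  -- collective-communication backends

// Before: the block disappears entirely from a ROCm build.
#ifdef PADDLE_WITH_CUDA
void gpu_only_path();
#endif

// After: CUDA and ROCm builds share one code path; HIP libraries (MIOpen,
// rocBLAS, RCCL, hipcub) stand in for their CUDA counterparts behind the
// same interfaces.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void gpu_shared_path();
#endif
```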
paddle/fluid/operators/split_lod_tensor_op.cc

```diff
@@ -65,7 +65,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
     if (platform::is_cpu_place(mask.place())) {
       cpu_mask->ShareDataWith(mask);
     } else if (platform::is_gpu_place(mask.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
                             cpu_mask.get());
 #else
```
paddle/fluid/operators/sync_batch_norm_op.cu

```diff
@@ -91,6 +91,16 @@ class SyncBatchNormGradKernel<platform::CUDADeviceContext, T>
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+REGISTER_OP_CUDA_KERNEL(
+    sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
+    ops::SyncBatchNormKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    sync_batch_norm_grad,
+    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
+    ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
+#else
 REGISTER_OP_CUDA_KERNEL(
     sync_batch_norm, ops::SyncBatchNormKernel<plat::CUDADeviceContext, float>,
     ops::SyncBatchNormKernel<plat::CUDADeviceContext, double>,
@@ -100,5 +110,6 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, float>,
     ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, double>,
     ops::SyncBatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
+#endif
 // clang-format on
```
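The HIP branch registers only the float and float16 instantiations because MIOpen has no double-precision batch-norm path. A sketch of the same split with an ordinary function (the registration helper below is hypothetical, not Paddle's API):

```cpp
#include <cstdio>

// Hypothetical stand-in for per-type kernel registration, to show the shape
// of the split above: under HIP the double instantiation is simply omitted.
template <typename T>
void RegisterSyncBatchNorm(const char* type_name) {
  std::printf("registered sync_batch_norm<%s>\n", type_name);
}

void RegisterAllSyncBatchNorm() {
  RegisterSyncBatchNorm<float>("float");
  RegisterSyncBatchNorm<short>("float16");  // float16 stand-in type
#ifndef PADDLE_WITH_HIP
  RegisterSyncBatchNorm<double>("double");  // MIOpen has no double support
#endif
}
```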
paddle/fluid/operators/sync_batch_norm_op.cu.h

```diff
@@ -19,12 +19,19 @@ limitations under the License. */
 #include <cmath>
 #include <string>
 #include <vector>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/operators/norm_utils.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/nccl_helper.h"
@@ -186,7 +193,7 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx,
   auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0);
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   auto *comm = dev_ctx.nccl_comm();
   if (comm) {
     int dtype = platform::ToNCCLDataType(mean_out->type());
@@ -460,7 +467,7 @@ void SyncBatchNormGradFunctor(
         dy_d, x_d, saved_mean, N, fsize, C, stats);
   }
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   auto *comm = dev_ctx.nccl_comm();
   if (comm) {
     int dtype = platform::ToNCCLDataType(scale->type());
```
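The `namespace cub = hipcub;` alias is what lets the rest of this header keep calling `cub::...` unchanged on ROCm, since hipcub deliberately mirrors cub's interface. A minimal sketch of the trick (allocation and error handling pared down; assumes the cub/hipcub headers are available in each toolchain):

```cpp
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;   // cub-style call sites now resolve to hipcub
#define gpuMalloc hipMalloc
#define gpuFree hipFree
#else
#include "cub/cub.cuh"
#define gpuMalloc cudaMalloc
#define gpuFree cudaFree
#endif

// A cub-style reduction that compiles unchanged under nvcc and hipcc.
template <typename T>
void SumOnDevice(const T* d_in, T* d_out, int num_items) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // First call only computes the required scratch size.
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
                         num_items);
  gpuMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
                         num_items);
  gpuFree(d_temp_storage);
}
```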
paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h

```diff
@@ -91,7 +91,7 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
   int64_t limit = x.numel();
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
   if (platform::is_gpu_place(place)) {
     auto &cuda_dev_ctx = dynamic_cast<platform::CUDADeviceContext &>(dev_ctx);
     functor(cuda_dev_ctx, &x, out, &ddx, &ddout, dout, dx);
@@ -105,7 +105,7 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim,
     platform::ForRange<platform::CPUDeviceContext> for_range(cpu_dev_ctx,
                                                              limit);
     for_range(actual_functor);
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
   }
 #endif
```
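This header uses `__NVCC__`/`__HIPCC__` rather than the `PADDLE_WITH_*` flags for a reason worth noting: the build flags are defined for every translation unit, while the compiler macros hold only when the file is processed by the device compiler. A shared header can therefore expose a device path only where it is actually compilable (a sketch):

```cpp
// PADDLE_WITH_CUDA / PADDLE_WITH_HIP: build-wide configuration flags.
// __NVCC__ / __HIPCC__: defined by nvcc/hipcc for the current translation
// unit only, so a header included from both .cc and .cu files can gate
// device-only constructs on them (illustrative macro, not Paddle's):
#if defined(__NVCC__) || defined(__HIPCC__)
#define GPU_FUNCTOR_AVAILABLE 1  // kernels, thrust, device-context casts OK
#else
#define GPU_FUNCTOR_AVAILABLE 0  // same header, plain host C++ compiler
#endif
```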
paddle/fluid/operators/top_k_function_cuda.h

```diff
@@ -16,11 +16,26 @@ limitations under the License. */
 #include <stdio.h>
 #include <cstdio>
 #include <vector>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#endif
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
+#ifdef __HIPCC__
+namespace rocprim {
+namespace detail {
+template <>
+struct radix_key_codec_base<paddle::platform::float16>
+    : radix_key_codec_integral<paddle::platform::float16, uint16_t> {};
+}  // namespace detail
+}  // namespace rocprim
+namespace cub = hipcub;
+#else
 // set cub base traits in order to handle float16
 namespace cub {
 template <>
@@ -28,6 +43,7 @@ struct NumericTraits<paddle::platform::float16>
     : BaseTraits<FLOATING_POINT, true, false, uint16_t,
                  paddle::platform::float16> {};
 }  // namespace cub
+#endif
 namespace paddle {
 namespace operators {
@@ -439,6 +455,16 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
         input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
         num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
         cu_stream);
+#ifdef __HIPCC__
+    if (err != hipSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "hipcub::DeviceSegmentedRadixSort::SortPairsDescending to "
+                    "calculate "
+                    "temp_storage_bytes, status: "
+                 << hipGetErrorString(err);
+      return false;
+    }
+#else
     if (err != cudaSuccess) {
       LOG(ERROR) << "TopKOP failed as could not launch "
@@ -447,12 +473,22 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
                  << cudaGetErrorString(err);
       return false;
     }
+#endif
   } else {
     auto err = cub::DeviceSegmentedRadixSort::SortPairs(
         nullptr, temp_storage_bytes, input, sorted_values_ptr,
         input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
         num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
         cu_stream);
+#ifdef __HIPCC__
+    if (err != hipSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "hipcub::DeviceSegmentedRadixSort::SortPairs to calculate "
+                    "temp_storage_bytes, status: "
+                 << hipGetErrorString(err);
+      return false;
+    }
+#else
     if (err != cudaSuccess) {
       LOG(ERROR) << "TopKOP failed as could not launch "
                     "cub::DeviceSegmentedRadixSort::SortPairs to calculate "
@@ -460,6 +496,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
                  << cudaGetErrorString(err);
       return false;
     }
+#endif
   }
   Tensor temp_storage;
   temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
@@ -470,6 +507,17 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
         sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
         num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
         0, sizeof(T) * 8, cu_stream);
+#ifdef __HIPCC__
+    if (err != hipSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "hipcub::DeviceSegmentedRadixSort::SortPairsDescending to "
+                    "sort input, "
+                    "temp_storage_bytes: "
+                 << temp_storage_bytes
+                 << ", status: " << hipGetErrorString(err);
+      return false;
+    }
+#else
     if (err != cudaSuccess) {
       LOG(ERROR) << "TopKOP failed as could not launch "
                     "cub::DeviceSegmentedRadixSort::SortPairsDescending to "
@@ -479,12 +527,24 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
                  << ", status: " << cudaGetErrorString(err);
       return false;
     }
+#endif
   } else {
     auto err = cub::DeviceSegmentedRadixSort::SortPairs(
         temp_storage.data<uint8_t>(), temp_storage_bytes, input,
         sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
         num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
         0, sizeof(T) * 8, cu_stream);
+#ifdef __HIPCC__
+    if (err != hipSuccess) {
+      LOG(ERROR) << "TopKOP failed as could not launch "
+                    "hipcub::DeviceSegmentedRadixSort::SortPairs to "
+                    "sort input, "
+                    "temp_storage_bytes: "
+                 << temp_storage_bytes
+                 << ", status: " << hipGetErrorString(err);
+      return false;
+    }
+#else
     if (err != cudaSuccess) {
       LOG(ERROR) << "TopKOP failed as could not launch "
                     "cub::DeviceSegmentedRadixSort::SortPairs to "
@@ -494,6 +554,7 @@ bool SortTopk(const platform::CUDADeviceContext& ctx,
                  << ", status: " << cudaGetErrorString(err);
       return false;
     }
+#endif
   }
   auto& dev = *ctx.eigen_device();
   if (k < num_cols) {
```
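Two notes on this file. First, rocprim's radix sort (which backs hipcub) needs a key codec for non-native key types, so the patch specializes `radix_key_codec_base` to sort `float16` through its `uint16_t` bit pattern, the HIP-side analogue of the cub `NumericTraits` specialization. Second, `SortTopk` now carries four nearly identical `#ifdef __HIPCC__` error blocks; one common way to avoid that duplication is a one-time alias shim. A sketch (the `gpu*` names here are illustrative, not Paddle's API):

```cpp
// Define the error type, success constant, and error-string function once,
// so call sites need no per-backend branches.
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
using gpuError_t = hipError_t;
constexpr gpuError_t gpuSuccess = hipSuccess;
inline const char* gpuGetErrorString(gpuError_t e) {
  return hipGetErrorString(e);
}
#else
#include <cuda_runtime.h>
using gpuError_t = cudaError_t;
constexpr gpuError_t gpuSuccess = cudaSuccess;
inline const char* gpuGetErrorString(gpuError_t e) {
  return cudaGetErrorString(e);
}
#endif

// Call sites then collapse to:
//   if (err != gpuSuccess) LOG(ERROR) << gpuGetErrorString(err);
```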
paddle/fluid/operators/top_k_op.cu

```diff
@@ -15,7 +15,12 @@ limitations under the License. */
 #pragma once
 #include <cstdio>
 #include <vector>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#endif
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/top_k_function_cuda.h"
 #include "paddle/fluid/operators/top_k_op.h"
```
paddle/fluid/operators/trace_op.h

```diff
@@ -145,7 +145,7 @@ framework::Tensor Diagonal(const framework::ExecutionContext& context,
   int64_t pos = std::abs(offset) * offset_stride;
   int64_t dim_size = ret_strides.size();
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
   thrust::device_vector<int64_t> diag_vec(vectorize(dig_stride));
   const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data());
   thrust::device_vector<int64_t> ret_vec(ret_strides);
@@ -238,7 +238,7 @@ class TraceGradKernel : public framework::OpKernel<T> {
     int64_t diag_size = len2 < len1 ? len2 : len1;
     int64_t pos = std::abs(offset) * offset_stride;
     if (diag_size > 0) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       thrust::device_vector<int64_t> output_vec(vectorize(output_stride));
       const int64_t* output_arr = thrust::raw_pointer_cast(output_vec.data());
       thrust::device_vector<int64_t> input_vec(vectorize(input_stride));
```
paddle/fluid/operators/unique_op.cu

```diff
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/scatter.h>
+#include <thrust/sequence.h>
 #include <thrust/unique.h>
 #include <iostream>
 #include <vector>
```
paddle/fluid/operators/unstack_op.h

```diff
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/for_range.h"
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/device_vector.h>
 #include "paddle/fluid/framework/array.h"
 #endif
@@ -103,7 +103,7 @@ class UnStackGradKernel : public framework::OpKernel<T> {
     for (auto i = 0; i < axis; ++i) pre *= dim[i];
     for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     int total_num = pre * n * post;
     auto &dev_ctx = ctx.template device_context<DeviceContext>();
@@ -156,14 +156,14 @@ class UnStackKernel : public framework::OpKernel<T> {
     int post = total_num / (n * pre);
     auto &dev_ctx = ctx.template device_context<DeviceContext>();
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     thrust::device_vector<T *> device_dx_vec(dx_datas);
     auto dx_data_arr = device_dx_vec.data().get();
 #else
     auto dx_data_arr = dx_datas.data();
 #endif
     StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     // Wait() must be called because device_dx_vec may be destructed before
     // kernel ends
     dev_ctx.Wait();
```
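The preserved comment about `Wait()` points at a real hazard: kernel launches are asynchronous, and a `thrust::device_vector` frees its storage in its destructor. A distilled CUDA sketch of the bug being avoided (kernel and names are illustrative):

```cpp
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <vector>

// Illustrative kernel: scales each pointed-to value in place.
__global__ void ScaleAll(float** ptrs, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) *ptrs[i] *= 2.0f;
}

void LaunchScale(const std::vector<float*>& host_ptrs) {
  thrust::device_vector<float*> dev_ptrs(host_ptrs);  // device-side copy
  int n = static_cast<int>(dev_ptrs.size());
  ScaleAll<<<(n + 255) / 256, 256>>>(dev_ptrs.data().get(), n);
  // Without this, dev_ptrs is freed at scope exit while the kernel may still
  // be reading its storage; dev_ctx.Wait() plays the same role above.
  cudaDeviceSynchronize();
}
```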
paddle/fluid/operators/warpctc_op.cc

```diff
@@ -16,6 +16,9 @@ limitations under the License. */
 #include <memory>
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
```
paddle/fluid/operators/warpctc_op.h

```diff
@@ -159,6 +159,7 @@ class WarpCTCFunctor {
     warpctc_version_ = platform::dynload::get_warpctc_version();
     if (platform::is_gpu_place(ctx.GetPlace())) {
+// HIP not support ctcOptions in third-party warpctc
 #ifdef PADDLE_WITH_CUDA
       options_.loc = CTC_GPU;
       options_.stream = reinterpret_cast<const platform::CUDADeviceContext&>(
```
paddle/fluid/platform/cuda_helper.h

```diff
@@ -108,7 +108,11 @@ class CublasHandleHolder {
   }
 #endif
+#ifdef PADDLE_WITH_HIP
+  const rocblas_handle& GetCublasHandle() const { return handle_; }
+#else
   const cublasHandle_t& GetCublasHandle() const { return handle_; }
+#endif
   ~CublasHandleHolder() PADDLE_MAY_THROW {
 #ifdef PADDLE_WITH_HIP
```
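On ROCm the BLAS handle is a `rocblas_handle` rather than a `cublasHandle_t`, so the accessor's return type must follow the backend; this same split recurs in the next three files. An equivalent formulation (a design-alternative sketch, not what the patch does) aliases the handle type once so each declaration is written a single time:

```cpp
#ifdef PADDLE_WITH_HIP
#include <rocblas.h>     // header path varies across ROCm versions
using gpuBlasHandle_t = rocblas_handle;
#else
#include <cublas_v2.h>
using gpuBlasHandle_t = cublasHandle_t;
#endif

class CublasHandleHolderSketch {
 public:
  // One declaration serves both backends.
  const gpuBlasHandle_t& GetCublasHandle() const { return handle_; }

 private:
  gpuBlasHandle_t handle_{};
};
```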
paddle/fluid/platform/device_context.cc

```diff
@@ -459,9 +459,15 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
   return context()->CudnnHandle();
 }
+#ifdef PADDLE_WITH_HIP
+rocblas_handle CUDADeviceContext::cublas_handle() const {
+  return context()->CublasHandle()->GetCublasHandle();
+}
+#else
 cublasHandle_t CUDADeviceContext::cublas_handle() const {
   return context()->CublasHandle()->GetCublasHandle();
 }
+#endif
 CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
   return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
```
paddle/fluid/platform/device_context.h

```diff
@@ -409,8 +409,12 @@ class CUDADeviceContext : public DeviceContext {
   cudnnHandle_t cudnn_handle() const;
 #endif
   /*! \brief Return cublas handle in the device context. */
+#ifdef PADDLE_WITH_HIP
+  rocblas_handle cublas_handle() const;
+#else
   cublasHandle_t cublas_handle() const;
+#endif
   /*! \brief Return a cudnn workspace handle to call multiple cudnn
    *  functions without interrupting by other threads.
```
paddle/fluid/platform/device_context_test.cu

```diff
@@ -47,7 +47,11 @@ TEST(Device, CUDADeviceContext) {
   cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
 #endif
   ASSERT_NE(nullptr, cudnn_handle);
+#ifdef PADDLE_WITH_HIP
+  rocblas_handle cublas_handle = device_context->cublas_handle();
+#else
   cublasHandle_t cublas_handle = device_context->cublas_handle();
+#endif
   ASSERT_NE(nullptr, cublas_handle);
   delete device_context;
 }
```
paddle/fluid/platform/miopen_desc.h

```diff
@@ -37,9 +37,9 @@ namespace platform {
 using framework::Tensor;
 template <typename T>
-inline miopenDataType_t ToMIOpenDataType(const T& t) {
+inline miopenDataType_t ToCudnnDataType(const T& t) {
   auto type = framework::ToDataType(t);
-  return ToMIOpenDataType(type);
+  return ToCudnnDataType(type);
 }
 inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
@@ -66,7 +66,7 @@ inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
 }
 template <>
-inline miopenDataType_t ToMIOpenDataType(
+inline miopenDataType_t ToCudnnDataType(
     const framework::proto::VarType::Type& t) {
   miopenDataType_t type = miopenFloat;
   switch (t) {
@@ -84,37 +84,54 @@ inline miopenDataType_t ToMIOpenDataType(
 class ActivationDescriptor {
  public:
+  using T = miopenActivationDescriptor;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            dynload::miopenDestroyActivationDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
   ActivationDescriptor() {
+    T* raw_ptr;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::miopenCreateActivationDescriptor(&desc_));
-  }
-  ~ActivationDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::miopenDestroyActivationDescriptor(desc_));
+        dynload::miopenCreateActivationDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
   }
   template <typename T>
   void set(miopenActivationMode_t mode, const T& coef) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor(
-        desc_, mode, static_cast<double>(coef), 0.0, 0.0));
+        desc_.get(), mode, static_cast<double>(coef), 0.0, 0.0));
   }
-  miopenActivationDescriptor_t desc() { return desc_; }
-  miopenActivationDescriptor_t desc() const { return desc_; }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
  private:
-  miopenActivationDescriptor_t desc_;
+  std::unique_ptr<T, Deleter> desc_;
 };
 class TensorDescriptor {
  public:
+  using T = miopenTensorDescriptor;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
   TensorDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
-  }
-  ~TensorDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
+    T* raw_ptr;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::miopenCreateTensorDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
   }
-  miopenTensorDescriptor_t desc() { return desc_; }
-  miopenTensorDescriptor_t desc() const { return desc_; }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
   void set(const Tensor& tensor, const int groups = 1) {
     auto dims = framework::vectorize<int>(tensor.dims());
@@ -128,7 +145,7 @@ class TensorDescriptor {
       dims_with_group[1] = dims_with_group[1] / groups;
     }
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
-        desc_, ToMIOpenDataType(tensor.type()),
+        (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()),
         static_cast<int>(dims_with_group.size()),
         const_cast<int*>(dims_with_group.data()),
         const_cast<int*>(strides.data())));
@@ -136,6 +153,9 @@ class TensorDescriptor {
   void set(const Tensor& tensor, const miopenTensorFormat_t format) {
     const int groups = 1;
+    PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
+                      platform::errors::InvalidArgument(
+                          "format should ONLY be NCHW in MIOPEN."));
     auto dims = framework::vectorize<int>(tensor.dims());
     std::vector<int> strides(dims.size());
     strides[dims.size() - 1] = 1;
@@ -147,26 +167,35 @@ class TensorDescriptor {
       dims_with_group[1] = dims_with_group[1] / groups;
     }
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
-        desc_, ToMIOpenDataType(tensor.type()),
+        (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()),
         static_cast<int>(dims_with_group.size()),
         const_cast<int*>(dims_with_group.data()),
         const_cast<int*>(strides.data())));
   }
  private:
-  miopenTensorDescriptor_t desc_;
+  std::unique_ptr<T, Deleter> desc_;
 };
 class FilterDescriptor {
  public:
+  using T = miopenTensorDescriptor;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
   FilterDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
-  }
-  ~FilterDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
+    T* raw_ptr;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::miopenCreateTensorDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
   }
-  miopenTensorDescriptor_t desc() { return desc_; }
-  miopenTensorDescriptor_t desc() const { return desc_; }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
   void set(const Tensor& tensor, const miopenTensorFormat_t format,
            const int groups = 1) {
@@ -176,45 +205,55 @@ class FilterDescriptor {
                       platform::errors::InvalidArgument(
                           "format should ONLY be NCHW in MIOPEN."));
     transformed_dims = dims;
-    if (groups > 1) {
-      transformed_dims[1] = transformed_dims[1] / groups;
-    }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
-        desc_, ToMIOpenDataType(tensor.type()),
-        static_cast<int>(transformed_dims.size()),
-        const_cast<int*>(transformed_dims.data()), nullptr));
+    // if (groups > 1) {
+    //   transformed_dims[1] = transformed_dims[1] / groups;
+    // }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet4dTensorDescriptor(
+        (miopenTensorDescriptor_t)desc_.get(), ToCudnnDataType(tensor.type()),
+        transformed_dims[0], transformed_dims[1], transformed_dims[2],
+        transformed_dims[3]));
   }
  private:
-  miopenTensorDescriptor_t desc_;
+  std::unique_ptr<T, Deleter> desc_;
 };
 class ConvolutionDescriptor {
  public:
+  using T = miopenConvolutionDescriptor;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            dynload::miopenDestroyConvolutionDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
   ConvolutionDescriptor() {
+    T* raw_ptr;
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::miopenCreateConvolutionDescriptor(&desc_));
-  }
-  ~ConvolutionDescriptor() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::miopenDestroyConvolutionDescriptor(desc_));
+        dynload::miopenCreateConvolutionDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
   }
-  miopenConvolutionDescriptor_t desc() { return desc_; }
-  miopenConvolutionDescriptor_t desc() const { return desc_; }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
   void set(miopenDataType_t dtype, const std::vector<int>& pads,
            const std::vector<int>& strides, const std::vector<int>& dilations,
            bool allow_tf32, const int groups = 1) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor(
-        desc_, static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
+        (miopenConvolutionDescriptor_t)desc_.get(),
+        static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
         const_cast<int*>(strides.data()), const_cast<int*>(dilations.data()),
         miopenConvolution));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::miopenSetConvolutionGroupCount(desc_, groups));
+        platform::dynload::miopenSetConvolutionGroupCount(
+            (miopenConvolutionDescriptor_t)desc_.get(), groups));
   }
  private:
-  miopenConvolutionDescriptor_t desc_;
+  std::unique_ptr<T, Deleter> desc_;
 };
 }  // namespace platform
```
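The descriptor classes above all move from a raw handle plus a hand-written destructor to `std::unique_ptr` with a custom deleter, which makes them move-only and guarantees the destroy call runs exactly once even on early exits. A self-contained sketch of the pattern with a stand-in C-style API (the `Foo*` names are illustrative):

```cpp
#include <memory>

struct FooDesc { int id; };                        // stand-in handle type
inline int fooCreate(FooDesc** d) { *d = new FooDesc{0}; return 0; }
inline int fooDestroy(FooDesc* d) { delete d; return 0; }

class FooDescriptor {
 public:
  using T = FooDesc;
  struct Deleter {
    void operator()(T* t) const {
      if (t != nullptr) fooDestroy(t);             // checked teardown
    }
  };
  FooDescriptor() {
    T* raw_ptr = nullptr;
    fooCreate(&raw_ptr);                           // acquire the handle
    desc_.reset(raw_ptr);                          // hand ownership to RAII
  }
  T* desc() const { return desc_.get(); }

 private:
  std::unique_ptr<T, Deleter> desc_;               // no ~FooDescriptor() needed
};
```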
paddle/fluid/platform/miopen_helper.h

```diff
@@ -43,23 +43,6 @@ typedef enum {
   MIOPEN_TENSOR_NHWC = 1,
 } miopenTensorFormat_t;
-// MIOPEN do not support indirect function call defined in cudnnWorkspaceHandle
-struct miopenWorkspace {
-  explicit miopenWorkspace(size_t size) : size(size), data(NULL) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&data, size));
-  }
-  miopenWorkspace(const miopenWorkspace&) = delete;
-  miopenWorkspace(miopenWorkspace&&) = default;
-  miopenWorkspace& operator=(miopenWorkspace&&) = default;
-  ~miopenWorkspace() {
-    if (data) {
-      hipFree(data);
-    }
-  }
-  size_t size;
-  void* data;
-};
 inline const char* miopenGetErrorString(miopenStatus_t status) {
   switch (status) {
     case miopenStatusSuccess:
```
paddle/fluid/pybind/imperative.cc

```diff
@@ -984,7 +984,7 @@ void BindImperative(py::module *m_ptr) {
               PADDLE_THROW(platform::errors::Unimplemented(
                   "Imperative allreduce is not supported when paddle is "
                   "not compiled with NCCL."));
-#endif  // PADDLE_WITH_NCCL
+#endif  // PADDLE_WITH_NCCL or PADDLE_WITH_RCCL
             }
           },
           py::call_guard<py::gil_scoped_release>())
@@ -1435,7 +1435,7 @@ void BindImperative(py::module *m_ptr) {
       py::call_guard<py::gil_scoped_release>());
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   py::class_<imperative::NCCLParallelContext, imperative::ParallelContext,
              std::shared_ptr<imperative::NCCLParallelContext>>(
       m, "NCCLParallelContext")
```
paddle/fluid/pybind/pybind.cc

```diff
@@ -1125,7 +1125,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("get_fetch_list",
            [](Variable &self) { return self.GetMutable<FetchList>(); },
           py::return_value_policy::reference)
-#if (defined(PADDLE_WITH_NCCL))
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       .def("get_communicator",
            [](Variable &self) -> platform::Communicator * {
              return self.GetMutable<platform::Communicator>();
```