Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
1d5746bd
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
1d5746bd
编写于
10月 19, 2021
作者:
X
Xiaoxu Chen
提交者:
GitHub
10月 19, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add rocm support for fft api (#36415)
上级
77f4597f
变更
10
显示空白变更内容
内联
并排
Showing
10 changed file
with
679 addition
and
380 deletion
+679
-380
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-2
paddle/fluid/operators/spectral_helper.h
paddle/fluid/operators/spectral_helper.h
+261
-0
paddle/fluid/operators/spectral_op.cu
paddle/fluid/operators/spectral_op.cu
+237
-377
paddle/fluid/platform/dynload/CMakeLists.txt
paddle/fluid/platform/dynload/CMakeLists.txt
+1
-1
paddle/fluid/platform/dynload/dynamic_loader.cc
paddle/fluid/platform/dynload/dynamic_loader.cc
+10
-0
paddle/fluid/platform/dynload/dynamic_loader.h
paddle/fluid/platform/dynload/dynamic_loader.h
+1
-0
paddle/fluid/platform/dynload/hipfft.cc
paddle/fluid/platform/dynload/hipfft.cc
+30
-0
paddle/fluid/platform/dynload/hipfft.h
paddle/fluid/platform/dynload/hipfft.h
+124
-0
paddle/fluid/platform/enforce.h
paddle/fluid/platform/enforce.h
+10
-0
paddle/fluid/platform/enforce_test.cc
paddle/fluid/platform/enforce_test.cc
+4
-0
未找到文件。
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
1d5746bd
...
...
@@ -102,8 +102,7 @@ else()
op_library
(
warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale
)
endif
()
if
(
WITH_GPU
AND
(
NOT WITH_ROCM
))
if
(
WITH_GPU OR WITH_ROCM
)
if
(
MKL_FOUND AND WITH_ONEMKL
)
op_library
(
spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt
${
OP_HEADER_DEPS
}
)
target_include_directories
(
spectral_op PRIVATE
${
MKL_INCLUDE
}
)
...
...
paddle/fluid/operators/spectral_helper.h
0 → 100644
浏览文件 @
1d5746bd
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/spectral_op.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cufft.h"
#endif
namespace
paddle
{
namespace
operators
{
using
ScalarType
=
framework
::
proto
::
VarType
::
Type
;
const
int64_t
kMaxCUFFTNdim
=
3
;
const
int64_t
kMaxDataNdim
=
kMaxCUFFTNdim
+
1
;
// This struct is used to easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
struct
PlanKey
{
// between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3
int64_t
signal_ndim_
;
// These include additional batch dimension as well.
int64_t
sizes_
[
kMaxDataNdim
];
int64_t
input_shape_
[
kMaxDataNdim
];
int64_t
output_shape_
[
kMaxDataNdim
];
FFTTransformType
fft_type_
;
ScalarType
value_type_
;
PlanKey
()
=
default
;
PlanKey
(
const
std
::
vector
<
int64_t
>&
in_shape
,
const
std
::
vector
<
int64_t
>&
out_shape
,
const
std
::
vector
<
int64_t
>&
signal_size
,
FFTTransformType
fft_type
,
ScalarType
value_type
)
{
// Padding bits must be zeroed for hashing
memset
(
this
,
0
,
sizeof
(
*
this
));
signal_ndim_
=
signal_size
.
size
()
-
1
;
fft_type_
=
fft_type
;
value_type_
=
value_type
;
std
::
copy
(
signal_size
.
cbegin
(),
signal_size
.
cend
(),
sizes_
);
std
::
copy
(
in_shape
.
cbegin
(),
in_shape
.
cend
(),
input_shape_
);
std
::
copy
(
out_shape
.
cbegin
(),
out_shape
.
cend
(),
output_shape_
);
}
};
#if defined(PADDLE_WITH_CUDA)
// An RAII encapsulation of cuFFTHandle
class
CuFFTHandle
{
::
cufftHandle
handle_
;
public:
CuFFTHandle
()
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cufftCreate
(
&
handle_
));
}
::
cufftHandle
&
get
()
{
return
handle_
;
}
const
::
cufftHandle
&
get
()
const
{
return
handle_
;
}
~
CuFFTHandle
()
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cufftDestroy
(
handle_
));
}
};
using
plan_size_type
=
long
long
int
;
// NOLINT
// This class contains all the information needed to execute a cuFFT plan:
// 1. the plan
// 2. the workspace size needed
class
CuFFTConfig
{
public:
// Only move semantics is enought for this class. Although we already use
// unique_ptr for the plan, still remove copy constructor and assignment op so
// we don't accidentally copy and take perf hit.
explicit
CuFFTConfig
(
const
PlanKey
&
plan_key
)
:
CuFFTConfig
(
std
::
vector
<
int64_t
>
(
plan_key
.
sizes_
,
plan_key
.
sizes_
+
plan_key
.
signal_ndim_
+
1
),
plan_key
.
signal_ndim_
,
plan_key
.
fft_type_
,
plan_key
.
value_type_
)
{}
// sizes are full signal, including batch size and always two-sided
CuFFTConfig
(
const
std
::
vector
<
int64_t
>&
sizes
,
const
int64_t
signal_ndim
,
FFTTransformType
fft_type
,
ScalarType
dtype
)
:
fft_type_
(
fft_type
),
value_type_
(
dtype
)
{
// signal sizes (excluding batch dim)
std
::
vector
<
plan_size_type
>
signal_sizes
(
sizes
.
begin
()
+
1
,
sizes
.
end
());
// input batch size
const
auto
batch
=
static_cast
<
plan_size_type
>
(
sizes
[
0
]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ
(
signal_ndim
,
sizes
.
size
()
-
1
,
platform
::
errors
::
InvalidArgument
(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]"
,
signal_ndim
,
sizes
.
size
()
-
1
));
cudaDataType
itype
,
otype
,
exec_type
;
const
auto
complex_input
=
has_complex_input
(
fft_type
);
const
auto
complex_output
=
has_complex_output
(
fft_type
);
if
(
dtype
==
framework
::
proto
::
VarType
::
FP32
)
{
itype
=
complex_input
?
CUDA_C_32F
:
CUDA_R_32F
;
otype
=
complex_output
?
CUDA_C_32F
:
CUDA_R_32F
;
exec_type
=
CUDA_C_32F
;
}
else
if
(
dtype
==
framework
::
proto
::
VarType
::
FP64
)
{
itype
=
complex_input
?
CUDA_C_64F
:
CUDA_R_64F
;
otype
=
complex_output
?
CUDA_C_64F
:
CUDA_R_64F
;
exec_type
=
CUDA_C_64F
;
}
else
if
(
dtype
==
framework
::
proto
::
VarType
::
FP16
)
{
itype
=
complex_input
?
CUDA_C_16F
:
CUDA_R_16F
;
otype
=
complex_output
?
CUDA_C_16F
:
CUDA_R_16F
;
exec_type
=
CUDA_C_16F
;
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"cuFFT only support transforms of type float16, float32 and "
"float64"
));
}
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cufftSetAutoAllocation
(
plan
(),
/* autoAllocate */
0
));
size_t
ws_size_t
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cufftXtMakePlanMany
(
plan
(),
signal_ndim
,
signal_sizes
.
data
(),
/* inembed */
nullptr
,
/* base_istride */
1
,
/* idist */
1
,
itype
,
/* onembed */
nullptr
,
/* base_ostride */
1
,
/* odist */
1
,
otype
,
batch
,
&
ws_size_t
,
exec_type
));
ws_size
=
ws_size_t
;
}
const
cufftHandle
&
plan
()
const
{
return
plan_ptr
.
get
();
}
FFTTransformType
transform_type
()
const
{
return
fft_type_
;
}
ScalarType
data_type
()
const
{
return
value_type_
;
}
size_t
workspace_size
()
const
{
return
ws_size
;
}
private:
CuFFTHandle
plan_ptr
;
size_t
ws_size
;
FFTTransformType
fft_type_
;
ScalarType
value_type_
;
};
#elif defined(PADDLE_WITH_HIP)
// An RAII encapsulation of cuFFTHandle
class
HIPFFTHandle
{
::
hipfftHandle
handle_
;
public:
HIPFFTHandle
()
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftCreate
(
&
handle_
));
}
::
hipfftHandle
&
get
()
{
return
handle_
;
}
const
::
hipfftHandle
&
get
()
const
{
return
handle_
;
}
~
HIPFFTHandle
()
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftDestroy
(
handle_
));
}
};
using
plan_size_type
=
int
;
// This class contains all the information needed to execute a cuFFT plan:
// 1. the plan
// 2. the workspace size needed
class
HIPFFTConfig
{
public:
// Only move semantics is enought for this class. Although we already use
// unique_ptr for the plan, still remove copy constructor and assignment op so
// we don't accidentally copy and take perf hit.
explicit
HIPFFTConfig
(
const
PlanKey
&
plan_key
)
:
HIPFFTConfig
(
std
::
vector
<
int64_t
>
(
plan_key
.
sizes_
,
plan_key
.
sizes_
+
plan_key
.
signal_ndim_
+
1
),
plan_key
.
signal_ndim_
,
plan_key
.
fft_type_
,
plan_key
.
value_type_
)
{}
// sizes are full signal, including batch size and always two-sided
HIPFFTConfig
(
const
std
::
vector
<
int64_t
>&
sizes
,
const
int64_t
signal_ndim
,
FFTTransformType
fft_type
,
ScalarType
dtype
)
:
fft_type_
(
fft_type
),
value_type_
(
dtype
)
{
// signal sizes (excluding batch dim)
std
::
vector
<
plan_size_type
>
signal_sizes
(
sizes
.
begin
()
+
1
,
sizes
.
end
());
// input batch size
const
auto
batch
=
static_cast
<
plan_size_type
>
(
sizes
[
0
]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ
(
signal_ndim
,
sizes
.
size
()
-
1
,
platform
::
errors
::
InvalidArgument
(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]"
,
signal_ndim
,
sizes
.
size
()
-
1
));
hipfftType
exec_type
=
[
&
]
{
if
(
dtype
==
framework
::
proto
::
VarType
::
FP32
)
{
switch
(
fft_type
)
{
case
FFTTransformType
::
C2C
:
return
HIPFFT_C2C
;
case
FFTTransformType
::
R2C
:
return
HIPFFT_R2C
;
case
FFTTransformType
::
C2R
:
return
HIPFFT_C2R
;
}
}
else
if
(
dtype
==
framework
::
proto
::
VarType
::
FP64
)
{
switch
(
fft_type
)
{
case
FFTTransformType
::
C2C
:
return
HIPFFT_Z2Z
;
case
FFTTransformType
::
R2C
:
return
HIPFFT_D2Z
;
case
FFTTransformType
::
C2R
:
return
HIPFFT_Z2D
;
}
}
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"hipFFT only support transforms of type float32 and float64"
));
}();
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftSetAutoAllocation
(
plan
(),
/* autoAllocate */
0
));
size_t
ws_size_t
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftMakePlanMany
(
plan
(),
signal_ndim
,
signal_sizes
.
data
(),
/* inembed */
nullptr
,
/* base_istride */
1
,
/* idist */
1
,
/* onembed */
nullptr
,
/* base_ostride */
1
,
/* odist */
1
,
exec_type
,
batch
,
&
ws_size_t
));
ws_size
=
ws_size_t
;
}
const
hipfftHandle
&
plan
()
const
{
return
plan_ptr
.
get
();
}
FFTTransformType
transform_type
()
const
{
return
fft_type_
;
}
ScalarType
data_type
()
const
{
return
value_type_
;
}
size_t
workspace_size
()
const
{
return
ws_size
;
}
private:
HIPFFTHandle
plan_ptr
;
size_t
ws_size
;
FFTTransformType
fft_type_
;
ScalarType
value_type_
;
};
#endif
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/spectral_op.cu
浏览文件 @
1d5746bd
...
...
@@ -8,10 +8,6 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cufft.h>
#include <cufftXt.h>
#include <functional>
#include <list>
#include <memory>
...
...
@@ -24,261 +20,170 @@
#include <vector>
#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/spectral_helper.h"
#include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/
dynload/cufft
.h"
#include "paddle/fluid/platform/
enforce
.h"
namespace
paddle
{
namespace
operators
{
namespace
{
using
ScalarType
=
framework
::
proto
::
VarType
::
Type
;
const
int64_t
kMaxCUFFTNdim
=
3
;
const
int64_t
kMaxDataNdim
=
kMaxCUFFTNdim
+
1
;
static
inline
std
::
string
get_cufft_error_info
(
cufftResult
error
)
{
switch
(
error
)
{
case
CUFFT_SUCCESS
:
return
"CUFFT_SUCCESS"
;
case
CUFFT_INVALID_PLAN
:
return
"CUFFT_INVALID_PLAN"
;
case
CUFFT_ALLOC_FAILED
:
return
"CUFFT_ALLOC_FAILED"
;
case
CUFFT_INVALID_TYPE
:
return
"CUFFT_INVALID_TYPE"
;
case
CUFFT_INVALID_VALUE
:
return
"CUFFT_INVALID_VALUE"
;
case
CUFFT_INTERNAL_ERROR
:
return
"CUFFT_INTERNAL_ERROR"
;
case
CUFFT_EXEC_FAILED
:
return
"CUFFT_EXEC_FAILED"
;
case
CUFFT_SETUP_FAILED
:
return
"CUFFT_SETUP_FAILED"
;
case
CUFFT_INVALID_SIZE
:
return
"CUFFT_INVALID_SIZE"
;
case
CUFFT_UNALIGNED_DATA
:
return
"CUFFT_UNALIGNED_DATA"
;
case
CUFFT_INCOMPLETE_PARAMETER_LIST
:
return
"CUFFT_INCOMPLETE_PARAMETER_LIST"
;
case
CUFFT_INVALID_DEVICE
:
return
"CUFFT_INVALID_DEVICE"
;
case
CUFFT_PARSE_ERROR
:
return
"CUFFT_PARSE_ERROR"
;
case
CUFFT_NO_WORKSPACE
:
return
"CUFFT_NO_WORKSPACE"
;
case
CUFFT_NOT_IMPLEMENTED
:
return
"CUFFT_NOT_IMPLEMENTED"
;
#ifndef __HIPCC__
case
CUFFT_LICENSE_ERROR
:
return
"CUFFT_LICENSE_ERROR"
;
#endif
case
CUFFT_NOT_SUPPORTED
:
return
"CUFFT_NOT_SUPPORTED"
;
default:
std
::
ostringstream
ss
;
ss
<<
"unknown error "
<<
error
;
return
ss
.
str
();
// Calculates the normalization constant
double
fft_normalization_scale
(
FFTNormMode
normalization
,
const
std
::
vector
<
int64_t
>&
sizes
,
const
std
::
vector
<
int64_t
>&
dims
)
{
// auto norm = static_cast<fft_norm_mode>(normalization);
if
(
normalization
==
FFTNormMode
::
none
)
{
return
static_cast
<
double
>
(
1.0
);
}
}
static
inline
void
CUFFT_CHECK
(
cufftResult
error
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
error
);
int64_t
signal_numel
=
1
;
for
(
auto
dim
:
dims
)
{
signal_numel
*=
sizes
[
dim
];
}
const
double
scale_denom
=
(
normalization
==
FFTNormMode
::
by_sqrt_n
)
?
std
::
sqrt
(
signal_numel
)
:
static_cast
<
double
>
(
signal_numel
);
return
static_cast
<
double
>
(
1.0
/
scale_denom
);
}
// This struct is used to easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
struct
PlanKey
{
// between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3
int64_t
signal_ndim_
;
// These include additional batch dimension as well.
int64_t
sizes_
[
kMaxDataNdim
];
int64_t
input_shape_
[
kMaxDataNdim
];
int64_t
output_shape_
[
kMaxDataNdim
];
FFTTransformType
fft_type_
;
ScalarType
value_type_
;
PlanKey
()
=
default
;
PlanKey
(
const
std
::
vector
<
int64_t
>&
in_shape
,
const
std
::
vector
<
int64_t
>&
out_shape
,
const
std
::
vector
<
int64_t
>&
signal_size
,
FFTTransformType
fft_type
,
ScalarType
value_type
)
{
// Padding bits must be zeroed for hashing
memset
(
this
,
0
,
sizeof
(
*
this
));
signal_ndim_
=
signal_size
.
size
()
-
1
;
fft_type_
=
fft_type
;
value_type_
=
value_type
;
std
::
copy
(
signal_size
.
cbegin
(),
signal_size
.
cend
(),
sizes_
);
std
::
copy
(
in_shape
.
cbegin
(),
in_shape
.
cend
(),
input_shape_
);
std
::
copy
(
out_shape
.
cbegin
(),
out_shape
.
cend
(),
output_shape_
);
template
<
typename
DeviceContext
,
typename
T
>
void
exec_normalization
(
const
DeviceContext
&
ctx
,
const
Tensor
*
in
,
Tensor
*
out
,
FFTNormMode
normalization
,
const
std
::
vector
<
int64_t
>&
sizes
,
const
std
::
vector
<
int64_t
>&
axes
)
{
double
scale
=
fft_normalization_scale
(
normalization
,
sizes
,
axes
);
if
(
scale
!=
1.0
)
{
auto
eigen_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
out
);
auto
eigen_in
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
in
);
auto
dev
=
ctx
.
eigen_device
();
EigenScale
<
Eigen
::
GpuDevice
,
T
>::
Eval
(
*
dev
,
eigen_out
,
eigen_in
,
static_cast
<
T
>
(
scale
),
static_cast
<
T
>
(
0
),
false
);
}
else
{
framework
::
TensorCopy
(
*
in
,
ctx
.
GetPlace
(),
out
);
}
}
;
}
// An RAII encapsulation of cuFFTHandle
class
CuFFTHandle
{
::
cufftHandle
handle_
;
#if defined(PADDLE_WITH_CUDA)
CuFFTConfig
create_cufft_config
(
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
int
signal_ndim
)
{
// Create the transform plan (either from cache or locally)
const
auto
value_type
=
framework
::
IsComplexType
(
input
.
type
())
?
framework
::
ToRealType
(
input
.
type
())
:
input
.
type
();
auto
fft_type
=
GetFFTTransformType
(
input
.
type
(),
output
.
type
());
// signal sizes
std
::
vector
<
int64_t
>
signal_size
(
signal_ndim
+
1
);
public:
CuFFTHandle
()
{
CUFFT_CHECK
(
platform
::
dynload
::
cufftCreate
(
&
handle_
));
}
signal_size
[
0
]
=
input
.
dims
()[
0
];
for
(
int64_t
i
=
1
;
i
<=
signal_ndim
;
++
i
)
{
auto
in_size
=
input
.
dims
()[
i
];
auto
out_size
=
output
.
dims
()[
i
];
signal_size
[
i
]
=
std
::
max
(
in_size
,
out_size
);
}
PlanKey
key
(
framework
::
vectorize
(
input
.
dims
()),
framework
::
vectorize
(
output
.
dims
()),
signal_size
,
fft_type
,
value_type
);
::
cufftHandle
&
get
()
{
return
handle_
;
}
const
::
cufftHandle
&
get
()
const
{
return
handle_
;
}
return
CuFFTConfig
(
key
);
}
~
CuFFTHandle
()
{
// Not using fftDestroy() for rocFFT to work around double freeing of handles
#ifndef __HIPCC__
CUFFT_CHECK
(
platform
::
dynload
::
cufftDestroy
(
handle_
));
#endif
}
};
// Execute a pre-planned transform
static
void
exec_cufft_plan_raw
(
const
CuFFTConfig
&
config
,
void
*
in_data
,
void
*
out_data
,
bool
forward
)
{
auto
&
plan
=
config
.
plan
();
#ifdef __HIPCC__
using
plan_size_type
=
int
;
#else
using
plan_size_type
=
long
long
int
;
// NOLINT
#endif
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cufftXtExec
(
plan
,
in_data
,
out_data
,
forward
?
CUFFT_FORWARD
:
CUFFT_INVERSE
));
}
// This class contains all the information needed to execute a cuFFT plan:
// 1. the plan
// 2. the workspace size needed
class
CuFFTConfig
{
public:
// Only move semantics is enought for this class. Although we already use
// unique_ptr for the plan, still remove copy constructor and assignment op so
// we don't accidentally copy and take perf hit.
CuFFTConfig
(
const
CuFFTConfig
&
)
=
delete
;
CuFFTConfig
&
operator
=
(
CuFFTConfig
const
&
)
=
delete
;
explicit
CuFFTConfig
(
const
PlanKey
&
plan_key
)
:
CuFFTConfig
(
std
::
vector
<
int64_t
>
(
plan_key
.
sizes_
,
plan_key
.
sizes_
+
plan_key
.
signal_ndim_
+
1
),
plan_key
.
signal_ndim_
,
plan_key
.
fft_type_
,
plan_key
.
value_type_
)
{}
// sizes are full signal, including batch size and always two-sided
CuFFTConfig
(
const
std
::
vector
<
int64_t
>&
sizes
,
const
int64_t
signal_ndim
,
FFTTransformType
fft_type
,
ScalarType
dtype
)
:
fft_type_
(
fft_type
),
value_type_
(
dtype
)
{
// signal sizes (excluding batch dim)
std
::
vector
<
plan_size_type
>
signal_sizes
(
sizes
.
begin
()
+
1
,
sizes
.
end
());
// input batch size
const
auto
batch
=
static_cast
<
plan_size_type
>
(
sizes
[
0
]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ
(
signal_ndim
,
sizes
.
size
()
-
1
,
platform
::
errors
::
InvalidArgument
(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]"
,
signal_ndim
,
sizes
.
size
()
-
1
));
#ifdef __HIPCC__
hipfftType
exec_type
=
[
&
]
{
if
(
dtype
==
framework
::
proto
::
VarType
::
FP32
)
{
switch
(
fft_type
)
{
case
FFTTransformType
::
C2C
:
return
HIPFFT_C2C
;
case
FFTTransformType
::
R2C
:
return
HIPFFT_R2C
;
case
FFTTransformType
::
C2R
:
return
HIPFFT_C2R
;
}
}
else
if
(
dtype
==
framework
::
proto
::
VarType
::
FP64
)
{
switch
(
fft_type
)
{
case
FFTTransformType
::
C2C
:
return
HIPFFT_Z2Z
;
case
FFTTransformType
::
R2C
:
return
HIPFFT_D2Z
;
case
FFTTransformType
::
C2R
:
return
HIPFFT_Z2D
;
}
}
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"hipFFT only support transforms of type float32 and float64"
));
}();
#else
cudaDataType
itype
,
otype
,
exec_type
;
const
auto
complex_input
=
has_complex_input
(
fft_type
);
const
auto
complex_output
=
has_complex_output
(
fft_type
);
if
(
dtype
==
framework
::
proto
::
VarType
::
FP32
)
{
itype
=
complex_input
?
CUDA_C_32F
:
CUDA_R_32F
;
otype
=
complex_output
?
CUDA_C_32F
:
CUDA_R_32F
;
exec_type
=
CUDA_C_32F
;
}
else
if
(
dtype
==
framework
::
proto
::
VarType
::
FP64
)
{
itype
=
complex_input
?
CUDA_C_64F
:
CUDA_R_64F
;
otype
=
complex_output
?
CUDA_C_64F
:
CUDA_R_64F
;
exec_type
=
CUDA_C_64F
;
}
else
if
(
dtype
==
framework
::
proto
::
VarType
::
FP16
)
{
itype
=
complex_input
?
CUDA_C_16F
:
CUDA_R_16F
;
otype
=
complex_output
?
CUDA_C_16F
:
CUDA_R_16F
;
exec_type
=
CUDA_C_16F
;
template
<
typename
DeviceContext
,
typename
Ti
,
typename
To
>
void
exec_cufft_plan
(
const
DeviceContext
&
ctx
,
const
CuFFTConfig
&
config
,
framework
::
Tensor
*
input
,
framework
::
Tensor
*
output
,
bool
forward
)
{
// execute transform plan
auto
fft_type
=
config
.
transform_type
();
if
(
fft_type
==
FFTTransformType
::
C2R
&&
forward
)
{
forward
=
false
;
framework
::
Tensor
input_conj
(
input
->
type
());
input_conj
.
mutable_data
<
Ti
>
(
input
->
dims
(),
ctx
.
GetPlace
());
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
input
->
numel
());
math
::
ConjFunctor
<
Ti
>
functor
(
input
->
data
<
Ti
>
(),
input
->
numel
(),
input_conj
.
data
<
Ti
>
());
for_range
(
functor
);
exec_cufft_plan_raw
(
config
,
input_conj
.
data
<
void
>
(),
output
->
data
<
void
>
(),
forward
);
}
else
if
(
fft_type
==
FFTTransformType
::
R2C
&&
!
forward
)
{
forward
=
true
;
framework
::
Tensor
out_conj
(
output
->
type
());
out_conj
.
mutable_data
<
To
>
(
output
->
dims
(),
ctx
.
GetPlace
());
exec_cufft_plan_raw
(
config
,
input
->
data
<
void
>
(),
out_conj
.
data
<
void
>
(),
forward
);
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
output
->
numel
());
math
::
ConjFunctor
<
To
>
functor
(
out_conj
.
data
<
To
>
(),
output
->
numel
(),
output
->
data
<
To
>
());
for_range
(
functor
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"cuFFT only support transforms of type float16, float32 and "
"float64"
));
exec_cufft_plan_raw
(
config
,
input
->
data
<
void
>
(),
output
->
data
<
void
>
(),
forward
);
}
#endif
// disable auto allocation of workspace to use allocator from the framework
CUFFT_CHECK
(
platform
::
dynload
::
cufftSetAutoAllocation
(
plan
(),
/* autoAllocate */
0
));
size_t
ws_size_t
;
// make plan
#ifdef __HIPCC__
CUFFT_CHECK
(
hipfftMakePlanMany
(
plan
(),
signal_ndim
,
signal_sizes
.
data
(),
/* inembed */
nullptr
,
/* base_istride */
1
,
/* idist */
1
,
/* onembed */
nullptr
,
/* base_ostride */
1
,
/* odist */
1
,
exec_type
,
batch
,
&
ws_size_t
));
#else
CUFFT_CHECK
(
platform
::
dynload
::
cufftXtMakePlanMany
(
plan
(),
signal_ndim
,
signal_sizes
.
data
(),
/* inembed */
nullptr
,
/* base_istride */
1
,
/* idist */
1
,
itype
,
/* onembed */
nullptr
,
/* base_ostride */
1
,
/* odist */
1
,
otype
,
batch
,
&
ws_size_t
,
exec_type
));
#endif
}
ws_size
=
ws_size_t
;
}
#elif defined(PADDLE_WITH_HIP)
const
cufftHandle
&
plan
()
const
{
return
plan_ptr
.
get
();
}
HIPFFTConfig
create_hipfft_config
(
const
framework
::
Tensor
&
input
,
const
framework
::
Tensor
&
output
,
int
signal_ndim
)
{
// Create the transform plan (either from cache or locally)
const
auto
value_type
=
framework
::
IsComplexType
(
input
.
type
())
?
framework
::
ToRealType
(
input
.
type
())
:
input
.
type
();
auto
fft_type
=
GetFFTTransformType
(
input
.
type
(),
output
.
type
());
// signal sizes
std
::
vector
<
int64_t
>
signal_size
(
signal_ndim
+
1
);
FFTTransformType
transform_type
()
const
{
return
fft_type_
;
}
ScalarType
data_type
()
const
{
return
value_type_
;
}
size_t
workspace_size
()
const
{
return
ws_size
;
}
signal_size
[
0
]
=
input
.
dims
()[
0
];
for
(
int64_t
i
=
1
;
i
<=
signal_ndim
;
++
i
)
{
auto
in_size
=
input
.
dims
()[
i
];
auto
out_size
=
output
.
dims
()[
i
];
signal_size
[
i
]
=
std
::
max
(
in_size
,
out_size
);
}
PlanKey
key
(
framework
::
vectorize
(
input
.
dims
()),
framework
::
vectorize
(
output
.
dims
()),
signal_size
,
fft_type
,
value_type
);
private:
CuFFTHandle
plan_ptr
;
size_t
ws_size
;
FFTTransformType
fft_type_
;
ScalarType
value_type_
;
};
return
HIPFFTConfig
(
key
);
}
// Execute a pre-planned transform
static
void
exec_
cufft_plan
(
const
Cu
FFTConfig
&
config
,
void
*
in_data
,
static
void
exec_
hipfft_plan_raw
(
const
HIP
FFTConfig
&
config
,
void
*
in_data
,
void
*
out_data
,
bool
forward
)
{
auto
&
plan
=
config
.
plan
();
#ifdef __HIPCC__
auto
value_type
=
config
.
data_type
();
if
(
value_type
==
framework
::
proto
::
VarType
::
FP32
)
{
switch
(
config
.
transform_type
())
{
case
FFTTransformType
::
C2C
:
{
CUFFT_CHECK
(
hipfftExecC2C
(
plan
,
static_cast
<
hipfftComplex
*>
(
in_data
),
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftExecC2C
(
plan
,
static_cast
<
hipfftComplex
*>
(
in_data
),
static_cast
<
hipfftComplex
*>
(
out_data
),
forward
?
HIPFFT_FORWARD
:
HIPFFT_BACKWARD
));
return
;
}
case
FFTTransformType
::
R2C
:
{
CUFFT_CHECK
(
hipfftExecR2C
(
plan
,
static_cast
<
hipfftReal
*>
(
in_data
),
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftExecR2C
(
plan
,
static_cast
<
hipfftReal
*>
(
in_data
),
static_cast
<
hipfftComplex
*>
(
out_data
)));
return
;
}
case
FFTTransformType
::
C2R
:
{
CUFFT_CHECK
(
hipfftExecC2R
(
plan
,
static_cast
<
hipfftComplex
*>
(
in_data
),
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftExecC2R
(
plan
,
static_cast
<
hipfftComplex
*>
(
in_data
),
static_cast
<
hipfftReal
*>
(
out_data
)));
return
;
}
...
...
@@ -286,20 +191,21 @@ static void exec_cufft_plan(const CuFFTConfig& config, void* in_data,
}
else
if
(
value_type
==
framework
::
proto
::
VarType
::
FP64
)
{
switch
(
config
.
transform_type
())
{
case
FFTTransformType
::
C2C
:
{
CUFFT_CHECK
(
hipfftExecZ2Z
(
plan
,
static_cast
<
hipfftDoubleComplex
*>
(
in_data
),
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftExecZ2Z
(
plan
,
static_cast
<
hipfftDoubleComplex
*>
(
in_data
),
static_cast
<
hipfftDoubleComplex
*>
(
out_data
),
forward
?
HIPFFT_FORWARD
:
HIPFFT_BACKWARD
));
return
;
}
case
FFTTransformType
::
R2C
:
{
CUFFT_CHECK
(
hipfftExecD2Z
(
plan
,
static_cast
<
hipfftDoubleReal
*>
(
in_data
),
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftExecD2Z
(
plan
,
static_cast
<
hipfftDoubleReal
*>
(
in_data
),
static_cast
<
hipfftDoubleComplex
*>
(
out_data
)));
return
;
}
case
FFTTransformType
::
C2R
:
{
CUFFT_CHECK
(
hipfftExecZ2D
(
plan
,
static_cast
<
hipfftDoubleComplex
*>
(
in_data
),
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftExecZ2D
(
plan
,
static_cast
<
hipfftDoubleComplex
*>
(
in_data
),
static_cast
<
hipfftDoubleReal
*>
(
out_data
)));
return
;
}
...
...
@@ -307,28 +213,53 @@ static void exec_cufft_plan(const CuFFTConfig& config, void* in_data,
}
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"hipFFT only support transforms of type float32 and float64"
));
#else
CUFFT_CHECK
(
platform
::
dynload
::
cufftXtExec
(
plan
,
in_data
,
out_data
,
forward
?
CUFFT_FORWARD
:
CUFFT_INVERSE
));
#endif
}
template
<
typename
DeviceContext
,
typename
Ti
,
typename
To
>
void
exec_hipfft_plan
(
const
DeviceContext
&
ctx
,
const
HIPFFTConfig
&
config
,
framework
::
Tensor
*
input
,
framework
::
Tensor
*
output
,
bool
forward
)
{
auto
fft_type
=
config
.
transform_type
();
if
(
fft_type
==
FFTTransformType
::
C2R
&&
forward
)
{
forward
=
false
;
framework
::
Tensor
input_conj
(
input
->
type
());
input_conj
.
mutable_data
<
Ti
>
(
input
->
dims
(),
ctx
.
GetPlace
());
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
input
->
numel
());
math
::
ConjFunctor
<
Ti
>
functor
(
input
->
data
<
Ti
>
(),
input
->
numel
(),
input_conj
.
data
<
Ti
>
());
for_range
(
functor
);
exec_hipfft_plan_raw
(
config
,
input_conj
.
data
<
void
>
(),
output
->
data
<
void
>
(),
forward
);
}
else
if
(
fft_type
==
FFTTransformType
::
R2C
&&
!
forward
)
{
forward
=
true
;
framework
::
Tensor
out_conj
(
output
->
type
());
out_conj
.
mutable_data
<
To
>
(
output
->
dims
(),
ctx
.
GetPlace
());
exec_hipfft_plan_raw
(
config
,
input
->
data
<
void
>
(),
out_conj
.
data
<
void
>
(),
forward
);
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
output
->
numel
());
math
::
ConjFunctor
<
To
>
functor
(
out_conj
.
data
<
To
>
(),
output
->
numel
(),
output
->
data
<
To
>
());
for_range
(
functor
);
}
else
{
exec_hipfft_plan_raw
(
config
,
input
->
data
<
void
>
(),
output
->
data
<
void
>
(),
forward
);
}
}
#endif
// Execute a general unnormalized fft operation (can be c2c, onesided r2c or
// onesided c2r)
template
<
typename
DeviceContext
,
typename
Ti
,
typename
To
>
void
exec_fft
(
const
DeviceContext
&
ctx
,
const
Tensor
*
X
,
Tensor
*
out
,
const
std
::
vector
<
int64_t
>&
dim
,
bool
forward
)
{
const
auto
x_dims
=
framework
::
vectorize
(
X
->
dims
());
const
auto
out_dims
=
framework
::
vectorize
(
out
->
dims
());
const
int64_t
ndim
=
static_cast
<
int64_t
>
(
X
->
dims
().
size
());
const
int64_t
signal_ndim
=
static_cast
<
int64_t
>
(
dim
.
size
());
const
int64_t
batch_dims
=
ndim
-
signal_ndim
;
auto
tensor_place
=
ctx
.
GetPlace
();
//
Transpose batch dimensions first, then with transforming dims
//
make a dim permutation
std
::
vector
<
int
>
dim_permute
(
ndim
);
std
::
vector
<
int
>
reverse_dim_permute
(
ndim
);
std
::
vector
<
int64_t
>
trans_dims
(
ndim
);
std
::
iota
(
dim_permute
.
begin
(),
dim_permute
.
end
(),
int
{
0
});
std
::
vector
<
bool
>
is_transformed_dim
(
ndim
);
for
(
const
auto
&
d
:
dim
)
{
...
...
@@ -340,160 +271,89 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
std
::
sort
(
dim_permute
.
begin
(),
batch_end
);
std
::
copy
(
dim
.
cbegin
(),
dim
.
cend
(),
batch_end
);
for
(
size_t
i
=
0
;
i
<
ndim
;
i
++
)
{
trans_dims
[
i
]
=
x_dims
[
dim_permute
[
i
]];
// shape of input transpose
reverse_dim_permute
[
dim_permute
[
i
]]
=
static_cast
<
int
>
(
i
);
// reverse of dim permute
}
framework
::
Tensor
input
;
input
.
Resize
(
framework
::
make_ddim
(
trans_dims
));
input
.
mutable_data
<
Ti
>
(
tensor_place
);
/*
auto in_ret = TransposeSimple<Ti>::run(ctx, *X, dim_permute, input);
if (!in_ret) {
TransCompute<DeviceContext, Ti>(ndim, ctx, *X, input, dim_permute);
}
*/
TransCompute
<
DeviceContext
,
Ti
>
(
ndim
,
ctx
,
*
X
,
&
input
,
dim_permute
);
// transpose input according to dim permutation
auto
transposed_input_shape
=
X
->
dims
().
transpose
(
dim_permute
);
framework
::
Tensor
transposed_input
;
transposed_input
.
Resize
(
transposed_input_shape
);
transposed_input
.
mutable_data
<
Ti
>
(
tensor_place
);
TransCompute
<
DeviceContext
,
Ti
>
(
ndim
,
ctx
,
*
X
,
&
transposed_input
,
dim_permute
);
// Reshape batch dimensions into a single dimension
std
::
vector
<
int64_t
>
batched_sizes
(
signal_ndim
+
1
);
const
int64_t
signal_ndim
=
static_cast
<
int64_t
>
(
dim
.
size
());
std
::
vector
<
int64_t
>
collapsed_input_shape
(
signal_ndim
+
1
);
auto
transposed_input_shape_
=
framework
::
vectorize
(
transposed_input_shape
);
const
int64_t
batch_dims
=
ndim
-
signal_ndim
;
auto
batch_size
=
std
::
accumulate
(
trans_dims
.
begin
(),
trans_dims
.
begin
()
+
batch_dims
,
std
::
accumulate
(
transposed_input_shape_
.
begin
(),
transposed_input_shape_
.
begin
()
+
batch_dims
,
static_cast
<
int
>
(
1
),
std
::
multiplies
<
int
>
());
batched_sizes
[
0
]
=
batch_size
;
std
::
copy
(
trans_dims
.
begin
()
+
batch_dims
,
trans_dims
.
end
(),
batched_sizes
.
begin
()
+
1
);
input
.
Resize
(
framework
::
make_ddim
(
batched_sizes
));
collapsed_input_shape
[
0
]
=
batch_size
;
// Check the shape of transforming dims with input and output
std
::
vector
<
int64_t
>
signal_size
(
signal_ndim
+
1
);
signal_size
[
0
]
=
batch_size
;
for
(
int64_t
i
=
0
;
i
<
signal_ndim
;
++
i
)
{
auto
in_size
=
input
.
dims
()[
i
+
1
];
auto
out_size
=
out_dims
[
dim
[
i
]];
signal_size
[
i
+
1
]
=
std
::
max
(
in_size
,
out_size
);
PADDLE_ENFORCE_EQ
(
(
in_size
==
signal_size
[
i
+
1
]
||
in_size
==
(
signal_size
[
i
+
1
]
/
2
)
+
1
),
true
,
platform
::
errors
::
InvalidArgument
(
"The dimension[%d] of Input size: [%d] must be equal or half to "
"The dimension[%d] of Output size: [%d]"
,
dim
[
i
],
in_size
,
dim
[
i
],
out_size
));
PADDLE_ENFORCE_EQ
(
(
out_size
==
signal_size
[
i
+
1
]
||
out_size
==
(
signal_size
[
i
+
1
]
/
2
)
+
1
),
true
,
platform
::
errors
::
InvalidArgument
(
"The dimension[%d] of Output size: [%d] must be equal or half to "
"The dimension[%d] of Input size: [%d]"
,
dim
[
i
],
out_size
,
dim
[
i
],
in_size
));
}
std
::
copy
(
transposed_input_shape_
.
begin
()
+
batch_dims
,
transposed_input_shape_
.
end
(),
collapsed_input_shape
.
begin
()
+
1
);
std
::
vector
<
int64_t
>
reshape_out_sizes
(
ndim
);
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
reshape_out_sizes
[
i
]
=
out_dims
[
dim_permute
[
i
]];
}
std
::
vector
<
int64_t
>
batched_out_sizes
(
batched_sizes
.
begin
(),
batched_sizes
.
end
());
framework
::
Tensor
&
collapsed_input
=
transposed_input
;
collapsed_input
.
Resize
(
framework
::
make_ddim
(
collapsed_input_shape
));
// make a collpased output
const
auto
out_dims
=
framework
::
vectorize
(
out
->
dims
());
std
::
vector
<
int64_t
>
collapsed_output_shape
(
1
+
signal_ndim
);
collapsed_output_shape
[
0
]
=
batch_size
;
for
(
size_t
i
=
0
;
i
<
dim
.
size
();
++
i
)
{
batched_out_sizes
[
i
+
1
]
=
out_dims
[
dim
[
i
]];
collapsed_output_shape
[
i
+
1
]
=
out_dims
[
dim
[
i
]];
}
framework
::
Tensor
collapsed_output
;
collapsed_output
.
Resize
(
framework
::
make_ddim
(
collapsed_output_shape
));
collapsed_output
.
mutable_data
<
To
>
(
tensor_place
);
// output
framework
::
Tensor
output
;
output
.
Resize
(
framework
::
make_ddim
(
batched_out_sizes
));
output
.
mutable_data
<
To
>
(
tensor_place
);
// Create the transform plan (either from cache or locally)
const
auto
value_type
=
framework
::
IsComplexType
(
input
.
type
())
?
framework
::
ToRealType
(
input
.
type
())
:
input
.
type
();
auto
fft_type
=
GetFFTTransformType
(
input
.
type
(),
output
.
type
());
PlanKey
Key
(
framework
::
vectorize
(
input
.
dims
()),
framework
::
vectorize
(
output
.
dims
()),
signal_size
,
fft_type
,
value_type
);
CuFFTConfig
uncached_plan
(
Key
);
CuFFTConfig
*
config
=
&
uncached_plan
;
auto
&
plan
=
config
->
plan
();
#if defined(PADDLE_WITH_CUDA)
// create plan
CuFFTConfig
config
=
create_cufft_config
(
collapsed_input
,
collapsed_output
,
signal_ndim
);
// prepare cufft for execution
CUFFT_CHECK
(
platform
::
dynload
::
cufftSetStream
(
plan
,
ctx
.
stream
()));
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cufftSetStream
(
config
.
plan
(),
ctx
.
stream
()));
framework
::
Tensor
workspace_tensor
;
workspace_tensor
.
mutable_data
<
To
>
(
tensor_place
,
config
->
workspace_size
());
CUFFT_CHECK
(
platform
::
dynload
::
cufftSetWorkArea
(
plan
,
workspace_tensor
.
data
<
To
>
()));
workspace_tensor
.
mutable_data
<
To
>
(
tensor_place
,
config
.
workspace_size
());
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cufftSetWorkArea
(
config
.
plan
(),
workspace_tensor
.
data
<
To
>
()));
// execute transform plan
if
(
fft_type
==
FFTTransformType
::
C2R
&&
forward
)
{
forward
=
false
;
framework
::
Tensor
input_conj
(
input
.
type
());
input_conj
.
mutable_data
<
Ti
>
(
input
.
dims
(),
ctx
.
GetPlace
());
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
input
.
numel
());
math
::
ConjFunctor
<
Ti
>
functor
(
input
.
data
<
Ti
>
(),
input
.
numel
(),
input_conj
.
data
<
Ti
>
());
for_range
(
functor
);
exec_cufft_plan
(
*
config
,
input_conj
.
data
<
void
>
(),
output
.
data
<
void
>
(),
forward
);
}
else
if
(
fft_type
==
FFTTransformType
::
R2C
&&
!
forward
)
{
forward
=
true
;
framework
::
Tensor
out_conj
(
output
.
type
());
out_conj
.
mutable_data
<
To
>
(
output
.
dims
(),
ctx
.
GetPlace
());
exec_cufft_plan
(
*
config
,
input
.
data
<
void
>
(),
out_conj
.
data
<
void
>
(),
forward
);
exec_cufft_plan
<
DeviceContext
,
Ti
,
To
>
(
ctx
,
config
,
&
collapsed_input
,
&
collapsed_output
,
forward
);
platform
::
ForRange
<
DeviceContext
>
for_range
(
ctx
,
output
.
numel
());
math
::
ConjFunctor
<
To
>
functor
(
out_conj
.
data
<
To
>
(),
output
.
numel
(),
output
.
data
<
To
>
());
for_range
(
functor
);
}
else
{
exec_cufft_plan
(
*
config
,
input
.
data
<
void
>
(),
output
.
data
<
void
>
(),
forward
);
}
#elif defined(PADDLE_WITH_HIP)
// create plan
HIPFFTConfig
config
=
create_hipfft_config
(
collapsed_input
,
collapsed_output
,
signal_ndim
);
// prepare cufft for execution
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftSetStream
(
config
.
plan
(),
ctx
.
stream
()));
framework
::
Tensor
workspace_tensor
;
workspace_tensor
.
mutable_data
<
To
>
(
tensor_place
,
config
.
workspace_size
());
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
hipfftSetWorkArea
(
config
.
plan
(),
workspace_tensor
.
data
<
To
>
()));
// execute transform plan
exec_hipfft_plan
<
DeviceContext
,
Ti
,
To
>
(
ctx
,
config
,
&
collapsed_input
,
&
collapsed_output
,
forward
);
#endif
// Inverting output by reshape and transpose to original batch and dimension
output
.
Resize
(
framework
::
make_ddim
(
reshape_out_sizes
));
out
->
Resize
(
framework
::
make_ddim
(
out_dims
));
TransCompute
<
DeviceContext
,
To
>
(
ndim
,
ctx
,
output
,
out
,
reverse_dim_permute
);
}
auto
transposed_out_shape
=
out
->
dims
().
transpose
(
dim_permute
);
// Calculates the normalization constant
double
fft_normalization_scale
(
FFTNormMode
normalization
,
const
std
::
vector
<
int64_t
>&
sizes
,
const
std
::
vector
<
int64_t
>&
dims
)
{
// auto norm = static_cast<fft_norm_mode>(normalization);
if
(
normalization
==
FFTNormMode
::
none
)
{
return
static_cast
<
double
>
(
1.0
);
}
collapsed_output
.
Resize
(
transposed_out_shape
);
auto
&
transposed_output
=
collapsed_output
;
int64_t
signal_numel
=
1
;
for
(
auto
dim
:
dims
)
{
signal_numel
*=
sizes
[
dim
]
;
std
::
vector
<
int
>
reverse_dim_permute
(
ndim
)
;
for
(
size_t
i
=
0
;
i
<
ndim
;
i
++
)
{
reverse_dim_permute
[
dim_permute
[
i
]]
=
i
;
}
const
double
scale_denom
=
(
normalization
==
FFTNormMode
::
by_sqrt_n
)
?
std
::
sqrt
(
signal_numel
)
:
static_cast
<
double
>
(
signal_numel
);
return
static_cast
<
double
>
(
1.0
/
scale_denom
);
}
template
<
typename
DeviceContext
,
typename
T
>
void
exec_normalization
(
const
DeviceContext
&
ctx
,
const
Tensor
*
in
,
Tensor
*
out
,
FFTNormMode
normalization
,
const
std
::
vector
<
int64_t
>&
sizes
,
const
std
::
vector
<
int64_t
>&
axes
)
{
double
scale
=
fft_normalization_scale
(
normalization
,
sizes
,
axes
);
if
(
scale
!=
1.0
)
{
auto
eigen_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
out
);
auto
eigen_in
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
in
);
auto
dev
=
ctx
.
eigen_device
();
EigenScale
<
Eigen
::
GpuDevice
,
T
>::
Eval
(
*
dev
,
eigen_out
,
eigen_in
,
static_cast
<
T
>
(
scale
),
static_cast
<
T
>
(
0
),
false
);
}
else
{
framework
::
TensorCopy
(
*
in
,
ctx
.
GetPlace
(),
out
);
}
TransCompute
<
DeviceContext
,
To
>
(
ndim
,
ctx
,
transposed_output
,
out
,
reverse_dim_permute
);
}
}
// anonymous namespace
// Use the optimized path to perform single R2C or C2R if transformation dim is
...
...
paddle/fluid/platform/dynload/CMakeLists.txt
浏览文件 @
1d5746bd
...
...
@@ -7,7 +7,7 @@ if (NOT WITH_NV_JETSON)
endif
()
if
(
WITH_ROCM
)
list
(
APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc
)
list
(
APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc
hipfft.cc
)
endif
()
# There is no macOS version of NCCL.
...
...
paddle/fluid/platform/dynload/dynamic_loader.cc
浏览文件 @
1d5746bd
...
...
@@ -356,6 +356,16 @@ void* GetCurandDsoHandle() {
#endif
}
#ifdef PADDLE_WITH_HIP
void
*
GetROCFFTDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_rocm_dir
,
"librocfft.dylib"
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_rocm_dir
,
"librocfft.so"
);
#endif
}
#endif
void
*
GetNvjpegDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_cuda_dir
,
"libnvjpeg.dylib"
);
...
...
paddle/fluid/platform/dynload/dynamic_loader.h
浏览文件 @
1d5746bd
...
...
@@ -44,6 +44,7 @@ void* GetOpDsoHandle(const std::string& dso_name);
void
*
GetNvtxDsoHandle
();
void
*
GetCUFFTDsoHandle
();
void
*
GetMKLRTDsoHandle
();
void
*
GetROCFFTDsoHandle
();
void
SetPaddleLibPath
(
const
std
::
string
&
);
}
// namespace dynload
...
...
paddle/fluid/platform/dynload/hipfft.cc
0 → 100644
浏览文件 @
1d5746bd
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/hipfft.h"
namespace
paddle
{
namespace
platform
{
namespace
dynload
{
std
::
once_flag
hipfft_dso_flag
;
void
*
hipfft_dso_handle
;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
HIPFFT_FFT_ROUTINE_EACH
(
DEFINE_WRAP
);
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/dynload/hipfft.h
0 → 100644
浏览文件 @
1d5746bd
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HIP
#include <hipfft.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace
paddle
{
namespace
platform
{
namespace
dynload
{
extern
std
::
once_flag
hipfft_dso_flag
;
extern
void
*
hipfft_dso_handle
;
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hipfftFunc = decltype(&::__name); \
std::call_once(hipfft_dso_flag, []() { \
hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
}); \
static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \
return reinterpret_cast<hipfftFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
__macro(hipfftPlan1d); \
__macro(hipfftPlan2d); \
__macro(hipfftPlan3d); \
__macro(hipfftPlanMany); \
__macro(hipfftMakePlan1d); \
__macro(hipfftMakePlanMany); \
__macro(hipfftMakePlanMany64); \
__macro(hipfftGetSizeMany64); \
__macro(hipfftEstimate1d); \
__macro(hipfftEstimate2d); \
__macro(hipfftEstimate3d); \
__macro(hipfftEstimateMany); \
__macro(hipfftCreate); \
__macro(hipfftGetSize1d); \
__macro(hipfftGetSizeMany); \
__macro(hipfftGetSize); \
__macro(hipfftSetWorkArea); \
__macro(hipfftSetAutoAllocation); \
__macro(hipfftExecC2C); \
__macro(hipfftExecR2C); \
__macro(hipfftExecC2R); \
__macro(hipfftExecZ2Z); \
__macro(hipfftExecD2Z); \
__macro(hipfftExecZ2D); \
__macro(hipfftSetStream); \
__macro(hipfftDestroy); \
__macro(hipfftGetVersion); \
__macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP
);
inline
const
char
*
hipfftGetErrorString
(
hipfftResult_t
status
)
{
switch
(
status
)
{
case
HIPFFT_SUCCESS
:
return
"'HIPFFT_SUCCESS'. The hipFFT operation was successful."
;
case
HIPFFT_INVALID_PLAN
:
return
"'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle."
;
case
HIPFFT_ALLOC_FAILED
:
return
"'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
"memory."
;
case
HIPFFT_INVALID_TYPE
:
return
"'HIPFFT_INVALID_TYPE'. No longer used."
;
case
HIPFFT_INVALID_VALUE
:
return
"'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
"parameter."
;
case
HIPFFT_INTERNAL_ERROR
:
return
"'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
"error."
;
case
HIPFFT_EXEC_FAILED
:
return
"'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU."
;
case
HIPFFT_SETUP_FAILED
:
return
"'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize."
;
case
HIPFFT_INVALID_SIZE
:
return
"'HIPFFT_INVALID_SIZE'. User specified an invalid transform size."
;
case
HIPFFT_UNALIGNED_DATA
:
return
"'HIPFFT_UNALIGNED_DATA'. No longer used."
;
case
HIPFFT_INCOMPLETE_PARAMETER_LIST
:
return
"'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call."
;
case
HIPFFT_INVALID_DEVICE
:
return
"'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
"GPU than plan creation."
;
case
HIPFFT_PARSE_ERROR
:
return
"'HIPFFT_PARSE_ERROR'. Internal plan database error."
;
case
HIPFFT_NO_WORKSPACE
:
return
"'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
"plan execution."
;
case
HIPFFT_NOT_IMPLEMENTED
:
return
"'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
"functionality for parameters given."
;
case
HIPFFT_NOT_SUPPORTED
:
return
"'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
"parameters given."
;
default:
return
"HIPFFT_STATUS_UNKNOWN_ERROR"
;
}
}
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
#endif
paddle/fluid/platform/enforce.h
浏览文件 @
1d5746bd
...
...
@@ -86,6 +86,7 @@ limitations under the License. */
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#include "paddle/fluid/platform/dynload/hiprand.h"
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
...
...
@@ -1113,6 +1114,14 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
}
#endif // not(__APPLE__) and PADDLE_WITH_NCCL
/***** HIPFFT ERROR *****/
inline
bool
is_error
(
hipfftResult_t
stat
)
{
return
stat
!=
HIPFFT_SUCCESS
;
}
inline
std
::
string
build_rocm_error_msg
(
hipfftResult_t
stat
)
{
std
::
string
msg
(
" HIPFFT error, "
);
return
msg
+
platform
::
dynload
::
hipfftGetErrorString
(
stat
)
+
" "
;
}
namespace
details
{
template
<
typename
T
>
...
...
@@ -1129,6 +1138,7 @@ DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess);
DEFINE_EXTERNAL_API_TYPE
(
hiprandStatus_t
,
HIPRAND_STATUS_SUCCESS
);
DEFINE_EXTERNAL_API_TYPE
(
miopenStatus_t
,
miopenStatusSuccess
);
DEFINE_EXTERNAL_API_TYPE
(
rocblas_status
,
rocblas_status_success
);
DEFINE_EXTERNAL_API_TYPE
(
hipfftResult_t
,
HIPFFT_SUCCESS
);
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
DEFINE_EXTERNAL_API_TYPE
(
ncclResult_t
,
ncclSuccess
);
...
...
paddle/fluid/platform/enforce_test.cc
浏览文件 @
1d5746bd
...
...
@@ -331,6 +331,10 @@ TEST(enforce, hip_success) {
CheckCudaStatusFailure
(
rocblas_status_invalid_handle
,
"Rocblas error"
));
EXPECT_TRUE
(
CheckCudaStatusFailure
(
rocblas_status_invalid_value
,
"Rocblas error"
));
EXPECT_TRUE
(
CheckCudaStatusSuccess
(
HIPFFT_SUCCESS
));
EXPECT_TRUE
(
CheckCudaStatusFailure
(
HIPFFT_INVALID_PLAN
,
"HIPFFT error"
));
EXPECT_TRUE
(
CheckCudaStatusFailure
(
HIPFFT_ALLOC_FAILED
,
"HIPFFT error"
));
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
EXPECT_TRUE
(
CheckCudaStatusSuccess
(
ncclSuccess
));
EXPECT_TRUE
(
CheckCudaStatusFailure
(
ncclUnhandledCudaError
,
"Rccl error"
));
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录