Paddle, commit 1d5746bd (unverified)

Authored by Xiaoxu Chen on Oct 19, 2021; committed by GitHub on Oct 19, 2021.

add rocm support for fft api (#36415)

Parent: 77f4597f
Showing 10 changed files with 679 additions and 380 deletions (+679, -380):
paddle/fluid/operators/CMakeLists.txt (+1, -2)
paddle/fluid/operators/spectral_helper.h (+261, -0)
paddle/fluid/operators/spectral_op.cu (+237, -377)
paddle/fluid/platform/dynload/CMakeLists.txt (+1, -1)
paddle/fluid/platform/dynload/dynamic_loader.cc (+10, -0)
paddle/fluid/platform/dynload/dynamic_loader.h (+1, -0)
paddle/fluid/platform/dynload/hipfft.cc (+30, -0)
paddle/fluid/platform/dynload/hipfft.h (+124, -0)
paddle/fluid/platform/enforce.h (+10, -0)
paddle/fluid/platform/enforce_test.cc (+4, -0)
paddle/fluid/operators/CMakeLists.txt
@@ -102,8 +102,7 @@ else()
    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()

if (WITH_GPU AND (NOT WITH_ROCM))
if (WITH_GPU OR WITH_ROCM)
  if (MKL_FOUND AND WITH_ONEMKL)
    op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS})
    target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
paddle/fluid/operators/spectral_helper.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include "paddle/fluid/operators/spectral_op.h"

#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#endif

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cufft.h"
#endif

namespace paddle {
namespace operators {

using ScalarType = framework::proto::VarType::Type;
const int64_t kMaxCUFFTNdim = 3;
const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1;

// This struct is used to easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
struct PlanKey {
  // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3
  int64_t signal_ndim_;
  // These include additional batch dimension as well.
  int64_t sizes_[kMaxDataNdim];
  int64_t input_shape_[kMaxDataNdim];
  int64_t output_shape_[kMaxDataNdim];
  FFTTransformType fft_type_;
  ScalarType value_type_;

  PlanKey() = default;

  PlanKey(const std::vector<int64_t>& in_shape,
          const std::vector<int64_t>& out_shape,
          const std::vector<int64_t>& signal_size, FFTTransformType fft_type,
          ScalarType value_type) {
    // Padding bits must be zeroed for hashing
    memset(this, 0, sizeof(*this));
    signal_ndim_ = signal_size.size() - 1;
    fft_type_ = fft_type;
    value_type_ = value_type;

    std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
    std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
    std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
  }
};

#if defined(PADDLE_WITH_CUDA)
// An RAII encapsulation of cuFFTHandle
class CuFFTHandle {
  ::cufftHandle handle_;

 public:
  CuFFTHandle() {
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_));
  }

  ::cufftHandle& get() { return handle_; }
  const ::cufftHandle& get() const { return handle_; }

  ~CuFFTHandle() {
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftDestroy(handle_));
  }
};

using plan_size_type = long long int;  // NOLINT
// This class contains all the information needed to execute a cuFFT plan:
//   1. the plan
//   2. the workspace size needed
class CuFFTConfig {
 public:
  // Only move semantics is enought for this class. Although we already use
  // unique_ptr for the plan, still remove copy constructor and assignment op so
  // we don't accidentally copy and take perf hit.
  explicit CuFFTConfig(const PlanKey& plan_key)
      : CuFFTConfig(
            std::vector<int64_t>(plan_key.sizes_,
                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}

  // sizes are full signal, including batch size and always two-sided
  CuFFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
              FFTTransformType fft_type, ScalarType dtype)
      : fft_type_(fft_type), value_type_(dtype) {
    // signal sizes (excluding batch dim)
    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());

    // input batch size
    const auto batch = static_cast<plan_size_type>(sizes[0]);
    // const int64_t signal_ndim = sizes.size() - 1;
    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
                      platform::errors::InvalidArgument(
                          "The signal_ndim must be equal to sizes.size() - 1,"
                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
                          signal_ndim, sizes.size() - 1));

    cudaDataType itype, otype, exec_type;
    const auto complex_input = has_complex_input(fft_type);
    const auto complex_output = has_complex_output(fft_type);
    if (dtype == framework::proto::VarType::FP32) {
      itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
      otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
      exec_type = CUDA_C_32F;
    } else if (dtype == framework::proto::VarType::FP64) {
      itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
      otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
      exec_type = CUDA_C_64F;
    } else if (dtype == framework::proto::VarType::FP16) {
      itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
      otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
      exec_type = CUDA_C_16F;
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "cuFFT only support transforms of type float16, float32 and "
          "float64"));
    }

    // disable auto allocation of workspace to use allocator from the framework
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetAutoAllocation(
        plan(), /* autoAllocate */ 0));

    size_t ws_size_t;

    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtMakePlanMany(
        plan(), signal_ndim, signal_sizes.data(),
        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
        batch, &ws_size_t, exec_type));

    ws_size = ws_size_t;
  }

  const cufftHandle& plan() const { return plan_ptr.get(); }

  FFTTransformType transform_type() const { return fft_type_; }
  ScalarType data_type() const { return value_type_; }
  size_t workspace_size() const { return ws_size; }

 private:
  CuFFTHandle plan_ptr;
  size_t ws_size;
  FFTTransformType fft_type_;
  ScalarType value_type_;
};

#elif defined(PADDLE_WITH_HIP)
// An RAII encapsulation of cuFFTHandle
class HIPFFTHandle {
  ::hipfftHandle handle_;

 public:
  HIPFFTHandle() {
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_));
  }

  ::hipfftHandle& get() { return handle_; }
  const ::hipfftHandle& get() const { return handle_; }

  ~HIPFFTHandle() {
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftDestroy(handle_));
  }
};

using plan_size_type = int;
// This class contains all the information needed to execute a cuFFT plan:
//   1. the plan
//   2. the workspace size needed
class HIPFFTConfig {
 public:
  // Only move semantics is enought for this class. Although we already use
  // unique_ptr for the plan, still remove copy constructor and assignment op so
  // we don't accidentally copy and take perf hit.
  explicit HIPFFTConfig(const PlanKey& plan_key)
      : HIPFFTConfig(
            std::vector<int64_t>(plan_key.sizes_,
                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}

  // sizes are full signal, including batch size and always two-sided
  HIPFFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
               FFTTransformType fft_type, ScalarType dtype)
      : fft_type_(fft_type), value_type_(dtype) {
    // signal sizes (excluding batch dim)
    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());

    // input batch size
    const auto batch = static_cast<plan_size_type>(sizes[0]);
    // const int64_t signal_ndim = sizes.size() - 1;
    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
                      platform::errors::InvalidArgument(
                          "The signal_ndim must be equal to sizes.size() - 1,"
                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
                          signal_ndim, sizes.size() - 1));

    hipfftType exec_type = [&] {
      if (dtype == framework::proto::VarType::FP32) {
        switch (fft_type) {
          case FFTTransformType::C2C:
            return HIPFFT_C2C;
          case FFTTransformType::R2C:
            return HIPFFT_R2C;
          case FFTTransformType::C2R:
            return HIPFFT_C2R;
        }
      } else if (dtype == framework::proto::VarType::FP64) {
        switch (fft_type) {
          case FFTTransformType::C2C:
            return HIPFFT_Z2Z;
          case FFTTransformType::R2C:
            return HIPFFT_D2Z;
          case FFTTransformType::C2R:
            return HIPFFT_Z2D;
        }
      }
      PADDLE_THROW(platform::errors::InvalidArgument(
          "hipFFT only support transforms of type float32 and float64"));
    }();

    // disable auto allocation of workspace to use allocator from the framework
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetAutoAllocation(
        plan(), /* autoAllocate */ 0));

    size_t ws_size_t;

    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftMakePlanMany(
        plan(), signal_ndim, signal_sizes.data(),
        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
        batch, &ws_size_t));

    ws_size = ws_size_t;
  }

  const hipfftHandle& plan() const { return plan_ptr.get(); }

  FFTTransformType transform_type() const { return fft_type_; }
  ScalarType data_type() const { return value_type_; }
  size_t workspace_size() const { return ws_size; }

 private:
  HIPFFTHandle plan_ptr;
  size_t ws_size;
  FFTTransformType fft_type_;
  ScalarType value_type_;
};
#endif

}  // namespace operators
}  // namespace paddle
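For orientation, a minimal usage sketch (not part of the commit): PlanKey and the two config classes expect sizes with the batch dimension in slot 0 followed by the signal dimensions, so a key for a batch of 8 one-dimensional complex-to-complex FP32 transforms of length 1024 could be built roughly like this; the concrete shapes below are illustrative only.

// Hypothetical sketch: build a cache key and a plan config for a batch of 8
// C2C 1-D transforms of length 1024, using the types declared in this header.
std::vector<int64_t> in_shape{8, 1024};     // input tensor shape
std::vector<int64_t> out_shape{8, 1024};    // output tensor shape
std::vector<int64_t> signal_size{8, 1024};  // batch size first, then signal dims

PlanKey key(in_shape, out_shape, signal_size, FFTTransformType::C2C,
            framework::proto::VarType::FP32);
#if defined(PADDLE_WITH_CUDA)
CuFFTConfig config(key);   // creates the cuFFT plan and records its workspace size
#elif defined(PADDLE_WITH_HIP)
HIPFFTConfig config(key);  // creates the hipFFT plan and records its workspace size
#endif
size_t ws = config.workspace_size();  // caller allocates this much scratch memory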
paddle/fluid/operators/spectral_op.cu
@@ -8,10 +8,6 @@
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

#include <cufft.h>
#include <cufftXt.h>
#include <functional>
#include <list>
#include <memory>

@@ -24,311 +20,246 @@
#include <vector>

#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/spectral_helper.h"
#include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/dynload/cufft.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace operators {

namespace {

using ScalarType = framework::proto::VarType::Type;
const int64_t kMaxCUFFTNdim = 3;
const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1;

static inline std::string get_cufft_error_info(cufftResult error) {
  switch (error) {
    case CUFFT_SUCCESS:
      return "CUFFT_SUCCESS";
    case CUFFT_INVALID_PLAN:
      return "CUFFT_INVALID_PLAN";
    case CUFFT_ALLOC_FAILED:
      return "CUFFT_ALLOC_FAILED";
    case CUFFT_INVALID_TYPE:
      return "CUFFT_INVALID_TYPE";
    case CUFFT_INVALID_VALUE:
      return "CUFFT_INVALID_VALUE";
    case CUFFT_INTERNAL_ERROR:
      return "CUFFT_INTERNAL_ERROR";
    case CUFFT_EXEC_FAILED:
      return "CUFFT_EXEC_FAILED";
    case CUFFT_SETUP_FAILED:
      return "CUFFT_SETUP_FAILED";
    case CUFFT_INVALID_SIZE:
      return "CUFFT_INVALID_SIZE";
    case CUFFT_UNALIGNED_DATA:
      return "CUFFT_UNALIGNED_DATA";
    case CUFFT_INCOMPLETE_PARAMETER_LIST:
      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
    case CUFFT_INVALID_DEVICE:
      return "CUFFT_INVALID_DEVICE";
    case CUFFT_PARSE_ERROR:
      return "CUFFT_PARSE_ERROR";
    case CUFFT_NO_WORKSPACE:
      return "CUFFT_NO_WORKSPACE";
    case CUFFT_NOT_IMPLEMENTED:
      return "CUFFT_NOT_IMPLEMENTED";
#ifndef __HIPCC__
    case CUFFT_LICENSE_ERROR:
      return "CUFFT_LICENSE_ERROR";
#endif
    case CUFFT_NOT_SUPPORTED:
      return "CUFFT_NOT_SUPPORTED";
    default:
      std::ostringstream ss;
      ss << "unknown error " << error;
      return ss.str();
// Calculates the normalization constant
double fft_normalization_scale(FFTNormMode normalization,
                               const std::vector<int64_t>& sizes,
                               const std::vector<int64_t>& dims) {
  // auto norm = static_cast<fft_norm_mode>(normalization);
  if (normalization == FFTNormMode::none) {
    return static_cast<double>(1.0);
  }
}

static inline void CUFFT_CHECK(cufftResult error) {
  PADDLE_ENFORCE_CUDA_SUCCESS(error);
  int64_t signal_numel = 1;
  for (auto dim : dims) {
    signal_numel *= sizes[dim];
  }
  const double scale_denom = (normalization == FFTNormMode::by_sqrt_n)
                                 ? std::sqrt(signal_numel)
                                 : static_cast<double>(signal_numel);
  return static_cast<double>(1.0 / scale_denom);
}

// This struct is used to easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
struct PlanKey {
  // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3
  int64_t signal_ndim_;
  // These include additional batch dimension as well.
  int64_t sizes_[kMaxDataNdim];
  int64_t input_shape_[kMaxDataNdim];
  int64_t output_shape_[kMaxDataNdim];
  FFTTransformType fft_type_;
  ScalarType value_type_;

  PlanKey() = default;

  PlanKey(const std::vector<int64_t>& in_shape,
          const std::vector<int64_t>& out_shape,
          const std::vector<int64_t>& signal_size, FFTTransformType fft_type,
          ScalarType value_type) {
    // Padding bits must be zeroed for hashing
    memset(this, 0, sizeof(*this));
    signal_ndim_ = signal_size.size() - 1;
    fft_type_ = fft_type;
    value_type_ = value_type;
    std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
    std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
    std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
template <typename DeviceContext, typename T>
void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
                        FFTNormMode normalization,
                        const std::vector<int64_t>& sizes,
                        const std::vector<int64_t>& axes) {
  double scale = fft_normalization_scale(normalization, sizes, axes);
  if (scale != 1.0) {
    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
    auto dev = ctx.eigen_device();
    EigenScale<Eigen::GpuDevice, T>::Eval(*dev, eigen_out, eigen_in,
                                          static_cast<T>(scale),
                                          static_cast<T>(0), false);
  } else {
    framework::TensorCopy(*in, ctx.GetPlace(), out);
  }
};

// An RAII encapsulation of cuFFTHandle
class CuFFTHandle {
  ::cufftHandle handle_;

 public:
  CuFFTHandle() {
    CUFFT_CHECK(platform::dynload::cufftCreate(&handle_));
  }
}

  ::cufftHandle& get() { return handle_; }
  const ::cufftHandle& get() const { return handle_; }

#if defined(PADDLE_WITH_CUDA)
CuFFTConfig create_cufft_config(const framework::Tensor& input,
                                const framework::Tensor& output,
                                int signal_ndim) {
  // Create the transform plan (either from cache or locally)
  const auto value_type = framework::IsComplexType(input.type())
                              ? framework::ToRealType(input.type())
                              : input.type();
  auto fft_type = GetFFTTransformType(input.type(), output.type());
  // signal sizes
  std::vector<int64_t> signal_size(signal_ndim + 1);

  ~CuFFTHandle() {
// Not using fftDestroy() for rocFFT to work around double freeing of handles
#ifndef __HIPCC__
    CUFFT_CHECK(platform::dynload::cufftDestroy(handle_));
#endif
  signal_size[0] = input.dims()[0];
  for (int64_t i = 1; i <= signal_ndim; ++i) {
    auto in_size = input.dims()[i];
    auto out_size = output.dims()[i];
    signal_size[i] = std::max(in_size, out_size);
  }
};

  PlanKey key(framework::vectorize(input.dims()),
              framework::vectorize(output.dims()), signal_size, fft_type,
              value_type);
#ifdef __HIPCC__
using plan_size_type = int;
#else
using plan_size_type = long long int;  // NOLINT
#endif
  return CuFFTConfig(key);
}

// This class contains all the information needed to execute a cuFFT plan:
//   1. the plan
//   2. the workspace size needed
class CuFFTConfig {
 public:
  // Only move semantics is enought for this class. Although we already use
  // unique_ptr for the plan, still remove copy constructor and assignment op so
  // we don't accidentally copy and take perf hit.
  CuFFTConfig(const CuFFTConfig&) = delete;
  CuFFTConfig& operator=(CuFFTConfig const&) = delete;

  explicit CuFFTConfig(const PlanKey& plan_key)
      : CuFFTConfig(
            std::vector<int64_t>(plan_key.sizes_,
                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}

  // sizes are full signal, including batch size and always two-sided
  CuFFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
              FFTTransformType fft_type, ScalarType dtype)
      : fft_type_(fft_type), value_type_(dtype) {
    // signal sizes (excluding batch dim)
    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());

    // input batch size
    const auto batch = static_cast<plan_size_type>(sizes[0]);
    // const int64_t signal_ndim = sizes.size() - 1;
    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
                      platform::errors::InvalidArgument(
                          "The signal_ndim must be equal to sizes.size() - 1,"
                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
                          signal_ndim, sizes.size() - 1));

#ifdef __HIPCC__
    hipfftType exec_type = [&] {
      if (dtype == framework::proto::VarType::FP32) {
        switch (fft_type) {
          case FFTTransformType::C2C:
            return HIPFFT_C2C;
          case FFTTransformType::R2C:
            return HIPFFT_R2C;
          case FFTTransformType::C2R:
            return HIPFFT_C2R;
        }
      } else if (dtype == framework::proto::VarType::FP64) {
        switch (fft_type) {
          case FFTTransformType::C2C:
            return HIPFFT_Z2Z;
          case FFTTransformType::R2C:
            return HIPFFT_D2Z;
          case FFTTransformType::C2R:
            return HIPFFT_Z2D;
        }
      }
      PADDLE_THROW(platform::errors::InvalidArgument(
          "hipFFT only support transforms of type float32 and float64"));
    }();
#else
    cudaDataType itype, otype, exec_type;
    const auto complex_input = has_complex_input(fft_type);
    const auto complex_output = has_complex_output(fft_type);
    if (dtype == framework::proto::VarType::FP32) {
      itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
      otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
      exec_type = CUDA_C_32F;
    } else if (dtype == framework::proto::VarType::FP64) {
      itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
      otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
      exec_type = CUDA_C_64F;
    } else if (dtype == framework::proto::VarType::FP16) {
      itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
      otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
      exec_type = CUDA_C_16F;
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "cuFFT only support transforms of type float16, float32 and "
          "float64"));
    }
#endif

// Execute a pre-planned transform
static void exec_cufft_plan_raw(const CuFFTConfig& config, void* in_data,
                                void* out_data, bool forward) {
  auto& plan = config.plan();
    // disable auto allocation of workspace to use allocator from the framework
    CUFFT_CHECK(platform::dynload::cufftSetAutoAllocation(
        plan(), /* autoAllocate */ 0));

    size_t ws_size_t;

// make plan
#ifdef __HIPCC__
    CUFFT_CHECK(hipfftMakePlanMany(
        plan(), signal_ndim, signal_sizes.data(),
        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
        batch, &ws_size_t));
#else
    CUFFT_CHECK(platform::dynload::cufftXtMakePlanMany(
        plan(), signal_ndim, signal_sizes.data(),
        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
        batch, &ws_size_t, exec_type));
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtExec(
      plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
}
    ws_size = ws_size_t;

template <typename DeviceContext, typename Ti, typename To>
void exec_cufft_plan(const DeviceContext& ctx, const CuFFTConfig& config,
                     framework::Tensor* input, framework::Tensor* output,
                     bool forward) {
  // execute transform plan
  auto fft_type = config.transform_type();
  if (fft_type == FFTTransformType::C2R && forward) {
    forward = false;
    framework::Tensor input_conj(input->type());
    input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
    platform::ForRange<DeviceContext> for_range(ctx, input->numel());
    math::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
                                  input_conj.data<Ti>());
    for_range(functor);
    exec_cufft_plan_raw(config, input_conj.data<void>(), output->data<void>(),
                        forward);
  } else if (fft_type == FFTTransformType::R2C && !forward) {
    forward = true;
    framework::Tensor out_conj(output->type());
    out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
    exec_cufft_plan_raw(config, input->data<void>(), out_conj.data<void>(),
                        forward);
    platform::ForRange<DeviceContext> for_range(ctx, output->numel());
    math::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
                                  output->data<To>());
    for_range(functor);
  } else {
    exec_cufft_plan_raw(config, input->data<void>(), output->data<void>(),
                        forward);
  }
}

  const cufftHandle& plan() const { return plan_ptr.get(); }
#elif defined(PADDLE_WITH_HIP)
  FFTTransformType transform_type() const { return fft_type_; }
  ScalarType data_type() const { return value_type_; }
  size_t workspace_size() const { return ws_size; }

HIPFFTConfig create_hipfft_config(const framework::Tensor& input,
                                  const framework::Tensor& output,
                                  int signal_ndim) {
  // Create the transform plan (either from cache or locally)
  const auto value_type = framework::IsComplexType(input.type())
                              ? framework::ToRealType(input.type())
                              : input.type();
  auto fft_type = GetFFTTransformType(input.type(), output.type());
  // signal sizes
  std::vector<int64_t> signal_size(signal_ndim + 1);

 private:
  CuFFTHandle plan_ptr;
  size_t ws_size;
  FFTTransformType fft_type_;
  ScalarType value_type_;
};

  signal_size[0] = input.dims()[0];
  for (int64_t i = 1; i <= signal_ndim; ++i) {
    auto in_size = input.dims()[i];
    auto out_size = output.dims()[i];
    signal_size[i] = std::max(in_size, out_size);
  }
  PlanKey key(framework::vectorize(input.dims()),
              framework::vectorize(output.dims()), signal_size, fft_type,
              value_type);
  return HIPFFTConfig(key);
}

// Execute a pre-planned transform
static void exec_cufft_plan(const CuFFTConfig& config, void* in_data,
                            void* out_data, bool forward) {
static void exec_hipfft_plan_raw(const HIPFFTConfig& config, void* in_data,
                                 void* out_data, bool forward) {
  auto& plan = config.plan();
#ifdef __HIPCC__
  auto value_type = config.data_type();
  if (value_type == framework::proto::VarType::FP32) {
    switch (config.transform_type()) {
      case FFTTransformType::C2C: {
        CUFFT_CHECK(hipfftExecC2C(plan, static_cast<hipfftComplex*>(in_data),
                                  static_cast<hipfftComplex*>(out_data),
                                  forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2C(
            plan, static_cast<hipfftComplex*>(in_data),
            static_cast<hipfftComplex*>(out_data),
            forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
        return;
      }
      case FFTTransformType::R2C: {
        CUFFT_CHECK(hipfftExecR2C(plan, static_cast<hipfftReal*>(in_data),
                                  static_cast<hipfftComplex*>(out_data)));
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecR2C(
            plan, static_cast<hipfftReal*>(in_data),
            static_cast<hipfftComplex*>(out_data)));
        return;
      }
      case FFTTransformType::C2R: {
        CUFFT_CHECK(hipfftExecC2R(plan, static_cast<hipfftComplex*>(in_data),
                                  static_cast<hipfftReal*>(out_data)));
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2R(
            plan, static_cast<hipfftComplex*>(in_data),
            static_cast<hipfftReal*>(out_data)));
        return;
      }
    }
  } else if (value_type == framework::proto::VarType::FP64) {
    switch (config.transform_type()) {
      case FFTTransformType::C2C: {
        CUFFT_CHECK(hipfftExecZ2Z(plan,
                                  static_cast<hipfftDoubleComplex*>(in_data),
                                  static_cast<hipfftDoubleComplex*>(out_data),
                                  forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2Z(
            plan, static_cast<hipfftDoubleComplex*>(in_data),
            static_cast<hipfftDoubleComplex*>(out_data),
            forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
        return;
      }
      case FFTTransformType::R2C: {
        CUFFT_CHECK(hipfftExecD2Z(plan, static_cast<hipfftDoubleReal*>(in_data),
                                  static_cast<hipfftDoubleComplex*>(out_data)));
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecD2Z(
            plan, static_cast<hipfftDoubleReal*>(in_data),
            static_cast<hipfftDoubleComplex*>(out_data)));
        return;
      }
      case FFTTransformType::C2R: {
        CUFFT_CHECK(hipfftExecZ2D(plan,
                                  static_cast<hipfftDoubleComplex*>(in_data),
                                  static_cast<hipfftDoubleReal*>(out_data)));
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2D(
            plan, static_cast<hipfftDoubleComplex*>(in_data),
            static_cast<hipfftDoubleReal*>(out_data)));
        return;
      }
    }
  }
  PADDLE_THROW(platform::errors::InvalidArgument(
      "hipFFT only support transforms of type float32 and float64"));
#else
  CUFFT_CHECK(platform::dynload::cufftXtExec(
      plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
#endif
}

template <typename DeviceContext, typename Ti, typename To>
void exec_hipfft_plan(const DeviceContext& ctx, const HIPFFTConfig& config,
                      framework::Tensor* input, framework::Tensor* output,
                      bool forward) {
  auto fft_type = config.transform_type();
  if (fft_type == FFTTransformType::C2R && forward) {
    forward = false;
    framework::Tensor input_conj(input->type());
    input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
    platform::ForRange<DeviceContext> for_range(ctx, input->numel());
    math::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
                                  input_conj.data<Ti>());
    for_range(functor);
    exec_hipfft_plan_raw(config, input_conj.data<void>(), output->data<void>(),
                         forward);
  } else if (fft_type == FFTTransformType::R2C && !forward) {
    forward = true;
    framework::Tensor out_conj(output->type());
    out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
    exec_hipfft_plan_raw(config, input->data<void>(), out_conj.data<void>(),
                         forward);
    platform::ForRange<DeviceContext> for_range(ctx, output->numel());
    math::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
                                  output->data<To>());
    for_range(functor);
  } else {
    exec_hipfft_plan_raw(config, input->data<void>(), output->data<void>(),
                         forward);
  }
}
#endif

// Execute a general unnormalized fft operation (can be c2c, onesided r2c or
// onesided c2r)
template <typename DeviceContext, typename Ti, typename To>
void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
              const std::vector<int64_t>& dim, bool forward) {
  const auto x_dims = framework::vectorize(X->dims());
  const auto out_dims = framework::vectorize(out->dims());
  const int64_t ndim = static_cast<int64_t>(X->dims().size());
  const int64_t signal_ndim = static_cast<int64_t>(dim.size());
  const int64_t batch_dims = ndim - signal_ndim;
  auto tensor_place = ctx.GetPlace();

  // Transpose batch dimensions first, then with transforming dims
  // make a dim permutation
  std::vector<int> dim_permute(ndim);
  std::vector<int> reverse_dim_permute(ndim);
  std::vector<int64_t> trans_dims(ndim);
  std::iota(dim_permute.begin(), dim_permute.end(), int{0});
  std::vector<bool> is_transformed_dim(ndim);
  for (const auto& d : dim) {

@@ -340,160 +271,89 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
  std::sort(dim_permute.begin(), batch_end);
  std::copy(dim.cbegin(), dim.cend(), batch_end);

  for (size_t i = 0; i < ndim; i++) {
    trans_dims[i] = x_dims[dim_permute[i]];  // shape of input transpose
    reverse_dim_permute[dim_permute[i]] =
        static_cast<int>(i);  // reverse of dim permute
  }
  framework::Tensor input;
  input.Resize(framework::make_ddim(trans_dims));
  input.mutable_data<Ti>(tensor_place);
  /*
  auto in_ret = TransposeSimple<Ti>::run(ctx, *X, dim_permute, input);
  if (!in_ret) {
    TransCompute<DeviceContext, Ti>(ndim, ctx, *X, input, dim_permute);
  }
  */
  TransCompute<DeviceContext, Ti>(ndim, ctx, *X, &input, dim_permute);

  // transpose input according to dim permutation
  auto transposed_input_shape = X->dims().transpose(dim_permute);
  framework::Tensor transposed_input;
  transposed_input.Resize(transposed_input_shape);
  transposed_input.mutable_data<Ti>(tensor_place);
  TransCompute<DeviceContext, Ti>(ndim, ctx, *X, &transposed_input,
                                  dim_permute);

  // Reshape batch dimensions into a single dimension
  std::vector<int64_t> batched_sizes(signal_ndim + 1);
  const int64_t signal_ndim = static_cast<int64_t>(dim.size());
  std::vector<int64_t> collapsed_input_shape(signal_ndim + 1);

  auto transposed_input_shape_ = framework::vectorize(transposed_input_shape);
  const int64_t batch_dims = ndim - signal_ndim;
  auto batch_size = std::accumulate(trans_dims.begin(),
                                    trans_dims.begin() + batch_dims,
      std::accumulate(transposed_input_shape_.begin(),
                      transposed_input_shape_.begin() + batch_dims,
                      static_cast<int>(1), std::multiplies<int>());
  batched_sizes[0] = batch_size;
  std::copy(trans_dims.begin() + batch_dims, trans_dims.end(),
            batched_sizes.begin() + 1);
  input.Resize(framework::make_ddim(batched_sizes));
  collapsed_input_shape[0] = batch_size;

  // Check the shape of transforming dims with input and output
  std::vector<int64_t> signal_size(signal_ndim + 1);
  signal_size[0] = batch_size;
  for (int64_t i = 0; i < signal_ndim; ++i) {
    auto in_size = input.dims()[i + 1];
    auto out_size = out_dims[dim[i]];
    signal_size[i + 1] = std::max(in_size, out_size);
    PADDLE_ENFORCE_EQ(
        (in_size == signal_size[i + 1] ||
         in_size == (signal_size[i + 1] / 2) + 1),
        true,
        platform::errors::InvalidArgument(
            "The dimension[%d] of Input size: [%d] must be equal or half to "
            "The dimension[%d] of Output size: [%d]",
            dim[i], in_size, dim[i], out_size));
    PADDLE_ENFORCE_EQ(
        (out_size == signal_size[i + 1] ||
         out_size == (signal_size[i + 1] / 2) + 1),
        true,
        platform::errors::InvalidArgument(
            "The dimension[%d] of Output size: [%d] must be equal or half to "
            "The dimension[%d] of Input size: [%d]",
            dim[i], out_size, dim[i], in_size));
  }

  std::copy(transposed_input_shape_.begin() + batch_dims,
            transposed_input_shape_.end(), collapsed_input_shape.begin() + 1);

  std::vector<int64_t> reshape_out_sizes(ndim);
  for (size_t i = 0; i < ndim; ++i) {
    reshape_out_sizes[i] = out_dims[dim_permute[i]];
  }
  std::vector<int64_t> batched_out_sizes(batched_sizes.begin(),
                                         batched_sizes.end());
  framework::Tensor& collapsed_input = transposed_input;
  collapsed_input.Resize(framework::make_ddim(collapsed_input_shape));

  // make a collpased output
  const auto out_dims = framework::vectorize(out->dims());
  std::vector<int64_t> collapsed_output_shape(1 + signal_ndim);
  collapsed_output_shape[0] = batch_size;
  for (size_t i = 0; i < dim.size(); ++i) {
    batched_out_sizes[i + 1] = out_dims[dim[i]];
    collapsed_output_shape[i + 1] = out_dims[dim[i]];
  }

  // output
  framework::Tensor output;
  output.Resize(framework::make_ddim(batched_out_sizes));
  output.mutable_data<To>(tensor_place);

  // Create the transform plan (either from cache or locally)
  const auto value_type = framework::IsComplexType(input.type())
                              ? framework::ToRealType(input.type())
                              : input.type();
  auto fft_type = GetFFTTransformType(input.type(), output.type());
  PlanKey Key(framework::vectorize(input.dims()),
              framework::vectorize(output.dims()), signal_size, fft_type,
              value_type);
  CuFFTConfig uncached_plan(Key);
  CuFFTConfig* config = &uncached_plan;
  auto& plan = config->plan();
  framework::Tensor collapsed_output;
  collapsed_output.Resize(framework::make_ddim(collapsed_output_shape));
  collapsed_output.mutable_data<To>(tensor_place);

#if defined(PADDLE_WITH_CUDA)
  // create plan
  CuFFTConfig config =
      create_cufft_config(collapsed_input, collapsed_output, signal_ndim);
  // prepare cufft for execution
  CUFFT_CHECK(platform::dynload::cufftSetStream(plan, ctx.stream()));
  PADDLE_ENFORCE_CUDA_SUCCESS(
      platform::dynload::cufftSetStream(config.plan(), ctx.stream()));
  framework::Tensor workspace_tensor;
  workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
  CUFFT_CHECK(platform::dynload::cufftSetWorkArea(
      plan, workspace_tensor.data<To>()));
  workspace_tensor.mutable_data<To>(tensor_place, config.workspace_size());
  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetWorkArea(
      config.plan(), workspace_tensor.data<To>()));
  // execute transform plan
  exec_cufft_plan<DeviceContext, Ti, To>(ctx, config, &collapsed_input,
                                         &collapsed_output, forward);
#elif defined(PADDLE_WITH_HIP)
  // create plan
  HIPFFTConfig config =
      create_hipfft_config(collapsed_input, collapsed_output, signal_ndim);
  // prepare cufft for execution
  PADDLE_ENFORCE_CUDA_SUCCESS(
      platform::dynload::hipfftSetStream(config.plan(), ctx.stream()));
  framework::Tensor workspace_tensor;
  workspace_tensor.mutable_data<To>(tensor_place, config.workspace_size());
  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetWorkArea(
      config.plan(), workspace_tensor.data<To>()));
  // execute transform plan
  if (fft_type == FFTTransformType::C2R && forward) {
    forward = false;
    framework::Tensor input_conj(input.type());
    input_conj.mutable_data<Ti>(input.dims(), ctx.GetPlace());
    platform::ForRange<DeviceContext> for_range(ctx, input.numel());
    math::ConjFunctor<Ti> functor(input.data<Ti>(), input.numel(),
                                  input_conj.data<Ti>());
    for_range(functor);
    exec_cufft_plan(*config, input_conj.data<void>(), output.data<void>(),
                    forward);
  } else if (fft_type == FFTTransformType::R2C && !forward) {
    forward = true;
    framework::Tensor out_conj(output.type());
    out_conj.mutable_data<To>(output.dims(), ctx.GetPlace());
    exec_cufft_plan(*config, input.data<void>(), out_conj.data<void>(),
                    forward);
    platform::ForRange<DeviceContext> for_range(ctx, output.numel());
    math::ConjFunctor<To> functor(out_conj.data<To>(), output.numel(),
                                  output.data<To>());
    for_range(functor);
  } else {
    exec_cufft_plan(*config, input.data<void>(), output.data<void>(), forward);
  }
  exec_hipfft_plan<DeviceContext, Ti, To>(ctx, config, &collapsed_input,
                                          &collapsed_output, forward);
#endif

  // Inverting output by reshape and transpose to original batch and dimension
  output.Resize(framework::make_ddim(reshape_out_sizes));
  out->Resize(framework::make_ddim(out_dims));
  TransCompute<DeviceContext, To>(ndim, ctx, output, out, reverse_dim_permute);
}
  auto transposed_out_shape = out->dims().transpose(dim_permute);

// Calculates the normalization constant
double fft_normalization_scale(FFTNormMode normalization,
                               const std::vector<int64_t>& sizes,
                               const std::vector<int64_t>& dims) {
  // auto norm = static_cast<fft_norm_mode>(normalization);
  if (normalization == FFTNormMode::none) {
    return static_cast<double>(1.0);
  }
  collapsed_output.Resize(transposed_out_shape);
  auto& transposed_output = collapsed_output;

  int64_t signal_numel = 1;
  for (auto dim : dims) {
    signal_numel *= sizes[dim];
  std::vector<int> reverse_dim_permute(ndim);
  for (size_t i = 0; i < ndim; i++) {
    reverse_dim_permute[dim_permute[i]] = i;
  }
  const double scale_denom = (normalization == FFTNormMode::by_sqrt_n)
                                 ? std::sqrt(signal_numel)
                                 : static_cast<double>(signal_numel);
  return static_cast<double>(1.0 / scale_denom);
}

template <typename DeviceContext, typename T>
void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
                        FFTNormMode normalization,
                        const std::vector<int64_t>& sizes,
                        const std::vector<int64_t>& axes) {
  double scale = fft_normalization_scale(normalization, sizes, axes);
  if (scale != 1.0) {
    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
    auto dev = ctx.eigen_device();
    EigenScale<Eigen::GpuDevice, T>::Eval(*dev, eigen_out, eigen_in,
                                          static_cast<T>(scale),
                                          static_cast<T>(0), false);
  } else {
    framework::TensorCopy(*in, ctx.GetPlace(), out);
  }
  TransCompute<DeviceContext, To>(ndim, ctx, transposed_output, out,
                                  reverse_dim_permute);
}

}  // anonymous namespace

// Use the optimized path to perform single R2C or C2R if transformation dim is
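The dimension bookkeeping in exec_fft above is the least obvious part of the kernel: transformed axes are moved to the back, the remaining batch axes stay in front and are later collapsed into one leading dimension, and the inverse permutation restores the original layout after the transform. A minimal standalone sketch of just that index arithmetic (illustrative only; the names mirror the code above, and the use of std::partition for the batch/transform split is an assumption about the collapsed portion of the hunk):

// Illustrative sketch, not part of the commit. For a 4-D input transformed
// over axes {1, 3}:
//   dim_permute         = {0, 2, 1, 3}   batch axes first, transformed axes last
//   reverse_dim_permute = {0, 2, 1, 3}   inverse permutation applied after the FFT
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> make_dim_permute(int ndim, const std::vector<int64_t>& dim) {
  std::vector<bool> is_transformed_dim(ndim, false);
  for (const auto& d : dim) is_transformed_dim[d] = true;
  std::vector<int> dim_permute(ndim);
  std::iota(dim_permute.begin(), dim_permute.end(), 0);
  // move the non-transformed (batch) axes to the front, keeping them sorted
  auto batch_end = std::partition(dim_permute.begin(), dim_permute.end(),
                                  [&](int a) { return !is_transformed_dim[a]; });
  std::sort(dim_permute.begin(), batch_end);
  std::copy(dim.cbegin(), dim.cend(), batch_end);
  return dim_permute;
}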
paddle/fluid/platform/dynload/CMakeLists.txt
@@ -7,7 +7,7 @@ if (NOT WITH_NV_JETSON)
endif()

if (WITH_ROCM)
  list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
  list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
endif()

# There is no macOS version of NCCL.
paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -356,6 +356,16 @@ void* GetCurandDsoHandle() {
#endif
}

#ifdef PADDLE_WITH_HIP
void* GetROCFFTDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
#else
  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so");
#endif
}
#endif

void* GetNvjpegDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
paddle/fluid/platform/dynload/dynamic_loader.h
@@ -44,6 +44,7 @@ void* GetOpDsoHandle(const std::string& dso_name);
void* GetNvtxDsoHandle();
void* GetCUFFTDsoHandle();
void* GetMKLRTDsoHandle();
void* GetROCFFTDsoHandle();

void SetPaddleLibPath(const std::string&);

}  // namespace dynload
paddle/fluid/platform/dynload/hipfft.cc
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/hipfft.h"
namespace paddle {
namespace platform {
namespace dynload {

std::once_flag hipfft_dso_flag;
void* hipfft_dso_handle;

#define DEFINE_WRAP(__name) DynLoad__##__name __name

HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);

}  // namespace dynload
}  // namespace platform
}  // namespace paddle
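For reference, DEFINE_WRAP only instantiates the functor objects that the header declares through DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP; an approximate expansion for a couple of entries looks like the following sketch (not extra code in the commit):

// Approximate expansion of HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP): the header
// declares `extern DynLoad__hipfftCreate hipfftCreate;` etc., and this .cc file
// provides the single definition that all call sites link against.
DynLoad__hipfftCreate hipfftCreate;
DynLoad__hipfftDestroy hipfftDestroy;
// ... one object per routine listed in HIPFFT_FFT_ROUTINE_EACH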
paddle/fluid/platform/dynload/hipfft.h
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HIP
#include <hipfft.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace platform {
namespace dynload {

extern std::once_flag hipfft_dso_flag;
extern void* hipfft_dso_handle;
#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using hipfftFunc = decltype(&::__name); \
std::call_once(hipfft_dso_flag, []() { \
hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
}); \
static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \
return reinterpret_cast<hipfftFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
__macro(hipfftPlan1d); \
__macro(hipfftPlan2d); \
__macro(hipfftPlan3d); \
__macro(hipfftPlanMany); \
__macro(hipfftMakePlan1d); \
__macro(hipfftMakePlanMany); \
__macro(hipfftMakePlanMany64); \
__macro(hipfftGetSizeMany64); \
__macro(hipfftEstimate1d); \
__macro(hipfftEstimate2d); \
__macro(hipfftEstimate3d); \
__macro(hipfftEstimateMany); \
__macro(hipfftCreate); \
__macro(hipfftGetSize1d); \
__macro(hipfftGetSizeMany); \
__macro(hipfftGetSize); \
__macro(hipfftSetWorkArea); \
__macro(hipfftSetAutoAllocation); \
__macro(hipfftExecC2C); \
__macro(hipfftExecR2C); \
__macro(hipfftExecC2R); \
__macro(hipfftExecZ2Z); \
__macro(hipfftExecD2Z); \
__macro(hipfftExecZ2D); \
__macro(hipfftSetStream); \
__macro(hipfftDestroy); \
__macro(hipfftGetVersion); \
__macro(hipfftGetProperty);
HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
inline const char* hipfftGetErrorString(hipfftResult_t status) {
  switch (status) {
    case HIPFFT_SUCCESS:
      return "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
    case HIPFFT_INVALID_PLAN:
      return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
    case HIPFFT_ALLOC_FAILED:
      return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
             "memory.";
    case HIPFFT_INVALID_TYPE:
      return "'HIPFFT_INVALID_TYPE'. No longer used.";
    case HIPFFT_INVALID_VALUE:
      return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
             "parameter.";
    case HIPFFT_INTERNAL_ERROR:
      return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
             "error.";
    case HIPFFT_EXEC_FAILED:
      return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
    case HIPFFT_SETUP_FAILED:
      return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
    case HIPFFT_INVALID_SIZE:
      return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
    case HIPFFT_UNALIGNED_DATA:
      return "'HIPFFT_UNALIGNED_DATA'. No longer used.";
    case HIPFFT_INCOMPLETE_PARAMETER_LIST:
      return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
    case HIPFFT_INVALID_DEVICE:
      return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
             "GPU than plan creation.";
    case HIPFFT_PARSE_ERROR:
      return "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
    case HIPFFT_NO_WORKSPACE:
      return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
             "plan execution.";
    case HIPFFT_NOT_IMPLEMENTED:
      return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
             "functionality for parameters given.";
    case HIPFFT_NOT_SUPPORTED:
      return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
             "parameters given.";
    default:
      return "HIPFFT_STATUS_UNKNOWN_ERROR";
  }
}

}  // namespace dynload
}  // namespace platform
}  // namespace paddle
#endif
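Call sites never dlopen librocfft directly; they go through the functor objects declared above, and the first call through a wrapper triggers the std::call_once that loads the library via GetROCFFTDsoHandle() and caches the dlsym result. A hedged usage sketch (the local handle variable is hypothetical; the wrapper itself comes from DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP):

// Sketch only: lazily resolves hipfftCreate from librocfft on first use.
hipfftHandle handle;  // hypothetical local
auto status = paddle::platform::dynload::hipfftCreate(&handle);
if (status != HIPFFT_SUCCESS) {
  // hipfftGetErrorString(status) maps the code to the messages defined above
}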
paddle/fluid/platform/enforce.h
@@ -86,6 +86,7 @@ limitations under the License. */
#endif  // PADDLE_WITH_CUDA

#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#include "paddle/fluid/platform/dynload/hiprand.h"
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"

@@ -1113,6 +1114,14 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
}
#endif  // not(__APPLE__) and PADDLE_WITH_NCCL

/***** HIPFFT ERROR *****/
inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; }

inline std::string build_rocm_error_msg(hipfftResult_t stat) {
  std::string msg(" HIPFFT error, ");
  return msg + platform::dynload::hipfftGetErrorString(stat) + " ";
}

namespace details {

template <typename T>

@@ -1129,6 +1138,7 @@ DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess);
DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS);
DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess);
DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success);
DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS);

#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess);
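With is_error, build_rocm_error_msg, and the DEFINE_EXTERNAL_API_TYPE entry for hipfftResult_t in place, hipFFT calls can be wrapped in the same check macro as the other ROCm libraries; this is exactly how spectral_helper.h uses it. A minimal sketch of the resulting pattern (the specific call below is illustrative):

// Sketch: PADDLE_ENFORCE_CUDA_SUCCESS now accepts hipfftResult_t, so a failing
// hipFFT call throws with the message produced by build_rocm_error_msg().
hipfftHandle handle;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle));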
paddle/fluid/platform/enforce_test.cc
@@ -331,6 +331,10 @@ TEST(enforce, hip_success) {
      CheckCudaStatusFailure(rocblas_status_invalid_handle, "Rocblas error"));
  EXPECT_TRUE(
      CheckCudaStatusFailure(rocblas_status_invalid_value, "Rocblas error"));
  EXPECT_TRUE(CheckCudaStatusSuccess(HIPFFT_SUCCESS));
  EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_INVALID_PLAN, "HIPFFT error"));
  EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_ALLOC_FAILED, "HIPFFT error"));
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
  EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error"));