[Cherry-pick]FFT function enhancements and bugfixes (#36537)

* update fft api path (#36219) * update fft api path * add sample code for ihfft2 Co-authored-by: N chenfeiyu <chenfeiyu@baidu.com> * fix fft axis (#36321) fix: `-1` is used when fft's axis is `0` * use unified external error message for cufft api (#36114) * fft: modify sample code result (#36325) * dynamic load mkl as a fft backend when it is available and requested (#36414) * add rocm support for fft api (#36415) * move signal apis * move fft and signal API path (#2) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos in signal.py (#3) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos * disable Cache when CUFFT_VERSION >= 10200 (#4) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos * Add LRUCache for fft plans * add LRUCache for cufft and hipfft (#5) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos * WIP: add cache * delete move constructor and operator= for CuFFTHandle and FFTConfig * remove log from CuFFTHandle and FFTConfig * add lrucache for fft rocm backend * disable LRUCache when CUFFT_VERSION >= 10200 * disable copy and move for hipFFTHandle; format code Co-authored-by: N Xiaoxu Chen <chenxx_id@163.com> * remove debug message of cufftHandler * roll_op: support Tensor as input for shifts (#36727) * fix fftshift/ifftshift on static mode * update roll_op version * add more test cases for fftshift/ifftshift Co-authored-by: N zhiboniu <31800336+zhiboniu@users.noreply.github.com> Co-authored-by: N chenfeiyu <chenfeiyu@baidu.com> Co-authored-by: LJQ❤️ <33169170+lijiaqi0612@users.noreply.github.com>

[Cherry-pick]FFT function enhancements and bugfixes (#36537)
* update fft api path (#36219) * update fft api path * add sample code for ihfft2 Co-authored-by: N chenfeiyu <chenfeiyu@baidu.com> * fix fft axis (#36321) fix: `-1` is used when fft's axis is `0` * use unified external error message for cufft api (#36114) * fft: modify sample code result (#36325) * dynamic load mkl as a fft backend when it is available and requested (#36414) * add rocm support for fft api (#36415) * move signal apis * move fft and signal API path (#2) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos in signal.py (#3) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos * disable Cache when CUFFT_VERSION >= 10200 (#4) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos * Add LRUCache for fft plans * add LRUCache for cufft and hipfft (#5) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos * WIP: add cache * delete move constructor and operator= for CuFFTHandle and FFTConfig * remove log from CuFFTHandle and FFTConfig * add lrucache for fft rocm backend * disable LRUCache when CUFFT_VERSION >= 10200 * disable copy and move for hipFFTHandle; format code Co-authored-by: N Xiaoxu Chen <chenxx_id@163.com> * remove debug message of cufftHandler * roll_op: support Tensor as input for shifts (#36727) * fix fftshift/ifftshift on static mode * update roll_op version * add more test cases for fftshift/ifftshift Co-authored-by: N zhiboniu <31800336+zhiboniu@users.noreply.github.com> Co-authored-by: N chenfeiyu <chenfeiyu@baidu.com> Co-authored-by: LJQ❤️ <33169170+lijiaqi0612@users.noreply.github.com>
11b9f5f9 · Xiaoxu Chen · GitHub · 96edcea4 · 11b9f5f9 · 11b9f5f9
29 changed files
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -255,8 +255,8 @@ if(WITH_GPU)
        include(external/cub)       # download cub
        list(APPEND third_party_deps extern_cub)
    endif()
-    set(URL  "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE)
-    file_download_and_uncompress(${URL} "externalError" MD5 061f3b7895aadcbe2c3ed592590f8b10)   # download file externalErrorMsg.tar.gz
+    set(URL  "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE)
+    file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa)   # download file externalErrorMsg.tar.gz
    if(WITH_TESTING)
        # copy externalErrorMsg.pb, just for unittest can get error message correctly.
        set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -105,10 +105,20 @@ else()
    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()

-if (WITH_GPU AND (NOT WITH_ROCM))
-    op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS})
+if (WITH_GPU OR WITH_ROCM)
+    if (MKL_FOUND AND WITH_ONEMKL)
+        op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS})
+        target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
+    else()
+        op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS})
+    endif()
 else()
-    op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS})
+    if (MKL_FOUND AND WITH_ONEMKL)
+        op_library(spectral_op SRCS spectral_op.cc DEPS dynload_mklrt ${OP_HEADER_DEPS})
+        target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
+    else()
+        op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS})
+    endif()
 endif()

 op_library(lstm_op DEPS ${OP_HEADER_DEPS}  lstm_compute)

--- a/paddle/fluid/operators/roll_op.cc
+++ b/paddle/fluid/operators/roll_op.cc
@@ -40,21 +40,23 @@ class RollOp : public framework::OperatorWithKernel {
    auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
    auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");

-    if (dims.size() != 0) {
-      PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
-                        platform::errors::InvalidArgument(
-                            "When dims.size() != 0, dims.size() "
-                            "should be equal to "
-                            "shifts.size(). But received "
-                            "dims.size() = %d, shifts.size() = %d",
-                            dims.size(), shifts.size()));
-    } else {
-      PADDLE_ENFORCE_EQ(shifts.size(), 1,
-                        platform::errors::InvalidArgument(
-                            "When dims.size() == 0, shifts.size() "
-                            "should be equal to 1, But received "
-                            "shifts.size() = %d",
-                            shifts.size()));
+    if (!ctx->HasInput("ShiftsTensor")) {
+      if (dims.size() != 0) {
+        PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
+                          platform::errors::InvalidArgument(
+                              "When dims.size() != 0, dims.size() "
+                              "should be equal to "
+                              "shifts.size(). But received "
+                              "dims.size() = %d, shifts.size() = %d",
+                              dims.size(), shifts.size()));
+      } else {
+        PADDLE_ENFORCE_EQ(shifts.size(), 1,
+                          platform::errors::InvalidArgument(
+                              "When dims.size() == 0, shifts.size() "
+                              "should be equal to 1, But received "
+                              "shifts.size() = %d",
+                              shifts.size()));
+      }
    }

    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
@@ -105,6 +107,10 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker {
                                  "The number of places by which the elements "
                                  "of the tensor are shifted.")
        .SetDefault({});
+    AddInput("ShiftsTensor",
+             "The number of places by which the elements of the tensor "
+             "are shifted.")
+        .AsDispensable();
    AddAttr<std::vector<int64_t>>(
        "axis",
        "Axis along which to roll. It must have the same size "
@@ -129,6 +135,9 @@ class RollGradMaker : public framework::SingleGradOpMaker<T> {
  void Apply(GradOpPtr<T> op) const override {
    op->SetType("roll_grad");
    op->SetInput("X", this->Input("X"));
+    if (this->HasInput("ShiftsTensor")) {
+      op->SetInput("ShiftsTensor", this->Input("ShiftsTensor"));
+    }
    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
    op->SetAttrMap(this->Attrs());
@@ -174,7 +183,12 @@ REGISTER_OP_VERSION(roll)
                     "(std::vector<int64_t>) Axis along which to roll. "
                     "It must have the same size with shifts, or size = 0.",
                     std::vector<int64_t>())
-            .DeleteAttr(
-                "dims",
-                "(std::vector<int64_t>) Dims along which to roll. "
-                "It must have the same size with shifts, or size = 0."));
+            .DeleteAttr("dims",
+                        "(std::vector<int64_t>) Dims along which to roll. "
+                        "It must have the same size with shifts, or size = 0."))
+    .AddCheckpoint(
+        R"ROC(Upgrade roll add a dispensable input "ShiftsTensor".)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewInput(
+            "ShiftsTensor",
+            "The number of places by which the elements of"
+            "the tensor are shifted."));
--- a/paddle/fluid/operators/roll_op.cu
+++ b/paddle/fluid/operators/roll_op.cu
@@ -59,6 +59,16 @@ class RollKernel<platform::CUDADeviceContext, T>
    auto* in = context.Input<LoDTensor>("X");
    auto* out = context.Output<LoDTensor>("Out");
    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
+    if (context.HasInput("ShiftsTensor")) {
+      const auto* shifts_tensor =
+          context.Input<framework::Tensor>("ShiftsTensor");
+      PADDLE_ENFORCE_EQ(
+          shifts_tensor->dims().size(), 1,
+          platform::errors::InvalidArgument(
+              "The rank of ShiftsTensor is expected to be 1, got %s",
+              shifts_tensor->dims().size()));
+      shifts = GetDataFromTensor<int64_t>(shifts_tensor);
+    }
    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");

    auto* in_data = in->data<T>();
@@ -134,6 +144,16 @@ class RollGradKernel<platform::CUDADeviceContext, T>
    auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
    auto* out = context.Output<LoDTensor>(framework::GradVarName("X"));
    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
+    if (context.HasInput("ShiftsTensor")) {
+      const auto* shifts_tensor =
+          context.Input<framework::Tensor>("ShiftsTensor");
+      PADDLE_ENFORCE_EQ(
+          shifts_tensor->dims().size(), 1,
+          platform::errors::InvalidArgument(
+              "The rank of ShiftsTensor is expected to be 1, got %s",
+              shifts_tensor->dims().size()));
+      shifts = GetDataFromTensor<int64_t>(shifts_tensor);
+    }
    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");

    auto* in_data = in->data<T>();

--- a/paddle/fluid/operators/roll_op.h
+++ b/paddle/fluid/operators/roll_op.h
@@ -16,6 +16,8 @@
 #include <memory>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace operators {
@@ -85,6 +87,16 @@ class RollKernel : public framework::OpKernel<T> {
    auto& input = input_var->Get<LoDTensor>();
    auto* output = output_var->GetMutable<LoDTensor>();
    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
+    if (context.HasInput("ShiftsTensor")) {
+      const auto* shifts_tensor =
+          context.Input<framework::Tensor>("ShiftsTensor");
+      PADDLE_ENFORCE_EQ(
+          shifts_tensor->dims().size(), 1,
+          platform::errors::InvalidArgument(
+              "The rank of ShiftsTensor is expected to be 1, got %s",
+              shifts_tensor->dims().size()));
+      shifts = GetDataFromTensor<int64_t>(shifts_tensor);
+    }
    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");

    std::vector<T> out_vec;
@@ -123,6 +135,11 @@ class RollGradKernel : public framework::OpKernel<T> {
    auto& input = input_var->Get<LoDTensor>();
    auto* output = output_var->GetMutable<LoDTensor>();
    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
+    if (context.HasInput("ShiftsTensor")) {
+      const auto* shifts_tensor =
+          context.Input<framework::Tensor>("ShiftsTensor");
+      shifts = GetDataFromTensor<int64_t>(shifts_tensor);
+    }
    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");

    std::vector<T> out_vec;

--- a/paddle/fluid/operators/spectral_helper.h
+++ b/paddle/fluid/operators/spectral_helper.h
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/spectral_op.h"
+
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/hipfft.h"
+#endif
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/dynload/cufft.h"
+#endif
+
+namespace paddle {
+namespace operators {
+using ScalarType = framework::proto::VarType::Type;
+const int64_t kMaxFFTNdim = 3;
+const int64_t kMaxDataNdim = kMaxFFTNdim + 1;
+// Plain-data description of one FFT problem. Its raw bytes are hashed and
+// compared, so it serves as the **key** to the plan cache.
+struct FFTConfigKey {
+  // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3
+  int64_t signal_ndim_;
+  // These include additional batch dimension as well.
+  int64_t sizes_[kMaxDataNdim];
+  int64_t input_shape_[kMaxDataNdim];
+  int64_t output_shape_[kMaxDataNdim];
+  FFTTransformType fft_type_;
+  ScalarType value_type_;
+
+  FFTConfigKey() = default;
+
+  // Builds a key from the input shape, the output shape and the full signal
+  // size (batch dimension included).
+  //
+  // Each vector is copied into a fixed array of kMaxDataNdim entries, so an
+  // InvalidArgument error is raised when signal_size is empty or when any of
+  // the three vectors holds more than kMaxDataNdim entries.
+  FFTConfigKey(const std::vector<int64_t>& in_shape,
+               const std::vector<int64_t>& out_shape,
+               const std::vector<int64_t>& signal_size,
+               FFTTransformType fft_type, ScalarType value_type) {
+    const int64_t signal_rank = static_cast<int64_t>(signal_size.size());
+    const int64_t in_rank = static_cast<int64_t>(in_shape.size());
+    const int64_t out_rank = static_cast<int64_t>(out_shape.size());
+
+    // Without these checks the std::copy calls below could write past the
+    // end of the fixed-size arrays, and an empty signal_size would make
+    // signal_ndim_ wrap around to -1.
+    PADDLE_ENFORCE_GE(signal_rank, 1,
+                      platform::errors::InvalidArgument(
+                          "The signal size of a FFT plan key must contain at "
+                          "least the batch dimension, but it is empty."));
+    PADDLE_ENFORCE_LE(signal_rank, kMaxDataNdim,
+                      platform::errors::InvalidArgument(
+                          "The signal size of a FFT plan key can hold at most "
+                          "[%d] dimensions, but received [%d].",
+                          kMaxDataNdim, signal_rank));
+    PADDLE_ENFORCE_LE(in_rank, kMaxDataNdim,
+                      platform::errors::InvalidArgument(
+                          "The input shape of a FFT plan key can hold at most "
+                          "[%d] dimensions, but received [%d].",
+                          kMaxDataNdim, in_rank));
+    PADDLE_ENFORCE_LE(out_rank, kMaxDataNdim,
+                      platform::errors::InvalidArgument(
+                          "The output shape of a FFT plan key can hold at most "
+                          "[%d] dimensions, but received [%d].",
+                          kMaxDataNdim, out_rank));
+
+    // Padding bits must be zeroed for hashing
+    memset(this, 0, sizeof(*this));
+    signal_ndim_ = signal_size.size() - 1;
+    fft_type_ = fft_type;
+    value_type_ = value_type;
+
+    std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
+    std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
+    std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
+  }
+};
+
+#if defined(PADDLE_WITH_CUDA)
+// Owns one cufftHandle: it is created in the constructor and destroyed in
+// the destructor. Copying and moving are disabled, so the handle is destroyed
+// exactly once.
+class CuFFTHandle {
+  ::cufftHandle handle_;
+
+ public:
+  // Creates the handle; a cuFFT failure is reported through
+  // PADDLE_ENFORCE_CUDA_SUCCESS.
+  CuFFTHandle() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_));
+  }
+
+  CuFFTHandle(const CuFFTHandle& other) = delete;
+  CuFFTHandle& operator=(const CuFFTHandle& other) = delete;
+
+  CuFFTHandle(CuFFTHandle&& other) = delete;
+  CuFFTHandle& operator=(CuFFTHandle&& other) = delete;
+
+  ::cufftHandle& get() { return handle_; }
+  const ::cufftHandle& get() const { return handle_; }
+
+  ~CuFFTHandle() {
+    // Destructors are implicitly noexcept, so an enforce failure escaping
+    // from here would call std::terminate. Destroy the handle on a
+    // best-effort basis and deliberately discard the returned status.
+    static_cast<void>(platform::dynload::cufftDestroy(handle_));
+  }
+};
+
+using plan_size_type = long long int;  // NOLINT
+// This class contains all the information needed to execute a cuFFT plan:
+//   1. the plan
+//   2. the workspace size needed
+// The plan is held by a CuFFTHandle member, so an FFTConfig can be neither
+// copied nor moved.
+class FFTConfig {
+ public:
+  // Creates the plan described by a plan-cache key.
+  explicit FFTConfig(const FFTConfigKey& plan_key)
+      : FFTConfig(
+            std::vector<int64_t>(plan_key.sizes_,
+                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
+            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
+
+  // sizes are full signal, including batch size and always two-sided.
+  // Raises InvalidArgument when sizes is empty, when signal_ndim differs from
+  // sizes.size() - 1, or when dtype is not float16, float32 or float64.
+  FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
+            FFTTransformType fft_type, ScalarType dtype)
+      : fft_type_(fft_type), value_type_(dtype) {
+    // Validate before reading sizes[0] and sizes.begin() + 1 below: both are
+    // undefined for an empty vector.
+    PADDLE_ENFORCE_EQ(
+        sizes.empty(), false,
+        platform::errors::InvalidArgument(
+            "The sizes of a FFT plan must contain at least the batch "
+            "dimension, but it is empty."));
+    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
+                      platform::errors::InvalidArgument(
+                          "The signal_ndim must be equal to sizes.size() - 1, "
+                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
+                          signal_ndim, sizes.size() - 1));
+
+    // signal sizes (excluding batch dim)
+    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
+
+    // input batch size
+    const auto batch = static_cast<plan_size_type>(sizes[0]);
+
+    // Pick the cuFFT input/output/execution data types from the transform
+    // kind (real or complex on each side) and the floating point precision.
+    cudaDataType itype, otype, exec_type;
+    const auto complex_input = has_complex_input(fft_type);
+    const auto complex_output = has_complex_output(fft_type);
+    if (dtype == framework::proto::VarType::FP32) {
+      itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
+      otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
+      exec_type = CUDA_C_32F;
+    } else if (dtype == framework::proto::VarType::FP64) {
+      itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
+      otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
+      exec_type = CUDA_C_64F;
+    } else if (dtype == framework::proto::VarType::FP16) {
+      itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
+      otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
+      exec_type = CUDA_C_16F;
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "cuFFT only support transforms of type float16, float32 and "
+          "float64"));
+    }
+
+    // disable auto allocation of workspace to use allocator from the framework
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetAutoAllocation(
+        plan(), /* autoAllocate */ 0));
+
+    size_t ws_size_t;
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtMakePlanMany(
+        plan(), signal_ndim, signal_sizes.data(),
+        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
+        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
+        batch, &ws_size_t, exec_type));
+
+    // Remember the workspace size reported by cuFFT for this plan.
+    ws_size = ws_size_t;
+  }
+
+  FFTConfig(const FFTConfig& other) = delete;
+  FFTConfig& operator=(const FFTConfig& other) = delete;
+
+  FFTConfig(FFTConfig&& other) = delete;
+  FFTConfig& operator=(FFTConfig&& other) = delete;
+
+  const cufftHandle& plan() const { return plan_ptr.get(); }
+
+  FFTTransformType transform_type() const { return fft_type_; }
+  ScalarType data_type() const { return value_type_; }
+  size_t workspace_size() const { return ws_size; }
+
+ private:
+  CuFFTHandle plan_ptr;
+  size_t ws_size;
+  FFTTransformType fft_type_;
+  ScalarType value_type_;
+};
+
+#elif defined(PADDLE_WITH_HIP)
+// Owns one hipfftHandle: it is created in the constructor and destroyed in
+// the destructor. Copying and moving are disabled, so the handle is destroyed
+// exactly once.
+class HIPFFTHandle {
+  ::hipfftHandle handle_;
+
+ public:
+  // Creates the handle; a hipFFT failure is reported through
+  // PADDLE_ENFORCE_CUDA_SUCCESS.
+  HIPFFTHandle() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_));
+  }
+
+  HIPFFTHandle(const HIPFFTHandle& other) = delete;
+  HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete;
+
+  HIPFFTHandle(HIPFFTHandle&& other) = delete;
+  HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete;
+
+  ::hipfftHandle& get() { return handle_; }
+  const ::hipfftHandle& get() const { return handle_; }
+
+  ~HIPFFTHandle() {
+    // Destructors are implicitly noexcept, so an enforce failure escaping
+    // from here would call std::terminate. Destroy the handle on a
+    // best-effort basis and deliberately discard the returned status.
+    static_cast<void>(platform::dynload::hipfftDestroy(handle_));
+  }
+};
+using plan_size_type = int;
+// This class contains all the information needed to execute a hipFFT plan:
+//   1. the plan
+//   2. the workspace size needed
+// The plan is held by a HIPFFTHandle member, so an FFTConfig can be neither
+// copied nor moved.
+class FFTConfig {
+ public:
+  // Creates the plan described by a plan-cache key.
+  explicit FFTConfig(const FFTConfigKey& plan_key)
+      : FFTConfig(
+            std::vector<int64_t>(plan_key.sizes_,
+                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
+            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
+
+  // sizes are full signal, including batch size and always two-sided.
+  // Raises InvalidArgument when sizes is empty, when signal_ndim differs from
+  // sizes.size() - 1, or when dtype is neither float32 nor float64.
+  FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
+            FFTTransformType fft_type, ScalarType dtype)
+      : fft_type_(fft_type), value_type_(dtype) {
+    // Validate before reading sizes[0] and sizes.begin() + 1 below: both are
+    // undefined for an empty vector.
+    PADDLE_ENFORCE_EQ(
+        sizes.empty(), false,
+        platform::errors::InvalidArgument(
+            "The sizes of a FFT plan must contain at least the batch "
+            "dimension, but it is empty."));
+    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
+                      platform::errors::InvalidArgument(
+                          "The signal_ndim must be equal to sizes.size() - 1, "
+                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
+                          signal_ndim, sizes.size() - 1));
+
+    // signal sizes (excluding batch dim)
+    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
+
+    // input batch size
+    const auto batch = static_cast<plan_size_type>(sizes[0]);
+
+    // Map (precision, transform kind) to the hipFFT transform type; any
+    // combination not listed falls through to the throw at the end.
+    hipfftType exec_type = [&] {
+      if (dtype == framework::proto::VarType::FP32) {
+        switch (fft_type) {
+          case FFTTransformType::C2C:
+            return HIPFFT_C2C;
+          case FFTTransformType::R2C:
+            return HIPFFT_R2C;
+          case FFTTransformType::C2R:
+            return HIPFFT_C2R;
+        }
+      } else if (dtype == framework::proto::VarType::FP64) {
+        switch (fft_type) {
+          case FFTTransformType::C2C:
+            return HIPFFT_Z2Z;
+          case FFTTransformType::R2C:
+            return HIPFFT_D2Z;
+          case FFTTransformType::C2R:
+            return HIPFFT_Z2D;
+        }
+      }
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "hipFFT only support transforms of type float32 and float64"));
+    }();
+
+    // disable auto allocation of workspace to use allocator from the framework
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetAutoAllocation(
+        plan(), /* autoAllocate */ 0));
+
+    size_t ws_size_t;
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftMakePlanMany(
+        plan(), signal_ndim, signal_sizes.data(),
+        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
+        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
+        batch, &ws_size_t));
+
+    // Remember the workspace size reported by hipFFT for this plan.
+    ws_size = ws_size_t;
+  }
+
+  // Already implied by the non-copyable, non-movable HIPFFTHandle member;
+  // spelled out to match the cuFFT variant of this class.
+  FFTConfig(const FFTConfig& other) = delete;
+  FFTConfig& operator=(const FFTConfig& other) = delete;
+
+  FFTConfig(FFTConfig&& other) = delete;
+  FFTConfig& operator=(FFTConfig&& other) = delete;
+
+  const hipfftHandle& plan() const { return plan_ptr.get(); }
+
+  FFTTransformType transform_type() const { return fft_type_; }
+  ScalarType data_type() const { return value_type_; }
+  size_t workspace_size() const { return ws_size; }
+
+ private:
+  HIPFFTHandle plan_ptr;
+  size_t ws_size;
+  FFTTransformType fft_type_;
+  ScalarType value_type_;
+};
+#endif
+
+// Hash functor for plan-cache keys.
+// Implements the 32-bit Fowler–Noll–Vo (FNV-1a style: xor, then multiply)
+// hash over the raw bytes of the key, see
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+template <typename Key>
+struct KeyHash {
+  // The object representation of Key is read byte by byte, which is only
+  // meaningful for plain old data.
+  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+
+  // Folds every byte of `params` into a 32-bit digest and widens it to size_t.
+  size_t operator()(const Key& params) const {
+    const uint8_t* cursor = reinterpret_cast<const uint8_t*>(&params);
+    const uint8_t* const stop = cursor + sizeof(Key);
+    uint32_t digest = 0x811C9DC5;  // FNV offset basis
+    for (; cursor != stop; ++cursor) {
+      digest = (digest ^ *cursor) * 0x01000193;  // xor byte, times FNV prime
+    }
+    return static_cast<size_t>(digest);
+  }
+};
+
+// Equality functor for plan-cache keys: two keys are equal exactly when
+// their object representations are byte-wise identical.
+template <typename Key>
+struct KeyEqual {
+  // The object representation of Key is compared byte by byte, which is only
+  // meaningful for plain old data.
+  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+
+  bool operator()(const Key& a, const Key& b) const {
+    const void* lhs_bytes = reinterpret_cast<const uint8_t*>(&a);
+    const void* rhs_bytes = reinterpret_cast<const uint8_t*>(&b);
+    return 0 == memcmp(lhs_bytes, rhs_bytes, sizeof(Key));
+  }
+};
+
+#if CUDA_VERSION >= 10000
+// No hard limit on the number of plans from CUDA 10 on.
+constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<size_t>::max();
+// The default max cache size chosen here is arbitrary. It bounds how big a
+// plan cache is maintained by default.
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
+#else
+// For CUDA version < 10 the max plan number has to be 1023 because of a bug
+// that fails on the 1024th plan. An undefined CUDA_VERSION evaluates to 0 in
+// the #if above, so it selects this branch as well.
+constexpr size_t CUFFT_MAX_PLAN_NUM = 1023;
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
+#endif
+// Compile-time sanity checks on the two limits above.
+static_assert(CUFFT_MAX_PLAN_NUM >= 0 &&
+                  CUFFT_MAX_PLAN_NUM <= std::numeric_limits<size_t>::max(),
+              "CUFFT_MAX_PLAN_NUM not in size_t range");
+static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 &&
+                  CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
+              "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
+
+// A least-recently-used cache of FFT plans keyed by FFTConfigKey.
+// This cache assumes that the mapping from key to value never changes.
+// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
+// value returned from lookup.
+// The contract of using this cache is that lookup should only be
+// used when the max_size is positive.
+class FFTConfigCache {
+ public:
+  // One cache entry: the key together with the plan built from it.
+  using kv_t = typename std::pair<FFTConfigKey, FFTConfig>;
+  // Index from a key to its entry in the usage list. The map does not copy
+  // the key: it stores a reference to the key that lives in the list entry.
+  using map_t = typename std::unordered_map<
+      std::reference_wrapper<FFTConfigKey>, typename std::list<kv_t>::iterator,
+      KeyHash<FFTConfigKey>, KeyEqual<FFTConfigKey>>;
+  using map_kkv_iter_t = typename map_t::iterator;
+
+  // Creates a cache holding at most CUFFT_DEFAULT_CACHE_SIZE plans.
+  FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {}
+
+  // Creates a cache holding at most max_size plans; max_size is validated by
+  // _set_max_size.
+  explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); }
+
+  FFTConfigCache(const FFTConfigCache& other) = delete;
+  FFTConfigCache& operator=(const FFTConfigCache& other) = delete;
+
+  // Moving transfers the entries and the size limit. The mutex member is not
+  // transferred; the new object gets its own.
+  FFTConfigCache(FFTConfigCache&& other) noexcept
+      : _usage_list(std::move(other._usage_list)),
+        _cache_map(std::move(other._cache_map)),
+        _max_size(other._max_size) {}
+
+  FFTConfigCache& operator=(FFTConfigCache&& other) noexcept {
+    _usage_list = std::move(other._usage_list);
+    _cache_map = std::move(other._cache_map);
+    _max_size = other._max_size;
+    return *this;
+  }
+
+  // If key is in this cache, return the cached config. Otherwise, emplace the
+  // config in this cache and return it.
+  // Raises InvalidArgument when the max size of the cache is 0.
+  FFTConfig& lookup(FFTConfigKey params) {
+    PADDLE_ENFORCE_GT(_max_size, 0,
+                      platform::errors::InvalidArgument(
+                          "The max size of FFTConfigCache must be great than 0,"
+                          "But received is [%d]",
+                          _max_size));
+
+    map_kkv_iter_t map_it = _cache_map.find(params);
+    // Hit, put to list front
+    if (map_it != _cache_map.end()) {
+      _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
+      return map_it->second->second;
+    }
+
+    // Miss
+    // remove if needed
+    if (_usage_list.size() >= _max_size) {
+      // The back of the list is the entry that was used least recently.
+      // Erase it from the map first: the map key refers to the list entry.
+      auto last = _usage_list.end();
+      last--;
+      _cache_map.erase(last->first);
+      _usage_list.pop_back();
+    }
+
+    // construct new plan at list front, then insert into _cache_map
+    _usage_list.emplace_front(std::piecewise_construct,
+                              std::forward_as_tuple(params),
+                              std::forward_as_tuple(params));
+    auto kv_it = _usage_list.begin();
+    _cache_map.emplace(std::piecewise_construct,
+                       std::forward_as_tuple(kv_it->first),
+                       std::forward_as_tuple(kv_it));
+    return kv_it->second;
+  }
+
+  // Drops every cached plan; the size limit is left unchanged.
+  void clear() {
+    _cache_map.clear();
+    _usage_list.clear();
+  }
+
+  // Changes the size limit (validated by _set_max_size) and, when the cache
+  // currently holds more entries than the new limit, erases the surplus
+  // entries from the back of the usage list.
+  void resize(int64_t new_size) {
+    _set_max_size(new_size);
+    auto cur_size = _usage_list.size();
+    if (cur_size > _max_size) {
+      auto delete_it = _usage_list.end();
+      // Walk backwards over the surplus entries, unregistering each from the
+      // map, then erase the whole tail of the list at once.
+      for (size_t i = 0; i < cur_size - _max_size; i++) {
+        delete_it--;
+        _cache_map.erase(delete_it->first);
+      }
+      _usage_list.erase(delete_it, _usage_list.end());
+    }
+  }
+
+  // Number of plans currently cached.
+  size_t size() const { return _cache_map.size(); }
+
+  // Maximum number of plans this cache keeps.
+  size_t max_size() const noexcept { return _max_size; }
+
+  // Not locked by any method of this class; available for callers to
+  // serialize their use of the cache.
+  std::mutex mutex;
+
+ private:
+  // Only sets size and does value check. Does not resize the data structures.
+  void _set_max_size(int64_t new_size) {
+    // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
+    // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
+    // first.
+    PADDLE_ENFORCE_GE(
+        new_size, 0,
+        platform::errors::InvalidArgument(
+            "cuFFT plan cache size must be non-negative, But received is [%d]",
+            new_size));
+    PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM,
+                      platform::errors::InvalidArgument(
+                          "cuFFT plan cache size can not be larger than [%d], "
+                          "But received is [%d]",
+                          CUFFT_MAX_PLAN_NUM, new_size));
+    _max_size = static_cast<size_t>(new_size);
+  }
+
+  // Entries ordered by recency of use: front is the most recently used.
+  std::list<kv_t> _usage_list;
+  // Key -> position of the entry in _usage_list.
+  map_t _cache_map;
+  // Upper bound on the number of cached plans.
+  size_t _max_size;
+};
+
+// Plan caches indexed by device index, plus the mutex guarding the vector.
+// NOTE: both are `static` at namespace scope in this header, so every
+// translation unit that includes it gets its own copies.
+static std::vector<std::unique_ptr<FFTConfigCache>> plan_caches;
+static std::mutex plan_caches_mutex;
+
+// Returns the plan cache of the given device, creating it on first use.
+// The vector of caches is grown as needed while plan_caches_mutex is held.
+// Raises InvalidArgument when device_index is negative; without that check a
+// negative index would be converted to a huge unsigned value and used to
+// index the vector out of bounds.
+static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) {
+  PADDLE_ENFORCE_GE(
+      device_index, 0,
+      platform::errors::InvalidArgument(
+          "The device index of a FFT plan cache must be non-negative, "
+          "But received is [%d]",
+          device_index));
+  const auto slot = static_cast<size_t>(device_index);
+
+  std::lock_guard<std::mutex> guard(plan_caches_mutex);
+
+  if (slot >= plan_caches.size()) {
+    plan_caches.resize(slot + 1);
+  }
+
+  auto& cache = plan_caches[slot];
+  if (!cache) {
+    cache = std::make_unique<FFTConfigCache>();
+  }
+
+  return *cache;
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/spectral_op.cc
+++ b/paddle/fluid/operators/spectral_op.cc
@@ -27,7 +27,7 @@
 #include "paddle/fluid/platform/complex.h"

 #if defined(PADDLE_WITH_ONEMKL)
-#include <mkl_dfti.h>
+#include "paddle/fluid/platform/dynload/mklrt.h"
 #elif defined(PADDLE_WITH_POCKETFFT)
 #include "extern_pocketfft/pocketfft_hdronly.h"
 #endif
@@ -357,46 +357,45 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) {
 // FFT Functors
 #if defined(PADDLE_WITH_ONEMKL)

+#define MKL_DFTI_CHECK(expr)                                             \
+  do { /* evaluate expr once; throw External on a DFTI error status */   \
+    const MKL_LONG dfti_status_ = (expr);                                \
+    if (!platform::dynload::DftiErrorClass(dfti_status_, DFTI_NO_ERROR)) \
+      PADDLE_THROW(platform::errors::External(                           \
+          "%s", platform::dynload::DftiErrorMessage(dfti_status_)));     \
+  } while (0)  // no trailing ';': the call site supplies it
+
 namespace {
-static inline void MKL_DFTI_CHECK(MKL_INT status) {
-  if (status && !DftiErrorClass(status, DFTI_NO_ERROR)) {
-    PADDLE_THROW(platform::errors::External(DftiErrorMessage(status)));
-  }
-}

 struct DftiDescriptorDeleter {
  void operator()(DFTI_DESCRIPTOR_HANDLE handle) {
    if (handle != nullptr) {
-      MKL_DFTI_CHECK(DftiFreeDescriptor(&handle));
+      MKL_DFTI_CHECK(platform::dynload::DftiFreeDescriptor(&handle));
    }
  }
 };

+// An RAII wrapper for DFTI_DESCRIPTOR*
 class DftiDescriptor {
 public:
  void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type,
            MKL_LONG signal_ndim, MKL_LONG* sizes) {
-    if (desc_ != nullptr) {
-      PADDLE_THROW(platform::errors::AlreadyExists(
-          "DFT DESCRIPTOR can only be initialized once."));
-    }
+    PADDLE_ENFORCE_EQ(desc_.get(), nullptr,
+                      platform::errors::AlreadyExists(
+                          "DftiDescriptor has already been initialized."));
+
    DFTI_DESCRIPTOR* raw_desc;
-    if (signal_ndim == 1) {
-      MKL_DFTI_CHECK(
-          DftiCreateDescriptor(&raw_desc, precision, signal_type, 1, sizes[0]));
-    } else {
-      MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type,
-                                          signal_ndim, sizes));
-    }
+    MKL_DFTI_CHECK(platform::dynload::DftiCreateDescriptorX(
+        &raw_desc, precision, signal_type, signal_ndim, sizes));
    desc_.reset(raw_desc);
  }

  DFTI_DESCRIPTOR* get() const {
-    if (desc_ == nullptr) {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "DFTI DESCRIPTOR has not been initialized."));
-    }
-    return desc_.get();
+    DFTI_DESCRIPTOR* raw_desc = desc_.get();
+    PADDLE_ENFORCE_NOT_NULL(raw_desc,
+                            platform::errors::PreconditionNotMet(
+                                "DFTI DESCRIPTOR has not been initialized."));
+    return raw_desc;
  }

 private:
@@ -421,7 +420,9 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype,
        return DFTI_DOUBLE;
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Input data type should be FP32, FP64, COMPLEX64 or COMPLEX128."));
+            "Invalid input datatype (%s), input data type should be FP32, "
+            "FP64, COMPLEX64 or COMPLEX128.",
+            framework::DataTypeToString(in_dtype)));
    }
  }();

@@ -430,35 +431,27 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype,
  const DFTI_CONFIG_VALUE domain =
      (fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL;

-  // const bool complex_input = framework::IsComplexType(in_dtype);
-  // const bool complex_output = framework::IsComplexType(out_dtype);
-  // const DFTI_CONFIG_VALUE domain = [&] {
-  //   if (forward) {
-  //     return complex_input ? DFTI_COMPLEX : DFTI_REAL;
-  //   } else {
-  //     return complex_output ? DFTI_COMPLEX : DFTI_REAL;
-  //   }
-  // }();
-
  DftiDescriptor descriptor;
  std::vector<MKL_LONG> fft_sizes(signal_sizes.cbegin(), signal_sizes.cend());
  const MKL_LONG signal_ndim = fft_sizes.size() - 1;
  descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1);

  // placement inplace or not inplace
-  MKL_DFTI_CHECK(
-      DftiSetValue(descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE));
+  MKL_DFTI_CHECK(platform::dynload::DftiSetValue(
+      descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE));

  // number of transformations
  const MKL_LONG batch_size = fft_sizes[0];
-  MKL_DFTI_CHECK(
-      DftiSetValue(descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size));
+  MKL_DFTI_CHECK(platform::dynload::DftiSetValue(
+      descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size));

  // input & output distance
  const MKL_LONG idist = in_strides[0];
  const MKL_LONG odist = out_strides[0];
-  MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist));
-  MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_DISTANCE, odist));
+  MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(),
+                                                 DFTI_INPUT_DISTANCE, idist));
+  MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(),
+                                                 DFTI_OUTPUT_DISTANCE, odist));

  // input & output stride
  std::vector<MKL_LONG> mkl_in_stride(1 + signal_ndim, 0);
@@ -467,15 +460,15 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype,
    mkl_in_stride[i] = in_strides[i];
    mkl_out_stride[i] = out_strides[i];
  }
-  MKL_DFTI_CHECK(
-      DftiSetValue(descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data()));
-  MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_STRIDES,
-                              mkl_out_stride.data()));
+  MKL_DFTI_CHECK(platform::dynload::DftiSetValue(
+      descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data()));
+  MKL_DFTI_CHECK(platform::dynload::DftiSetValue(
+      descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data()));

  // conjugate even storage
  if (!(fft_type == FFTTransformType::C2C)) {
-    MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE,
-                                DFTI_COMPLEX_COMPLEX));
+    MKL_DFTI_CHECK(platform::dynload::DftiSetValue(
+        descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX));
  }

  MKL_LONG signal_numel =
@@ -496,11 +489,12 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype,
        return DFTI_BACKWARD_SCALE;
      }
    }();
-    MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), scale_direction, scale));
+    MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(),
+                                                   scale_direction, scale));
  }

  // commit the descriptor
-  MKL_DFTI_CHECK(DftiCommitDescriptor(descriptor.get()));
+  MKL_DFTI_CHECK(platform::dynload::DftiCommitDescriptor(descriptor.get()));
  return descriptor;
 }

@@ -592,15 +586,16 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out,
                                  collapsed_input.numel(),
                                  collapsed_input_conj.data<Ti>());
    for_range(functor);
-    MKL_DFTI_CHECK(DftiComputeBackward(desc.get(),
-                                       collapsed_input_conj.data<void>(),
-                                       collapsed_output.data<void>()));
+    MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward(
+        desc.get(), collapsed_input_conj.data<void>(),
+        collapsed_output.data<void>()));
  } else if (fft_type == FFTTransformType::R2C && !forward) {
    framework::Tensor collapsed_output_conj(collapsed_output.type());
    collapsed_output_conj.mutable_data<To>(collapsed_output.dims(),
                                           ctx.GetPlace());
-    MKL_DFTI_CHECK(DftiComputeForward(desc.get(), collapsed_input.data<void>(),
-                                      collapsed_output_conj.data<void>()));
+    MKL_DFTI_CHECK(platform::dynload::DftiComputeForward(
+        desc.get(), collapsed_input.data<void>(),
+        collapsed_output_conj.data<void>()));
    // conjugate the output
    platform::ForRange<DeviceContext> for_range(ctx, collapsed_output.numel());
    math::ConjFunctor<To> functor(collapsed_output_conj.data<To>(),
@@ -609,13 +604,13 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out,
    for_range(functor);
  } else {
    if (forward) {
-      MKL_DFTI_CHECK(DftiComputeForward(desc.get(),
-                                        collapsed_input.data<void>(),
-                                        collapsed_output.data<void>()));
+      MKL_DFTI_CHECK(platform::dynload::DftiComputeForward(
+          desc.get(), collapsed_input.data<void>(),
+          collapsed_output.data<void>()));
    } else {
-      MKL_DFTI_CHECK(DftiComputeBackward(desc.get(),
-                                         collapsed_input.data<void>(),
-                                         collapsed_output.data<void>()));
+      MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward(
+          desc.get(), collapsed_input.data<void>(),
+          collapsed_output.data<void>()));
    }
  }


--- a/paddle/fluid/operators/spectral_op.cu
+++ b/paddle/fluid/operators/spectral_op.cu
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -7,7 +7,7 @@ if (NOT WITH_NV_JETSON)
 endif()

 if (WITH_ROCM)
-  list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
+  list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
 endif()

 # There is no macOS version of NCCL.
@@ -49,3 +49,9 @@ endif()
 cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader)
 add_dependencies(dynload_lapack extern_lapack)
 # TODO(TJ): add iomp, mkldnn?
+
+if (MKL_FOUND AND WITH_ONEMKL)  # dynamic-loading wrapper around oneMKL
+  message(STATUS "ONEMKL INCLUDE directory is ${MKL_INCLUDE}")
+  cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader)
+  target_include_directories(dynload_mklrt PRIVATE "${MKL_INCLUDE}")
+endif()
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -53,6 +53,12 @@ DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");

 DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");

+DEFINE_string(mkl_dir, "",
+              "Specify path for loading libmkl_rt.so. "
+              "For instance, /opt/intel/oneapi/mkl/latest/lib/intel64/. "
+              "If default, "
+              "dlopen will search mkl from LD_LIBRARY_PATH");
+
 DEFINE_string(op_dir, "", "Specify path for loading user-defined op library.");

 #ifdef PADDLE_WITH_HIP
@@ -350,6 +356,16 @@ void* GetCurandDsoHandle() {
 #endif
 }

+#ifdef PADDLE_WITH_HIP
+void* GetROCFFTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so");
+#endif
+}
+#endif
+
 void* GetNvjpegDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib");
@@ -518,6 +534,16 @@ void* GetCUFFTDsoHandle() {
 #endif
 }

+void* GetMKLRTDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so");
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -43,6 +43,8 @@ void* GetLAPACKDsoHandle();
 void* GetOpDsoHandle(const std::string& dso_name);
 void* GetNvtxDsoHandle();
 void* GetCUFFTDsoHandle();
+void* GetMKLRTDsoHandle();
+void* GetROCFFTDsoHandle();

 void SetPaddleLibPath(const std::string&);
 }  // namespace dynload

--- a/paddle/fluid/platform/dynload/hipfft.cc
+++ b/paddle/fluid/platform/dynload/hipfft.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/hipfft.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+// Passed to std::call_once by the wrappers declared in hipfft.h so that the
+// shared library handle below is fetched only once.
+std::once_flag hipfft_dso_flag;
+// Handle assigned from GetROCFFTDsoHandle() by the wrappers in hipfft.h and
+// used there for the dlsym() lookups.
+void *hipfft_dso_handle;
+
+// Defines, for one routine, the wrapper object that hipfft.h declares extern.
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/dynload/hipfft.h
+++ b/paddle/fluid/platform/dynload/hipfft.h
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifdef PADDLE_WITH_HIP
+#include <hipfft.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+extern std::once_flag hipfft_dso_flag;
+extern void *hipfft_dso_handle;
+
+/**
+ * For one hipFFT routine `__name`, generates a functor type
+ * DynLoad__<__name> and declares an extern object called `__name` of that
+ * type. Calling the object:
+ *   1. fetches the library handle once via GetROCFFTDsoHandle(), guarded by
+ *      hipfft_dso_flag;
+ *   2. resolves the symbol with dlsym() and keeps the pointer in a
+ *      function-local static;
+ *   3. casts the pointer to the routine's real type and forwards the
+ *      arguments (taken by value) to it.
+ * NOTE(review): the dlsym() result is not checked for null before the call.
+ */
+#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name)                             \
+  struct DynLoad__##__name {                                                 \
+    template <typename... Args>                                              \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {         \
+      using hipfftFunc = decltype(&::__name);                                \
+      std::call_once(hipfft_dso_flag, []() {                                 \
+        hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \
+      });                                                                    \
+      static void *p_##__name = dlsym(hipfft_dso_handle, #__name);           \
+      return reinterpret_cast<hipfftFunc>(p_##__name)(args...);              \
+    }                                                                        \
+  };                                                                         \
+  extern DynLoad__##__name __name
+
+#define HIPFFT_FFT_ROUTINE_EACH(__macro) \
+  __macro(hipfftPlan1d);                 \
+  __macro(hipfftPlan2d);                 \
+  __macro(hipfftPlan3d);                 \
+  __macro(hipfftPlanMany);               \
+  __macro(hipfftMakePlan1d);             \
+  __macro(hipfftMakePlanMany);           \
+  __macro(hipfftMakePlanMany64);         \
+  __macro(hipfftGetSizeMany64);          \
+  __macro(hipfftEstimate1d);             \
+  __macro(hipfftEstimate2d);             \
+  __macro(hipfftEstimate3d);             \
+  __macro(hipfftEstimateMany);           \
+  __macro(hipfftCreate);                 \
+  __macro(hipfftGetSize1d);              \
+  __macro(hipfftGetSizeMany);            \
+  __macro(hipfftGetSize);                \
+  __macro(hipfftSetWorkArea);            \
+  __macro(hipfftSetAutoAllocation);      \
+  __macro(hipfftExecC2C);                \
+  __macro(hipfftExecR2C);                \
+  __macro(hipfftExecC2R);                \
+  __macro(hipfftExecZ2Z);                \
+  __macro(hipfftExecD2Z);                \
+  __macro(hipfftExecZ2D);                \
+  __macro(hipfftSetStream);              \
+  __macro(hipfftDestroy);                \
+  __macro(hipfftGetVersion);             \
+  __macro(hipfftGetProperty);
+
+HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP);
+
+// Translates a hipfftResult_t status code into a descriptive message.
+// Codes without a dedicated entry map to "HIPFFT_STATUS_UNKNOWN_ERROR".
+inline const char *hipfftGetErrorString(hipfftResult_t status) {
+  // Start from the fallback text; a matching case below overrides it.
+  const char *msg = "HIPFFT_STATUS_UNKNOWN_ERROR";
+  switch (status) {
+    case HIPFFT_SUCCESS:
+      msg = "'HIPFFT_SUCCESS'. The hipFFT operation was successful.";
+      break;
+    case HIPFFT_INVALID_PLAN:
+      msg = "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle.";
+      break;
+    case HIPFFT_ALLOC_FAILED:
+      msg = "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU "
+            "memory.";
+      break;
+    case HIPFFT_INVALID_TYPE:
+      msg = "'HIPFFT_INVALID_TYPE'. No longer used.";
+      break;
+    case HIPFFT_INVALID_VALUE:
+      msg = "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or "
+            "parameter.";
+      break;
+    case HIPFFT_INTERNAL_ERROR:
+      msg = "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library "
+            "error.";
+      break;
+    case HIPFFT_EXEC_FAILED:
+      msg = "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU.";
+      break;
+    case HIPFFT_SETUP_FAILED:
+      msg = "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize.";
+      break;
+    case HIPFFT_INVALID_SIZE:
+      msg = "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size.";
+      break;
+    case HIPFFT_UNALIGNED_DATA:
+      msg = "'HIPFFT_UNALIGNED_DATA'. No longer used.";
+      break;
+    case HIPFFT_INCOMPLETE_PARAMETER_LIST:
+      msg = "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call.";
+      break;
+    case HIPFFT_INVALID_DEVICE:
+      msg = "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different "
+            "GPU than plan creation.";
+      break;
+    case HIPFFT_PARSE_ERROR:
+      msg = "'HIPFFT_PARSE_ERROR'. Internal plan database error.";
+      break;
+    case HIPFFT_NO_WORKSPACE:
+      msg = "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to "
+            "plan execution.";
+      break;
+    case HIPFFT_NOT_IMPLEMENTED:
+      msg = "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement "
+            "functionality for parameters given.";
+      break;
+    case HIPFFT_NOT_SUPPORTED:
+      msg = "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for "
+            "parameters given.";
+      break;
+    default:
+      break;
+  }
+  return msg;
+}
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+
+#endif
--- a/paddle/fluid/platform/dynload/mklrt.cc
+++ b/paddle/fluid/platform/dynload/mklrt.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/mklrt.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag mklrt_dso_flag;
+void* mklrt_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+MKLDFTI_ROUTINE_EACH(DEFINE_WRAP);
+
+// Creates a DFTI descriptor through the dynamically loaded MKL routines.
+//
+// Dispatches on precision and rank to the matching
+// DftiCreateDescriptor_{s,d}_{1d,md} routine; any other precision value is
+// forwarded to the generic DftiCreateDescriptor.
+//
+//   desc   - receives the created descriptor handle
+//   prec   - DFTI_SINGLE or DFTI_DOUBLE
+//   domain - forward domain, passed through unchanged
+//   dim    - number of transform dimensions
+//   sizes  - array of `dim` transform lengths; only sizes[0] is read when
+//            dim == 1
+//
+// Returns the status produced by the selected MKL routine.
+DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc,
+                                           enum DFTI_CONFIG_VALUE prec,
+                                           enum DFTI_CONFIG_VALUE domain,
+                                           MKL_LONG dim, MKL_LONG* sizes) {
+  // The 1-D entry points take the length by value, the multi-dimensional
+  // ones take the array of lengths.
+  const bool is_1d = (dim == 1);
+  if (prec == DFTI_SINGLE) {
+    if (is_1d) {
+      return DftiCreateDescriptor_s_1d(desc, domain, sizes[0]);
+    }
+    return DftiCreateDescriptor_s_md(desc, domain, dim, sizes);
+  }
+  if (prec == DFTI_DOUBLE) {
+    if (is_1d) {
+      return DftiCreateDescriptor_d_1d(desc, domain, sizes[0]);
+    }
+    return DftiCreateDescriptor_d_md(desc, domain, dim, sizes);
+  }
+  // Generic fallback: keep the same 1-D / multi-D argument convention instead
+  // of always passing the pointer.
+  if (is_1d) {
+    return DftiCreateDescriptor(desc, prec, domain, dim, sizes[0]);
+  }
+  return DftiCreateDescriptor(desc, prec, domain, dim, sizes);
+}
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/dynload/mklrt.h
+++ b/paddle/fluid/platform/dynload/mklrt.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mkl_dfti.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag mklrt_dso_flag;
+extern void* mklrt_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load mkldfti routine
+ * via operator overloading.
+ *
+ * For a routine `__name` it defines the functor type DynLoad__<__name> and
+ * declares an extern object called `__name` of that type. Calling the object:
+ *   1. fetches the library handle once via GetMKLRTDsoHandle(), guarded by
+ *      mklrt_dso_flag;
+ *   2. resolves the symbol with dlsym() and keeps the pointer in a
+ *      function-local static;
+ *   3. casts the pointer to the routine's real type and forwards the
+ *      arguments (taken by value) to it.
+ * NOTE(review): the dlsym() result is not checked for null before the call.
+ */
+#define DYNAMIC_LOAD_MKLRT_WRAP(__name)                                    \
+  struct DynLoad__##__name {                                               \
+    template <typename... Args>                                            \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
+      using mklrtFunc = decltype(&::__name);                               \
+      std::call_once(mklrt_dso_flag, []() {                                \
+        mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle(); \
+      });                                                                  \
+      static void* p_##__name = dlsym(mklrt_dso_handle, #__name);          \
+      return reinterpret_cast<mklrtFunc>(p_##__name)(args...);             \
+    }                                                                      \
+  };                                                                       \
+  extern DynLoad__##__name __name
+
+// mkl_dfti.h has a macro that shadows the function with the same name;
+// undefine this macro so that the function itself can be wrapped below.
+#undef DftiCreateDescriptor
+
+#define MKLDFTI_ROUTINE_EACH(__macro) \
+  __macro(DftiCreateDescriptor);      \
+  __macro(DftiCreateDescriptor_s_1d); \
+  __macro(DftiCreateDescriptor_d_1d); \
+  __macro(DftiCreateDescriptor_s_md); \
+  __macro(DftiCreateDescriptor_d_md); \
+  __macro(DftiSetValue);              \
+  __macro(DftiGetValue);              \
+  __macro(DftiCommitDescriptor);      \
+  __macro(DftiComputeForward);        \
+  __macro(DftiComputeBackward);       \
+  __macro(DftiFreeDescriptor);        \
+  __macro(DftiErrorClass);            \
+  __macro(DftiErrorMessage);
+
+MKLDFTI_ROUTINE_EACH(DYNAMIC_LOAD_MKLRT_WRAP)
+
+#undef DYNAMIC_LOAD_MKLRT_WRAP
+
+// define another function to avoid naming conflict
+DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc,
+                                           enum DFTI_CONFIG_VALUE prec,
+                                           enum DFTI_CONFIG_VALUE domain,
+                                           MKL_LONG dim, MKL_LONG* sizes);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -31,6 +31,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <cublas_v2.h>
 #include <cudnn.h>
+#include <cufft.h>
 #include <curand.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
@@ -85,6 +86,7 @@ limitations under the License. */
 #endif  // PADDLE_WITH_CUDA

 #ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/hipfft.h"
 #include "paddle/fluid/platform/dynload/hiprand.h"
 #include "paddle/fluid/platform/dynload/miopen.h"
 #include "paddle/fluid/platform/dynload/rocblas.h"
@@ -714,6 +716,7 @@ DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND);
 DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN);
 DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS);
 DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER);
+DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT);

 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL);
@@ -751,6 +754,8 @@ inline const char* GetErrorMsgUrl(T status) {
      return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/"
             "types.html#ncclresult-t";
      break;
+    case platform::proto::ApiType::CUFFT:
+      return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult";
    default:
      return "Unknown type of External API, can't get error message URL!";
      break;
@@ -839,6 +844,7 @@ template std::string GetExternalErrorMsg<curandStatus_t>(curandStatus_t);
 template std::string GetExternalErrorMsg<cudnnStatus_t>(cudnnStatus_t);
 template std::string GetExternalErrorMsg<cublasStatus_t>(cublasStatus_t);
 template std::string GetExternalErrorMsg<cusolverStatus_t>(cusolverStatus_t);
+template std::string GetExternalErrorMsg<cufftResult_t>(cufftResult_t);
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 template std::string GetExternalErrorMsg<ncclResult_t>(ncclResult_t);
 #endif
@@ -899,6 +905,15 @@ inline std::string build_nvidia_error_msg(cusolverStatus_t stat) {
  return sout.str();
 }

+/*************** CUFFT ERROR ***************/
+inline bool is_error(cufftResult_t stat) { return stat != CUFFT_SUCCESS; }
+
+inline std::string build_nvidia_error_msg(cufftResult_t stat) {
+  std::ostringstream sout;
+  sout << "CUFFT error(" << stat << "). " << GetExternalErrorMsg(stat);
+  return sout.str();
+}
+
 /**************** NCCL ERROR ****************/
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 inline bool is_error(ncclResult_t nccl_result) {
@@ -1099,6 +1114,14 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
 }
 #endif  // not(__APPLE__) and PADDLE_WITH_NCCL

+/***** HIPFFT ERROR *****/
+inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; }
+
+inline std::string build_rocm_error_msg(hipfftResult_t stat) {
+  std::string msg(" HIPFFT error, ");
+  return msg + platform::dynload::hipfftGetErrorString(stat) + " ";
+}
+
 namespace details {

 template <typename T>
@@ -1115,6 +1138,7 @@ DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess);
 DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS);
 DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess);
 DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success);
+DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS);

 #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
 DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess);

--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -9,10 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/platform/enforce.h"
+
 #include <list>

 #include "gtest/gtest.h"
-#include "paddle/fluid/platform/enforce.h"

 TEST(ENFORCE, OK) {
  PADDLE_ENFORCE(true, paddle::platform::errors::Unavailable(
@@ -330,6 +331,10 @@ TEST(enforce, hip_success) {
      CheckCudaStatusFailure(rocblas_status_invalid_handle, "Rocblas error"));
  EXPECT_TRUE(
      CheckCudaStatusFailure(rocblas_status_invalid_value, "Rocblas error"));
+  EXPECT_TRUE(CheckCudaStatusSuccess(HIPFFT_SUCCESS));
+  EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_INVALID_PLAN, "HIPFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_ALLOC_FAILED, "HIPFFT error"));
+
 #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
  EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error"));
@@ -418,6 +423,25 @@ TEST(enforce, cuda_success) {
      "negative vector size, for example).To correct: ensure that all the "
      "parameters being passed have valid values"));

+  EXPECT_TRUE(CheckCudaStatusSuccess(CUFFT_SUCCESS));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_PLAN, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_ALLOC_FAILED, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_TYPE, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_VALUE, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INTERNAL_ERROR, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_EXEC_FAILED, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_SETUP_FAILED, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_SIZE, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_UNALIGNED_DATA, "CUFFT error"));
+  EXPECT_TRUE(
+      CheckCudaStatusFailure(CUFFT_INCOMPLETE_PARAMETER_LIST, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_DEVICE, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_PARSE_ERROR, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NO_WORKSPACE, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_IMPLEMENTED, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_LICENSE_ERROR, "CUFFT error"));
+  EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_SUPPORTED, "CUFFT error"));
+
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
  EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
  EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error"));

--- a/paddle/fluid/platform/external_error.proto
+++ b/paddle/fluid/platform/external_error.proto
@@ -24,6 +24,7 @@ enum ApiType {
  CUBLAS = 3;
  CUSOLVER = 4;
  NCCL = 5;
+  CUFFT = 6;
 }

 message MessageDesc {

--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -64,7 +64,6 @@ import paddle.reader  # noqa: F401
 import paddle.static  # noqa: F401
 import paddle.vision  # noqa: F401

-from .tensor import fft
 from .tensor.random import bernoulli  # noqa: F401

 from .tensor.attribute import rank  # noqa: F401
@@ -297,6 +296,8 @@ from .hapi import summary  # noqa: F401
 from .hapi import flops  # noqa: F401
 from . import hub  # noqa: F401
 from . import linalg  # noqa: F401
+from . import fft  # noqa: F401
+from . import signal  # noqa: F401

 import paddle.text  # noqa: F401
 import paddle.vision  # noqa: F401

--- a/python/paddle/tensor/fft.py
+++ b/python/paddle/tensor/fft.py
@@ -15,30 +15,30 @@
 from typing import Sequence
 import numpy as np
 import paddle
-from .attribute import is_complex, is_floating_point, is_interger, _real_to_complex_dtype, _complex_to_real_dtype
-from ..fluid.framework import in_dygraph_mode
-from .. import _C_ops
-from ..fluid.data_feeder import check_variable_and_dtype
-from ..fluid.layer_helper import LayerHelper
+from .tensor.attribute import is_complex, is_floating_point, is_interger, _real_to_complex_dtype, _complex_to_real_dtype
+from .fluid.framework import in_dygraph_mode
+from . import _C_ops
+from .fluid.data_feeder import check_variable_and_dtype
+from .fluid.layer_helper import LayerHelper

 __all__ = [
    'fft',
-    'fft2',
-    'fftn',
    'ifft',
-    'ifft2',
-    'ifftn',
    'rfft',
-    'rfft2',
-    'rfftn',
    'irfft',
-    'irfft2',
-    'irfftn',
    'hfft',
-    'hfft2',
-    'hfftn',
    'ihfft',
+    'fft2',
+    'ifft2',
+    'rfft2',
+    'irfft2',
+    'hfft2',
    'ihfft2',
+    'fftn',
+    'ifftn',
+    'rfftn',
+    'irfftn',
+    'hfftn',
    'ihfftn',
    'fftfreq',
    'rfftfreq',
@@ -362,7 +362,7 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None):
            xp = paddle.to_tensor(x)
            irfft_xp = paddle.fft.irfft(xp).numpy()
            print(irfft_xp)
-            #  [0. 0. 0. 4.]
+            #  [0. 1. 0. 0.]

    """
    return fft_c2r(x, n, axis, norm, forward=False, name=name)
@@ -500,7 +500,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None):
            import numpy as np
            import paddle

-            x = x = np.mgrid[:4, :4, :4][1]
+            x = np.mgrid[:4, :4, :4][1]
            xp = paddle.to_tensor(x)
            fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy()
            print(fftn_xp)
@@ -654,9 +654,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None):
        # use axes(2, 0)
        print(paddle.fft.rfftn(x, axes=(2, 0)))
        # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True,
-        #        [[[(24+0j), 0j     , 0j     ],
-        #          [0j     , 0j     , 0j     ],
-        #          [0j     , 0j     , 0j     ]],
+        #        [[[(8+0j), 0j     , 0j     ],
+        #          [(8+0j), 0j     , 0j     ],
+        #          [(8+0j), 0j     , 0j     ]],
        #
        #         [[0j     , 0j     , 0j     ],
        #          [0j     , 0j     , 0j     ],
@@ -1135,7 +1135,24 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
            refer to :ref:`api_guide_Name` . 

    Returns:
-        out(Tensor) : The result of the inverse real 2-D FFT.
+        out(Tensor) : The result of the inverse hermitian 2-D FFT.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x = np.mgrid[:5, :5][0].astype(np.float64)
+            xp = paddle.to_tensor(x)
+            ihfft2_xp = paddle.fft.ihfft2(xp).numpy()
+            print(ihfft2_xp)
+            # [[ 2. +0.j          0. +0.j          0. +0.j        ]
+            #  [-0.5-0.68819096j  0. +0.j          0. +0.j        ]
+            #  [-0.5-0.16245985j  0. +0.j          0. +0.j        ]
+            #  [-0.5+0.16245985j  0. +0.j          0. +0.j        ]
+            #  [-0.5+0.68819096j  0. +0.j          0. +0.j        ]]
    """
    _check_at_least_ndim(x, 2)
    if s is not None:
@@ -1273,9 +1290,8 @@ def fftshift(x, axes=None, name=None):
            import paddle

            x = np.array([3, 1, 2, 2, 3], dtype=float)
-            scalar_temp = 0.3
            n = x.size
-            fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp)
+            fftfreq_xp = paddle.fft.fftfreq(n, d=0.3)
            res = paddle.fft.fftshift(fftfreq_xp).numpy()
            print(res)
            #  [-1.3333334 -0.6666667  0.         0.6666667  1.3333334]
@@ -1284,13 +1300,13 @@ def fftshift(x, axes=None, name=None):
    shape = paddle.shape(x)
    if axes is None:
        # shift all axes
-        rank = paddle.rank(x).reshape([1])
-        axes = axes or paddle.arange(0, rank)
-        shifts = [size // 2 for size in shape]
+        rank = len(x.shape)
+        axes = list(range(0, rank))
+        shifts = shape // 2
    elif isinstance(axes, int):
        shifts = shape[axes] // 2
    else:
-        shifts = [shape[ax] // 2 for ax in axes]
+        shifts = paddle.concat([shape[ax] // 2 for ax in axes])
    return paddle.roll(x, shifts, axes, name=name)


@@ -1317,9 +1333,8 @@ def ifftshift(x, axes=None, name=None):
            import paddle

            x = np.array([3, 1, 2, 2, 3], dtype=float)
-            scalar_temp = 0.3
            n = x.size
-            fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp)
+            fftfreq_xp = paddle.fft.fftfreq(n, d=0.3)
            res = paddle.fft.ifftshift(fftfreq_xp).numpy()
            print(res)
            #  [ 1.3333334 -1.3333334 -0.6666667  0.         0.6666667]
@@ -1328,13 +1343,13 @@ def ifftshift(x, axes=None, name=None):
    shape = paddle.shape(x)
    if axes is None:
        # shift all axes
-        rank = paddle.rank(x).reshape([1])
-        axes = axes or paddle.arange(0, rank)
-        shifts = [-size // 2 for size in shape]
+        rank = len(x.shape)
+        axes = list(range(0, rank))
+        shifts = -(shape // 2)  # inverse shift: roll back by n // 2 (differs from fftshift for odd n)
    elif isinstance(axes, int):
        shifts = -shape[axes] // 2
    else:
-        shifts = [-shape[ax] // 2 for ax in axes]
+        shifts = paddle.concat([-(shape[ax] // 2) for ax in axes])
    return paddle.roll(x, shifts, axes, name=name)


@@ -1346,7 +1361,7 @@ def fft_c2c(x, n, axis, norm, forward, name):
        x = paddle.cast(x, _real_to_complex_dtype(x.dtype))
    _check_normalization(norm)

-    axis = axis or -1
+    axis = axis if axis is not None else -1
    _check_fft_axis(x, axis)
    axes = [axis]
    axes = _normalize_axes(x, axes)
@@ -1376,7 +1391,7 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name):
    if is_interger(x):
        x = paddle.cast(x, paddle.get_default_dtype())
    _check_normalization(norm)
-    axis = axis or -1
+    axis = axis if axis is not None else -1
    _check_fft_axis(x, axis)
    axes = [axis]
    axes = _normalize_axes(x, axes)
@@ -1415,7 +1430,7 @@ def fft_c2r(x, n, axis, norm, forward, name):
    elif is_floating_point(x):
        x = paddle.cast(x, _real_to_complex_dtype(x.dtype))
    _check_normalization(norm)
-    axis = axis or -1
+    axis = axis if axis is not None else -1
    _check_fft_axis(x, axis)
    axes = [axis]
    axes = _normalize_axes(x, axes)

--- a/python/paddle/fluid/tests/unittests/fft/test_fft.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py
@@ -1009,10 +1009,11 @@ class TestRfftFreq(unittest.TestCase):


 @place(DEVICES)
-@parameterize((TEST_CASE_NAME, 'x', 'axes', 'dtype'), [
-    ('test_1d', np.random.randn(10), (0, ), 'float64'),
-    ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
-])
+@parameterize(
+    (TEST_CASE_NAME, 'x', 'axes', 'dtype'),
+    [('test_1d', np.random.randn(10), (0, ), 'float64'),
+     ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
+     ('test_2d_with_all_axes', np.random.randn(10, 10), None, 'float64')])
 class TestFftShift(unittest.TestCase):
    def test_fftshift(self):
        """Test fftshift with norm condition
@@ -1030,6 +1031,7 @@ class TestFftShift(unittest.TestCase):
 @parameterize((TEST_CASE_NAME, 'x', 'axes'), [
    ('test_1d', np.random.randn(10), (0, ), 'float64'),
    ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
+    ('test_2d_with_all_axes', np.random.randn(10, 10), None, 'float64'),
 ])
 class TestIfftShift(unittest.TestCase):
    def test_ifftshift(self):

--- a/python/paddle/fluid/tests/unittests/test_roll_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roll_op.py
@@ -122,6 +122,34 @@ class TestRollAPI(unittest.TestCase):

        self.assertRaises(ValueError, test_axis_out_range)

+    def test_shifts_as_tensor_dygraph(self):
+        with fluid.dygraph.guard():
+            x = paddle.arange(9).reshape([3, 3])
+            shape = paddle.shape(x)
+            shifts = shape // 2  # [3, 3] // 2 -> [1, 1], handed to roll as a Tensor
+            axes = [0, 1]
+            out = paddle.roll(x, shifts=shifts, axis=axes).numpy()
+            expected_out = np.array([[8, 6, 7], [2, 0, 1], [5, 3, 4]])  # x rolled by 1 along both axes
+            self.assertTrue(np.allclose(out, expected_out))
+
+    def test_shifts_as_tensor_static(self):
+        with program_guard(Program(), Program()):
+            x = paddle.arange(9).reshape([3, 3]).astype('float32')
+            shape = paddle.shape(x)
+            shifts = shape // 2  # [3, 3] // 2 -> [1, 1], fed to the op as a Tensor input
+            axes = [0, 1]
+            out = paddle.roll(x, shifts=shifts, axis=axes)
+            expected_out = np.array([[8, 6, 7], [2, 0, 1], [5, 3, 4]])  # x rolled by 1 along both axes
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            [out_np] = exe.run(fetch_list=[out])
+            self.assertTrue(np.allclose(out_np, expected_out))
+
+            if paddle.is_compiled_with_cuda():
+                exe = fluid.Executor(fluid.CUDAPlace(0))  # run on the GPU as well, not the CPU a second time
+                [out_np] = exe.run(fetch_list=[out])
+                self.assertTrue(np.allclose(out_np, expected_out))
+

 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_signal.py
+++ b/python/paddle/fluid/tests/unittests/test_signal.py
@@ -652,7 +652,7 @@ class TestFrame(unittest.TestCase):
        self.assertTrue(
            np.allclose(
                frame_for_api_test(self.x, self.frame_length, self.hop_length, self.axis),
-                paddle.tensor.signal.frame(
+                paddle.signal.frame(
                    paddle.to_tensor(self.x),
                    self.frame_length,
                    self.hop_length,
@@ -678,7 +678,7 @@ class TestFrameStatic(unittest.TestCase):
        mp, sp = paddle.static.Program(), paddle.static.Program()
        with paddle.static.program_guard(mp, sp):
            input = paddle.static.data('input', self.x.shape, dtype=self.x.dtype)
-            output = paddle.tensor.signal.frame(
+            output = paddle.signal.frame(
                     input,
                     self.frame_length,
                     self.hop_length,
@@ -708,7 +708,7 @@ class TestFrameStatic(unittest.TestCase):
 class TestFrameException(unittest.TestCase):
    def test_frame(self):
        with self.assertRaises(self.expect_exception):
-            paddle.tensor.signal.frame(
+            paddle.signal.frame(
                paddle.to_tensor(self.x),
                self.frame_length,
                self.hop_length,
@@ -731,7 +731,7 @@ class TestOverlapAdd(unittest.TestCase):
        self.assertTrue(
            np.allclose(
                overlap_add_for_api_test(self.x, self.hop_length, self.axis),
-                paddle.tensor.signal.overlap_add(
+                paddle.signal.overlap_add(
                    paddle.to_tensor(self.x),
                    self.hop_length,
                    self.axis),
@@ -756,7 +756,7 @@ class TestOverlapAddStatic(unittest.TestCase):
        mp, sp = paddle.static.Program(), paddle.static.Program()
        with paddle.static.program_guard(mp, sp):
            input = paddle.static.data('input', self.x.shape, dtype=self.x.dtype)
-            output = paddle.tensor.signal.overlap_add(
+            output = paddle.signal.overlap_add(
                     input,
                     self.hop_length,
                     self.axis),
@@ -783,7 +783,7 @@ class TestOverlapAddStatic(unittest.TestCase):
 class TestOverlapAddException(unittest.TestCase):
    def test_overlap_add(self):
        with self.assertRaises(self.expect_exception):
-            paddle.tensor.signal.overlap_add(
+            paddle.signal.overlap_add(
                paddle.to_tensor(self.x),
                self.hop_length,
                self.axis)
@@ -848,7 +848,7 @@ class TestStft(unittest.TestCase):
        self.assertTrue(
            np.allclose(
                stft(self.x, self.n_fft, self.hop_length, self.win_length, win_l, self.center, self.pad_mode),
-                paddle.tensor.signal.stft(
+                paddle.signal.stft(
                    paddle.to_tensor(self.x),
                    self.n_fft,
                    self.hop_length,
@@ -891,7 +891,7 @@ class TestStftException(unittest.TestCase):
            win_p = paddle.to_tensor(self.window)

        with self.assertRaises(self.expect_exception):
-            paddle.tensor.signal.stft(
+            paddle.signal.stft(
                paddle.to_tensor(self.x),
                self.n_fft,
                self.hop_length,
@@ -934,7 +934,7 @@ class TestIstft(unittest.TestCase):
        self.assertTrue(
            np.allclose(
                istft(self.x, self.hop_length, self.win_length, win_l, self.center, self.length),
-                paddle.tensor.signal.istft(
+                paddle.signal.istft(
                    paddle.to_tensor(self.x),
                    self.n_fft,
                    self.hop_length,
@@ -986,7 +986,7 @@ class TestIstftException(unittest.TestCase):
            win_p = paddle.to_tensor(self.window)

        with self.assertRaises(self.expect_exception):
-            paddle.tensor.signal.istft(
+            paddle.signal.istft(
                paddle.to_tensor(self.x),
                self.n_fft,
                self.hop_length,

--- a/python/paddle/tensor/signal.py
+++ b/python/paddle/tensor/signal.py
@@ -16,16 +16,14 @@ from typing import Optional

 import paddle

-from .attribute import is_complex, is_floating_point
+from .tensor.attribute import is_complex, is_floating_point
 from .fft import fft_r2c, fft_c2r, fft_c2c
-from ..fluid.data_feeder import check_variable_and_dtype
-from ..fluid.framework import in_dygraph_mode
-from ..fluid.layer_helper import LayerHelper
-from .. import _C_ops
+from .fluid.data_feeder import check_variable_and_dtype
+from .fluid.framework import in_dygraph_mode
+from .fluid.layer_helper import LayerHelper
+from . import _C_ops

 __all__ = [
-    'frame',
-    'overlap_add',
    'stft',
    'istft',
 ]
@@ -56,7 +54,7 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
    .. code-block:: python

        import paddle
-        from paddle.tensor.signal import frame
+        from paddle.signal import frame
        
        # 1D
        x = paddle.arange(8)
@@ -177,7 +175,7 @@ def overlap_add(x, hop_length, axis=-1, name=None):
    .. code-block:: python

        import paddle
-        from paddle.tensor.signal import overlap_add
+        from paddle.signal import overlap_add
        
        # 2D
        x0 = paddle.arange(16).reshape([8, 2])
@@ -291,11 +289,11 @@ def stft(x,
            real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`(
            `onesided` is `False`)
    
-    Exampels:
+    Examples:
        .. code-block:: python
    
            import paddle
-            from paddle.tensor.signal import stft
+            from paddle.signal import stft
    
            # real-valued input
            x = paddle.randn([8, 48000], dtype=paddle.float64)
@@ -415,7 +413,7 @@ def istft(x,
    - :math:`N`: Value of `n_fft`.
    - :math:`H`: Value of `hop_length`.

-    Result of `istft` expected to be the inverse of `paddle.tensor.signal.stft`, but it is
+    Result of `istft` expected to be the inverse of `paddle.signal.stft`, but it is
        not guaranteed to reconstruct a exactly realizible time-domain signal from a STFT
        complex tensor which has been modified (via masking or otherwise). Therefore, `istft`
        gives the [Griffin-Lim optimal estimate](https://ieeexplore.ieee.org/document/1164317)
@@ -454,12 +452,12 @@ def istft(x,
        A tensor of least squares estimation of the reconstructed signal(s) with shape
            `[..., seq_length]`

-    Exampels:
+    Examples:
        .. code-block:: python

            import numpy as np
            import paddle
-            from paddle.tensor.signal import stft, istft
+            from paddle.signal import stft, istft

            paddle.seed(0)


--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -222,8 +222,6 @@ from .array import array_write  # noqa: F401
 from .array import create_array  # noqa: F401

 from .einsum import einsum  # noqa: F401
-from . import fft
-from . import signal

 #this list used in math_op_patch.py for _binary_creator_
 tensor_method_func  = [ #noqa

--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -696,15 +696,24 @@ def roll(x, shifts, axis=None, name=None):

    helper = LayerHelper("roll", **locals())
    check_type(axis, 'axis', (list, tuple), 'roll')
-    check_type(shifts, 'shifts', (list, tuple), 'roll')
+
    out = helper.create_variable_for_type_inference(x.dtype)

-    helper.append_op(
-        type='roll',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={'axis': axis,
-               'shifts': shifts})
+    if isinstance(shifts, Variable):
+        helper.append_op(
+            type='roll',
+            inputs={'X': x,
+                    "ShiftsTensor": shifts},
+            outputs={'Out': out},
+            attrs={'axis': axis})
+    else:
+        check_type(shifts, 'shifts', (list, tuple), 'roll')
+        helper.append_op(
+            type='roll',
+            inputs={'X': x},
+            outputs={'Out': out},
+            attrs={'axis': axis,
+                   'shifts': shifts})
    return out



--- a/tools/externalError/README.md
+++ b/tools/externalError/README.md
-Usage:
+#### **Instructions for crawling new error messages:**

-Please run:
-```
-bash start.sh
-```

-If you want to update all external error message, you need to run command `bash start.sh` in current directory, 
-and upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz
+
+1. Add new spider code to spider.py to crawl the error messages from the website.
+
+2. Run `bash start.sh` in the current directory to generate a new `externalErrorMsg_${date}.tar.gz` file, for example `externalErrorMsg_20210928.tar.gz`.
+
+3. Upload the tar file to the **paddlepaddledeps** bucket on BOS (https://paddlepaddledeps.bj.bcebos.com) and copy its download link `${download_url}`. **Be careful not to delete the original tar file.**
+
+4. Compute the MD5 value `${md5}` of the tar file, and update the cmake/third_party.cmake file:
+
+   ```
+   set(URL  "${download_url}" CACHE STRING "" FORCE)
+   file_download_and_uncompress(${URL} "externalError" MD5 ${md5})   
+   ```
+
+   for example:
+
+   ```
+   set(URL  "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE)
+   file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa)
+   ```
+
+5. Commit your changes and create a pull request.
--- a/tools/externalError/spider.py
+++ b/tools/externalError/spider.py
@@ -17,8 +17,10 @@ import re
 import urllib.request
 import json
 import collections
-import sys, getopt
+import sys
+import getopt
 import external_error_pb2
+from html.parser import HTMLParser


 def parsing(externalErrorDesc):
@@ -335,6 +337,31 @@ def parsing(externalErrorDesc):
        _Messages.message = "'%s'. %s" % (error[0], m_message)
    print("End crawling errorMessage for nvidia NCCL API!\n")

+    #*************************************************************************************************#
+    #*********************************** CUFFT Error Message **************************************#
+    print("start crawling errorMessage for nvidia CUFFT API--->")
+    url = 'https://docs.nvidia.com/cuda/cufft/index.html#cufftresult'
+
+    allMessageDesc = externalErrorDesc.errors.add()
+    allMessageDesc.type = external_error_pb2.CUFFT
+
+    html = urllib.request.urlopen(url).read().decode('utf-8')
+
+    class CUFFTHTMLParser(HTMLParser):
+        '''Collects cufftResult_t enum entries from the fed HTML into allMessageDesc.
+        '''
+
+        def handle_data(self, data):
+            if 'typedef enum cufftResult_t' in data:  # only the text node holding the enum definition
+                for line in data.strip().splitlines()[1:-1]:  # drop first/last line (presumably the typedef header and closing brace)
+                    status, code, desc = re.split('=|//', line.strip())  # expects "NAME = code, // description"
+                    _Messages = allMessageDesc.messages.add()
+                    _Messages.code = int(code.strip(' ,'))
+                    _Messages.message = "'%s'. %s" % (status.strip(),
+                                                      desc.strip())
+
+    CUFFTHTMLParser().feed(html)
+

 def main(argv):
    try:

--- a/tools/externalError/start.sh
+++ b/tools/externalError/start.sh
@@ -32,4 +32,4 @@ fi
 protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto

 python3.7 spider.py
-tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb
+tar czvf externalErrorMsg_$(date +'%Y%m%d').tar.gz externalErrorMsg.pb