diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 893f7d51140a70550b18ac440ff81266c47e0f39..a25c54bae2a436415bc23d5e661f200f21280e7f 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -104,7 +104,7 @@ endif()
 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta)
 
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
-    recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
+    recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
 target_link_libraries(run_program_op cuda_graph_with_memory_pool)
@@ -129,22 +129,6 @@ else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
 
-if (WITH_GPU OR WITH_ROCM)
-  if (MKL_FOUND AND WITH_ONEMKL)
-    op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS})
-    target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
-  else()
-    op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS})
-  endif()
-else()
-  if (MKL_FOUND AND WITH_ONEMKL)
-    op_library(spectral_op SRCS spectral_op.cc DEPS dynload_mklrt ${OP_HEADER_DEPS})
-    target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
-  else()
-    op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS})
-  endif()
-endif()
-
 if (WITH_ASCEND_CL)
   op_library(sync_batch_norm_op)
 endif()
diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h
deleted file mode 100644
index f69573e18927e1623e3e80e0167e104b3be176b4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/spectral_helper.h
+++ /dev/null
@@ -1,545 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#pragma once - -#include "paddle/fluid/operators/spectral_op.h" - -#if defined(PADDLE_WITH_ONEMKL) -#include "paddle/phi/backends/dynload/mklrt.h" -#elif defined(PADDLE_WITH_POCKETFFT) -#include "extern_pocketfft/pocketfft_hdronly.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// FFT Functors -#if defined(PADDLE_WITH_ONEMKL) - -#define MKL_DFTI_CHECK(expr) \ - do { \ - MKL_LONG status = (expr); \ - if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW( \ - platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ - } while (0); - -struct DftiDescriptorDeleter { - void operator()(DFTI_DESCRIPTOR_HANDLE handle) { - if (handle != nullptr) { - MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); - } - } -}; - -// A RAII wrapper for MKL_DESCRIPTOR* -class DftiDescriptor { - public: - void init(DFTI_CONFIG_VALUE precision, - DFTI_CONFIG_VALUE signal_type, - MKL_LONG signal_ndim, - MKL_LONG* sizes) { - PADDLE_ENFORCE_EQ(desc_.get(), - nullptr, - platform::errors::AlreadyExists( - "DftiDescriptor has already been initialized.")); - - DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( - &raw_desc, precision, signal_type, signal_ndim, sizes)); - desc_.reset(raw_desc); - } - - DFTI_DESCRIPTOR* get() const { - DFTI_DESCRIPTOR* raw_desc = desc_.get(); - PADDLE_ENFORCE_NOT_NULL(raw_desc, - platform::errors::PreconditionNotMet( - "DFTI DESCRIPTOR has not been initialized.")); - return raw_desc; - } - - private: - std::unique_ptr desc_; -}; - -static DftiDescriptor _plan_mkl_fft( - const framework::proto::VarType::Type& in_dtype, - const framework::proto::VarType::Type& out_dtype, - const framework::DDim& in_strides, - const framework::DDim& out_strides, - const std::vector& signal_sizes, - FFTNormMode normalization, - bool forward) { - const DFTI_CONFIG_VALUE precision = [&] { - switch (in_dtype) { - case framework::proto::VarType::FP32: - return DFTI_SINGLE; - case framework::proto::VarType::COMPLEX64: - return DFTI_SINGLE; - case framework::proto::VarType::FP64: - return DFTI_DOUBLE; - case framework::proto::VarType::COMPLEX128: - return DFTI_DOUBLE; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input datatype (%s), input data type should be FP32, " - "FP64, COMPLEX64 or COMPLEX128.", - framework::DataTypeToString(in_dtype))); - } - }(); - - // C2C, R2C, C2R - const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); - const DFTI_CONFIG_VALUE domain = - (fft_type == FFTTransformType::C2C) ? 
DFTI_COMPLEX : DFTI_REAL; - - DftiDescriptor descriptor; - std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); - const MKL_LONG signal_ndim = fft_sizes.size() - 1; - descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); - - // placement inplace or not inplace - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); - - // number of transformations - const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); - - // input & output distance - const MKL_LONG idist = in_strides[0]; - const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); - - // input & output stride - std::vector mkl_in_stride(1 + signal_ndim, 0); - std::vector mkl_out_stride(1 + signal_ndim, 0); - for (MKL_LONG i = 1; i <= signal_ndim; i++) { - mkl_in_stride[i] = in_strides[i]; - mkl_out_stride[i] = out_strides[i]; - } - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); - - // conjugate even storage - if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); - } - - MKL_LONG signal_numel = std::accumulate(fft_sizes.cbegin() + 1, - fft_sizes.cend(), - 1UL, - std::multiplies()); - if (normalization != FFTNormMode::none) { - const double scale = - ((normalization == FFTNormMode::by_sqrt_n) - ? 
1.0 / std::sqrt(static_cast(signal_numel)) - : 1.0 / static_cast(signal_numel)); - const auto scale_direction = [&]() { - if (fft_type == FFTTransformType::R2C || - (fft_type == FFTTransformType::C2C && forward)) { - return DFTI_FORWARD_SCALE; - } else { - // (fft_type == FFTTransformType::C2R || - // (fft_type == FFTTransformType::C2C && !forward)) - return DFTI_BACKWARD_SCALE; - } - }(); - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); - } - - // commit the descriptor - MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); - return descriptor; -} - -// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) -template -void exec_fft(const DeviceContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - const framework::DDim& in_sizes = x->dims(); - const int ndim = in_sizes.size(); - const int signal_ndim = axes.size(); - const int batch_ndim = ndim - signal_ndim; - const framework::DDim& out_sizes = out->dims(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), 0); - std::vector is_transformed_dim(ndim, false); - for (const auto& d : axes) { - is_transformed_dim[d] = true; - } - const auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { - return !is_transformed_dim[axis]; - }); - std::copy(axes.cbegin(), axes.cend(), batch_end); - - // transpose input according to that permutation - framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute); - std::vector transposed_input_shape_ = - phi::vectorize(transposed_input_shape); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - const auto place = ctx.GetPlace(); - transposed_input.mutable_data(place); - TransCompute( - ndim, ctx, *x, &transposed_input, dim_permute); - - // make an collapsed input: collapse batch axes for input - const int batch_size = - std::accumulate(transposed_input_shape.Get(), - transposed_input_shape.Get() + batch_ndim, - 1L, - std::multiplies()); - std::vector collapsed_input_shape_(1 + signal_ndim); - collapsed_input_shape_[0] = batch_size; - std::copy(transposed_input_shape_.begin() + batch_ndim, - transposed_input_shape_.end(), - collapsed_input_shape_.begin() + 1); - const framework::DDim collapsed_input_shape = - phi::make_ddim(collapsed_input_shape_); - transposed_input.Resize(collapsed_input_shape); - framework::Tensor& collapsed_input = transposed_input; - - // make a collapsed output - std::vector collapsed_output_shape_(1 + signal_ndim); - collapsed_output_shape_[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - collapsed_output_shape_[1 + i] = out_sizes[axes[i]]; - } - const framework::DDim collapsed_output_shape = - phi::make_ddim(collapsed_output_shape_); - framework::Tensor collapsed_output; - collapsed_output.Resize(collapsed_output_shape); - collapsed_output.mutable_data(place, out->type()); - - // signal sizes - std::vector signal_sizes(1 + signal_ndim); - signal_sizes[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - signal_sizes[1 + i] = - std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); - } - - // input & output stride - const framework::DDim input_stride = phi::stride(collapsed_input_shape); - const framework::DDim output_stride = phi::stride(collapsed_output_shape); - - // make a DFTI_DESCRIPTOR - DftiDescriptor desc = - 
_plan_mkl_fft(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->dtype()), - input_stride, - output_stride, - signal_sizes, - normalization, - forward); - - const FFTTransformType fft_type = - GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->type())); - if (fft_type == FFTTransformType::C2R && forward) { - framework::Tensor collapsed_input_conj(collapsed_input.dtype()); - collapsed_input_conj.mutable_data(collapsed_input.dims(), - ctx.GetPlace()); - // conjugate the input - platform::ForRange for_range(ctx, collapsed_input.numel()); - phi::funcs::ConjFunctor functor(collapsed_input.data(), - collapsed_input.numel(), - collapsed_input_conj.data()); - for_range(functor); - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input_conj.data(), collapsed_output.data())); - } else if (fft_type == FFTTransformType::R2C && !forward) { - framework::Tensor collapsed_output_conj(collapsed_output.dtype()); - collapsed_output_conj.mutable_data(collapsed_output.dims(), - ctx.GetPlace()); - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output_conj.data())); - // conjugate the output - platform::ForRange for_range(ctx, collapsed_output.numel()); - phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), - collapsed_output.numel(), - collapsed_output.data()); - for_range(functor); - } else { - if (forward) { - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } else { - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } - } - - // resize for the collapsed output - framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute); - collapsed_output.Resize(transposed_output_shape); - framework::Tensor& transposed_output = collapsed_output; - - // reverse the transposition - std::vector reverse_dim_permute(ndim); - for (int i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - TransCompute( - ndim, ctx, transposed_output, out, reverse_dim_permute); -} - -template -struct FFTC2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - exec_fft( - ctx, x, out, axes, normalization, forward); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - exec_fft( - ctx, x, out, axes, normalization, forward); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - if (axes.size() > 1) { - const std::vector c2c_dims(axes.begin(), axes.end() - 1); - Tensor temp; - temp.mutable_data(x->dims(), ctx.GetPlace()); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); - - const std::vector new_axes{axes.back()}; - exec_fft( - ctx, &temp, out, new_axes, normalization, forward); - } else { - exec_fft( - ctx, x, out, axes, normalization, forward); - } - } -}; -#elif defined(PADDLE_WITH_POCKETFFT) - -template -T compute_factor(int64_t size, FFTNormMode normalization) { - constexpr auto one = static_cast(1); - switch (normalization) { - case FFTNormMode::none: - return one; - case 
FFTNormMode::by_n: - return one / static_cast(size); - case FFTNormMode::by_sqrt_n: - return one / std::sqrt(static_cast(size)); - } - PADDLE_THROW( - platform::errors::InvalidArgument("Unsupported normalization type")); -} - -template -struct FFTC2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = typename Ti::value_type; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2c(in_sizes, - in_strides, - in_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = Ti; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(R); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(out_strides.begin(), - out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = x->data(); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::r2c(in_sizes, - in_strides, - out_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = To; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); - { - const int64_t data_size = sizeof(R); 
- std::transform(out_strides.begin(), - out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = out->data(); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= out_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2r(out_sizes, - in_strides, - out_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc deleted file mode 100644 index 91e3880dff0049b324bed7188a90962224a4e25b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/spectral_op.cc +++ /dev/null @@ -1,389 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/spectral_op.h" - -#include "paddle/fluid/operators/spectral_helper.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// FFTC2C -class FFTC2COpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_c2c op."); - AddOutput("Out", "(Tensor), the output tensor of fft_c2c op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddComment(R"DOC( - Compute complex to complex FFT. 
- )DOC"); - } -}; - -class FFTC2COp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_c2c"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2c"); - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size(); i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - ctx->ShareDim("X", /*->*/ "Out"); // only for c2c - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -template -class FFTC2CGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_c2c_grad"); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTC2CGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_c2c_grad"); - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_c2c_grad"); - - ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -// FFTR2C -class FFTR2COpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_r2c op."); - AddOutput("Out", "(Tensor), the output tensor of fft_r2c op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddAttr("onesided", "bool, perform onesided fft."); - AddComment(R"DOC( - Compute real to complex FFT. 
- )DOC"); - } -}; - -class FFTR2COp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_r2c"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_r2c"); - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size() - 1L; i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - - const bool onesided = ctx->Attrs().Get("onesided"); - if (!onesided) { - ctx->ShareDim("X", /*->*/ "Out"); - } else { - framework::DDim out_dim(ctx->GetInputDim("X")); - const int64_t last_fft_axis = axes.back(); - const int64_t last_fft_dim_size = out_dim.at(last_fft_axis); - out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; - ctx->SetOutputDim("Out", out_dim); - } - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(in_dtype, ctx.GetPlace()); - } -}; - -template -class FFTR2CGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_r2c_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTR2CGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_r2c_grad"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_r2c_grad"); - - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_r2c_grad"); - - ctx->ShareDim("X", /*->*/ x_grad_name); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -// FFTC2R -class FFTC2ROpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_c2r op."); - AddOutput("Out", "(Tensor), the output tensor of fft_c2r op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddAttr( - "last_dim_size", - "int", - "Length of the transformed " - "axis of the output. For n output points, last_dim_size//2 + 1 input" - " points are necessary. If the input is longer than this," - " it is cropped. If it is shorter than this, it is padded" - " with zeros. 
If last_dim_size is not given, it is taken to be 2*(m-1)" - " where m is the length of the input along the axis " - "specified by axis.") - .SetDefault(0L); - AddComment(R"DOC( - Compute complex to complex FFT. - )DOC"); - } -}; - -class FFTC2ROp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_c2r"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2r"); - - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size() - 1L; i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - - const int64_t last_dim_size = ctx->Attrs().Get("last_dim_size"); - framework::DDim out_dim(ctx->GetInputDim("X")); - const int64_t last_fft_axis = axes.back(); - if (last_dim_size == 0) { - const int64_t last_fft_dim_size = out_dim.at(last_fft_axis); - const int64_t fft_n_point = (last_fft_dim_size - 1) * 2; - PADDLE_ENFORCE_GT(fft_n_point, - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", fft_n_point)); - out_dim.at(last_fft_axis) = fft_n_point; - } else { - PADDLE_ENFORCE_GT(last_dim_size, - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", last_dim_size)); - out_dim.at(last_fft_axis) = last_dim_size; - } - ctx->SetOutputDim("Out", out_dim); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -template -class FFTC2RGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_c2r_grad"); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTC2RGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_c2r_grad"); - - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_c2r_grad"); - - const auto axes = ctx->Attrs().Get>("axes"); - - const auto out_grad_dim = ctx->GetInputDim(out_grad_name); - framework::DDim x_grad_dim(out_grad_dim); - const int64_t last_fft_axis = axes.back(); - const int64_t last_fft_dim_size = x_grad_dim.at(last_fft_axis); - x_grad_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; - ctx->SetOutputDim(x_grad_name, x_grad_dim); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return framework::OpKernelType(in_dtype, ctx.GetPlace()); - } -}; - -// common functions -FFTNormMode get_norm_from_string(const 
std::string& norm, bool forward) { - if (norm.empty() || norm == "backward") { - return forward ? FFTNormMode::none : FFTNormMode::by_n; - } - - if (norm == "forward") { - return forward ? FFTNormMode::by_n : FFTNormMode::none; - } - - if (norm == "ortho") { - return FFTNormMode::by_sqrt_n; - } - - PADDLE_THROW(platform::errors::InvalidArgument( - "FFT norm string must be 'forward' or 'backward' or 'ortho', " - "received %s", - norm)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(fft_c2c, - ops::FFTC2COp, - ops::FFTC2COpMaker, - ops::FFTC2CGradOpMaker, - ops::FFTC2CGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); - -REGISTER_OPERATOR(fft_c2c_grad, ops::FFTC2CGradOp); -REGISTER_OP_CPU_KERNEL(fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); - -REGISTER_OPERATOR(fft_r2c, - ops::FFTR2COp, - ops::FFTR2COpMaker, - ops::FFTR2CGradOpMaker, - ops::FFTR2CGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); - -REGISTER_OPERATOR(fft_r2c_grad, ops::FFTR2CGradOp); -REGISTER_OP_CPU_KERNEL(fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); - -REGISTER_OPERATOR(fft_c2r, - ops::FFTC2ROp, - ops::FFTC2ROpMaker, - ops::FFTC2RGradOpMaker, - ops::FFTC2RGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); - -REGISTER_OPERATOR(fft_c2r_grad, ops::FFTC2RGradOp); -REGISTER_OP_CPU_KERNEL(fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu deleted file mode 100644 index 661fcc83771f54fd3290499584d5896e76ba8c5b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/spectral_op.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/operators/spectral_op.h" -#include "paddle/fluid/operators/spectral_op.cu.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); - -REGISTER_OP_CUDA_KERNEL(fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); - -REGISTER_OP_CUDA_KERNEL(fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h deleted file mode 100644 index 5ced67691ee0764d38a5457ea261e1f44b48c830..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/spectral_op.cu.h +++ /dev/null @@ -1,1018 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/spectral_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/hipfft.h" -#endif - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/dynload/cufft.h" -#endif - -namespace paddle { -namespace operators { - -using ScalarType = framework::proto::VarType::Type; -const int64_t kMaxFFTNdim = 3; -const int64_t kMaxDataNdim = kMaxFFTNdim + 1; -// This struct is used to easily compute hashes of the -// parameters. It will be the **key** to the plan cache. -struct FFTConfigKey { - // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3 - int64_t signal_ndim_; - // These include additional batch dimension as well. - int64_t sizes_[kMaxDataNdim]; - int64_t input_shape_[kMaxDataNdim]; - int64_t output_shape_[kMaxDataNdim]; - FFTTransformType fft_type_; - ScalarType value_type_; - - FFTConfigKey() = default; - - FFTConfigKey(const std::vector& in_shape, - const std::vector& out_shape, - const std::vector& signal_size, - FFTTransformType fft_type, - ScalarType value_type) { - // Padding bits must be zeroed for hashing - memset(this, 0, sizeof(*this)); - signal_ndim_ = signal_size.size() - 1; - fft_type_ = fft_type; - value_type_ = value_type; - - std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); - std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); - std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); - } -}; - -#if defined(PADDLE_WITH_CUDA) -// An RAII encapsulation of cuFFTHandle -class CuFFTHandle { - ::cufftHandle handle_; - - public: - CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); - } - - CuFFTHandle(const CuFFTHandle& other) = delete; - CuFFTHandle& operator=(const CuFFTHandle& other) = delete; - - CuFFTHandle(CuFFTHandle&& other) = delete; - CuFFTHandle& operator=(CuFFTHandle&& other) = delete; - - ::cufftHandle& get() { return handle_; } - const ::cufftHandle& get() const { return handle_; } - - ~CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); - } -}; - -using plan_size_type = long long int; // NOLINT -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
- explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, - plan_key.fft_type_, - plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, - const int64_t signal_ndim, - FFTTransformType fft_type, - ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, - sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, - sizes.size() - 1)); - - cudaDataType itype, otype, exec_type; - const auto complex_input = has_complex_input(fft_type); - const auto complex_output = has_complex_output(fft_type); - if (dtype == framework::proto::VarType::FP32) { - itype = complex_input ? CUDA_C_32F : CUDA_R_32F; - otype = complex_output ? CUDA_C_32F : CUDA_R_32F; - exec_type = CUDA_C_32F; - } else if (dtype == framework::proto::VarType::FP64) { - itype = complex_input ? CUDA_C_64F : CUDA_R_64F; - otype = complex_output ? CUDA_C_64F : CUDA_R_64F; - exec_type = CUDA_C_64F; - } else if (dtype == framework::proto::VarType::FP16) { - itype = complex_input ? CUDA_C_16F : CUDA_R_16F; - otype = complex_output ? CUDA_C_16F : CUDA_R_16F; - exec_type = CUDA_C_16F; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "cuFFT only support transforms of type float16, float32 and " - "float64")); - } - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cufftXtMakePlanMany(plan(), - signal_ndim, - signal_sizes.data(), - /* inembed */ nullptr, - /* base_istride */ 1, - /* idist */ 1, - itype, - /* onembed */ nullptr, - /* base_ostride */ 1, - /* odist */ 1, - otype, - batch, - &ws_size_t, - exec_type)); - - ws_size = ws_size_t; - } - - FFTConfig(const FFTConfig& other) = delete; - FFTConfig& operator=(const FFTConfig& other) = delete; - - FFTConfig(FFTConfig&& other) = delete; - FFTConfig& operator=(FFTConfig&& other) = delete; - - const cufftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } - - private: - CuFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; - -#elif defined(PADDLE_WITH_HIP) -// An RAII encapsulation of cuFFTHandle -class HIPFFTHandle { - ::hipfftHandle handle_; - - public: - HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); - } - - HIPFFTHandle(const HIPFFTHandle& other) = delete; - HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; - - HIPFFTHandle(HIPFFTHandle&& other) = delete; - HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; - - ::hipfftHandle& get() { return handle_; } - const ::hipfftHandle& get() const { return handle_; } - - ~HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); - } -}; -using plan_size_type = int; -// 
This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. - explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, - plan_key.fft_type_, - plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, - const int64_t signal_ndim, - FFTTransformType fft_type, - ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, - sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, - sizes.size() - 1)); - - hipfftType exec_type = [&] { - if (dtype == framework::proto::VarType::FP32) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_C2C; - case FFTTransformType::R2C: - return HIPFFT_R2C; - case FFTTransformType::C2R: - return HIPFFT_C2R; - } - } else if (dtype == framework::proto::VarType::FP64) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_Z2Z; - case FFTTransformType::R2C: - return HIPFFT_D2Z; - case FFTTransformType::C2R: - return HIPFFT_Z2D; - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); - }(); - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::hipfftMakePlanMany(plan(), - signal_ndim, - signal_sizes.data(), - /* inembed */ nullptr, - /* base_istride */ 1, - /* idist */ 1, - /* onembed */ nullptr, - /* base_ostride */ 1, - /* odist */ 1, - exec_type, - batch, - &ws_size_t)); - - ws_size = ws_size_t; - } - - const hipfftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } - - private: - HIPFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; -#endif - -// Hashing machinery for Key -// Fowler–Noll–Vo hash function -// see -// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function -template -struct KeyHash { - // Key must be a POD because we read out its memory - // contenst as char* when hashing - static_assert(std::is_pod::value, "Key must be plain old data type"); - - size_t operator()(const Key& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (int i = 0; i < static_cast(sizeof(Key)); ++i) { - value ^= ptr[i]; - value *= 0x01000193; - } - return static_cast(value); - } -}; - -template -struct KeyEqual { - // Key must be a POD because we read out its memory - // contenst as char* when comparing - static_assert(std::is_pod::value, "Key must be plain 
old data type"); - - bool operator()(const Key& a, const Key& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(Key)) == 0; - } -}; - -#if CUDA_VERSION < 10000 -// Note that the max plan number for CUDA version < 10 has to be 1023 -// due to a bug that fails on the 1024th plan -constexpr size_t CUFFT_MAX_PLAN_NUM = 1023; -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; -#else -constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); -// The default max cache size chosen for CUDA version > 10 is arbitrary. -// This number puts a limit on how big of a plan cache should we maintain by -// default. Users can always configure it via cufft_set_plan_cache_max_size. -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; -#endif -static_assert(CUFFT_MAX_PLAN_NUM >= 0 && - CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), - "CUFFT_MAX_PLAN_NUM not in size_t range"); -static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && - CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, - "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); - -// This cache assumes that the mapping from key to value never changes. -// This is **NOT** thread-safe. Please use a mutex when using it **AND** the -// value returned from try_emplace_value. -// The contract of using this cache is that try_emplace_value should only be -// used when the max_size is positive. -class FFTConfigCache { - public: - using kv_t = typename std::pair; - using map_t = - typename std::unordered_map, - typename std::list::iterator, - KeyHash, - KeyEqual>; - using map_kkv_iter_t = typename map_t::iterator; - - FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} - - explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } - - FFTConfigCache(const FFTConfigCache& other) = delete; - FFTConfigCache& operator=(const FFTConfigCache& other) = delete; - - FFTConfigCache(FFTConfigCache&& other) noexcept - : _usage_list(std::move(other._usage_list)), - _cache_map(std::move(other._cache_map)), - _max_size(other._max_size) {} - - FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { - _usage_list = std::move(other._usage_list); - _cache_map = std::move(other._cache_map); - _max_size = other._max_size; - return *this; - } - - // If key is in this cache, return the cached config. Otherwise, emplace the - // config in this cache and return it. 
- FFTConfig& lookup(FFTConfigKey params) { - PADDLE_ENFORCE_GT(_max_size, - 0, - platform::errors::InvalidArgument( - "The max size of FFTConfigCache must be great than 0," - "But received is [%d]", - _max_size)); - - map_kkv_iter_t map_it = _cache_map.find(params); - // Hit, put to list front - if (map_it != _cache_map.end()) { - _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); - return map_it->second->second; - } - - // Miss - // remove if needed - if (_usage_list.size() >= _max_size) { - auto last = _usage_list.end(); - last--; - _cache_map.erase(last->first); - _usage_list.pop_back(); - } - - // construct new plan at list front, then insert into _cache_map - _usage_list.emplace_front(std::piecewise_construct, - std::forward_as_tuple(params), - std::forward_as_tuple(params)); - auto kv_it = _usage_list.begin(); - _cache_map.emplace(std::piecewise_construct, - std::forward_as_tuple(kv_it->first), - std::forward_as_tuple(kv_it)); - return kv_it->second; - } - - void clear() { - _cache_map.clear(); - _usage_list.clear(); - } - - void resize(int64_t new_size) { - _set_max_size(new_size); - auto cur_size = _usage_list.size(); - if (cur_size > _max_size) { - auto delete_it = _usage_list.end(); - for (size_t i = 0; i < cur_size - _max_size; i++) { - delete_it--; - _cache_map.erase(delete_it->first); - } - _usage_list.erase(delete_it, _usage_list.end()); - } - } - - size_t size() const { return _cache_map.size(); } - - size_t max_size() const noexcept { return _max_size; } - - std::mutex mutex; - - private: - // Only sets size and does value check. Does not resize the data structures. - void _set_max_size(int64_t new_size) { - // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since - // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check - // first. - PADDLE_ENFORCE_GE( - new_size, - 0, - platform::errors::InvalidArgument( - "cuFFT plan cache size must be non-negative, But received is [%d]", - new_size)); - PADDLE_ENFORCE_LE(new_size, - CUFFT_MAX_PLAN_NUM, - platform::errors::InvalidArgument( - "cuFFT plan cache size can not be larger than [%d], " - "But received is [%d]", - CUFFT_MAX_PLAN_NUM, - new_size)); - _max_size = static_cast(new_size); - } - - std::list _usage_list; - map_t _cache_map; - size_t _max_size; -}; - -static std::vector> plan_caches; -static std::mutex plan_caches_mutex; - -static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { - std::lock_guard guard(plan_caches_mutex); - - if (device_index >= plan_caches.size()) { - plan_caches.resize(device_index + 1); - } - - if (!plan_caches[device_index]) { - plan_caches[device_index] = std::make_unique(); - } - - return *plan_caches[device_index]; -} - -// Calculates the normalization constant -static double fft_normalization_scale(FFTNormMode normalization, - const std::vector& sizes, - const std::vector& dims) { - // auto norm = static_cast(normalization); - if (normalization == FFTNormMode::none) { - return static_cast(1.0); - } - - int64_t signal_numel = 1; - for (auto dim : dims) { - signal_numel *= sizes[dim]; - } - const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) - ? 
std::sqrt(signal_numel) - : static_cast(signal_numel); - return static_cast(1.0 / scale_denom); -} - -template -void exec_normalization(const DeviceContext& ctx, - const Tensor* in, - Tensor* out, - FFTNormMode normalization, - const std::vector& sizes, - const std::vector& axes) { - double scale = fft_normalization_scale(normalization, sizes, axes); - if (scale != 1.0) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto dev = ctx.eigen_device(); - EigenScale::Eval(*dev, - eigen_out, - eigen_in, - static_cast(scale), - static_cast(0), - false); - } else { - framework::TensorCopy(*in, ctx.GetPlace(), out); - } -} - -#if defined(PADDLE_WITH_CUDA) -static FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.dtype())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), - phi::vectorize(output.dims()), - signal_size, - fft_type, - value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_cufft_plan_raw(const FFTConfig& config, - void* in_data, - void* out_data, - bool forward) { - auto& plan = config.plan(); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( - plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -} - -template -void exec_cufft_plan(const DeviceContext& ctx, - const FFTConfig& config, - framework::Tensor* input, - framework::Tensor* output, - bool forward) { - // execute transform plan - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor( - input->data(), input->numel(), input_conj.data()); - for_range(functor); - exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor( - out_conj.data(), output->numel(), output->data()); - for_range(functor); - } else { - exec_cufft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#elif defined(PADDLE_WITH_HIP) - -static FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.type())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), - phi::vectorize(output.dims()), - signal_size, - fft_type, - value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_hipfft_plan_raw(const FFTConfig& config, - void* in_data, - void* out_data, - bool forward) { - auto& plan = config.plan(); - - auto value_type = config.data_type(); - if (value_type == framework::proto::VarType::FP32) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( - plan, - static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - } - } else if (value_type == framework::proto::VarType::FP64) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( - plan, - static_cast(in_data), - static_cast(out_data), - forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); -} - -template -void exec_hipfft_plan(const DeviceContext& ctx, - const FFTConfig& config, - framework::Tensor* input, - framework::Tensor* output, - bool forward) { - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor( - input->data(), input->numel(), input_conj.data()); - for_range(functor); - exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor( - out_conj.data(), output->numel(), output->data()); - for_range(functor); - } else { - exec_hipfft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#endif - -// Execute a general unnormalized fft operation (can be c2c, onesided r2c or -// onesided c2r) -template -void exec_fft(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& dim, - bool forward) { - const auto x_dims = phi::vectorize(X->dims()); - const int64_t ndim = static_cast(X->dims().size()); - auto tensor_place = ctx.GetPlace(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), int{0}); - std::vector is_transformed_dim(ndim); - for (const auto& d : dim) { - is_transformed_dim[d] = true; - } - auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), [&](int64_t d) { - return !is_transformed_dim[d]; - }); - std::sort(dim_permute.begin(), batch_end); - std::copy(dim.cbegin(), dim.cend(), batch_end); - - // transpose input according to dim permutation - auto transposed_input_shape = X->dims().transpose(dim_permute); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - transposed_input.mutable_data(tensor_place); - TransCompute( - ndim, ctx, *X, &transposed_input, dim_permute); - - // Reshape batch dimensions into a single dimension - const int64_t signal_ndim = static_cast(dim.size()); - std::vector collapsed_input_shape(signal_ndim + 1); - - auto transposed_input_shape_ = phi::vectorize(transposed_input_shape); - const int64_t batch_dims = ndim - signal_ndim; - auto batch_size = - std::accumulate(transposed_input_shape_.begin(), - transposed_input_shape_.begin() + batch_dims, - static_cast(1), - std::multiplies()); - collapsed_input_shape[0] = batch_size; - - std::copy(transposed_input_shape_.begin() + batch_dims, - transposed_input_shape_.end(), - collapsed_input_shape.begin() + 1); - - framework::Tensor& collapsed_input = transposed_input; - collapsed_input.Resize(phi::make_ddim(collapsed_input_shape)); - - // 
make a collpased output - const auto out_dims = phi::vectorize(out->dims()); - std::vector collapsed_output_shape(1 + signal_ndim); - collapsed_output_shape[0] = batch_size; - for (size_t i = 0; i < dim.size(); ++i) { - collapsed_output_shape[i + 1] = out_dims[dim[i]]; - } - framework::Tensor collapsed_output; - collapsed_output.Resize(phi::make_ddim(collapsed_output_shape)); - collapsed_output.mutable_data(tensor_place); - - FFTConfig* config = nullptr; - -#if defined(PADDLE_WITH_CUDA) - std::unique_ptr config_ = nullptr; - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - bool using_cache = false; -#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200) - using_cache = true; -#endif - - if (using_cache) { - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - } else { - config_ = std::make_unique(key); - config = config_.get(); - } - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cufftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_cufft_plan( - ctx, *config, &collapsed_input, &collapsed_output, forward); - -#elif defined(PADDLE_WITH_HIP) - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_hipfft_plan( - ctx, *config, &collapsed_input, &collapsed_output, forward); -#endif - - // Inverting output by reshape and transpose to original batch and dimension - auto transposed_out_shape = out->dims().transpose(dim_permute); - - collapsed_output.Resize(transposed_out_shape); - auto& transposed_output = collapsed_output; - - std::vector reverse_dim_permute(ndim); - for (size_t i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - - TransCompute( - ndim, ctx, transposed_output, out, reverse_dim_permute); -} - -// Use the optimized path to perform single R2C or C2R if transformation dim is -// supported by cuFFT -static bool use_optimized_fft_path(const std::vector& axes) { - // For performance reason, when axes starts with (0, 1), do not use the - // optimized path. 
- if (axes.size() > kMaxFFTNdim || - (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { - return false; - } else { - return true; - } -} - -template -struct FFTC2CFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - if (axes.empty()) { - framework::TensorCopy(*X, ctx.GetPlace(), out); - return; - } - - framework::Tensor* p_out = out; - std::vector out_dims = phi::vectorize(X->dims()); - std::vector working_axes(axes.begin(), axes.end()); - std::vector first_dims; - size_t max_dims; - framework::Tensor working_tensor; - working_tensor.mutable_data(X->dims(), ctx.GetPlace()); - framework::Tensor* p_working_tensor = &working_tensor; - framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor); - - while (true) { - max_dims = - std::min(static_cast(kMaxFFTNdim), working_axes.size()); - first_dims.assign(working_axes.end() - max_dims, working_axes.end()); - - exec_fft( - ctx, p_working_tensor, p_out, first_dims, forward); - working_axes.resize(working_axes.size() - max_dims); - first_dims.clear(); - - if (working_axes.empty()) { - break; - } - - std::swap(p_out, p_working_tensor); - } - exec_normalization( - ctx, p_out, out, normalization, out_dims, axes); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - std::vector in_dims = phi::vectorize(X->dims()); - std::vector out_dims = phi::vectorize(out->dims()); - - if (use_optimized_fft_path(axes)) { - framework::Tensor x_copy(X->type()); - x_copy.mutable_data(X->dims(), ctx.GetPlace()); - framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); - exec_fft(ctx, &x_copy, out, axes, forward); - } else { - framework::Tensor temp_tensor; - temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); - const std::vector dims(axes.begin(), axes.end() - 1); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); - - exec_fft( - ctx, &temp_tensor, out, {axes.back()}, forward); - } - exec_normalization( - ctx, out, out, normalization, out_dims, axes); - } -}; - -// n dimension real to complex FFT use cufft lib -template -struct FFTR2CFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - // Step1: R2C transform on the last dimension - framework::Tensor* r2c_out = out; - const std::vector last_dim{axes.back()}; - std::vector out_dims = phi::vectorize(out->dims()); - exec_fft(ctx, X, r2c_out, last_dim, forward); - - // Step2: C2C transform on the remaining dimension - framework::Tensor c2c_out; - if (axes.size() > 1) { - c2c_out.mutable_data(out->dims(), ctx.GetPlace()); - std::vector remain_dim(axes.begin(), axes.end() - 1); - FFTC2CFunctor fft_c2c_func; - fft_c2c_func( - ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, forward); - } - - const auto in_sizes = phi::vectorize(X->dims()); - framework::Tensor* norm_tensor = axes.size() > 1 ? 
&c2c_out : r2c_out; - exec_normalization( - ctx, norm_tensor, out, normalization, in_sizes, axes); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h deleted file mode 100644 index 9296f997584dd7895d8f706421f3a829a53d02bb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/spectral_op.h +++ /dev/null @@ -1,507 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#define NOMINMAX // to use std::min std::max correctly on windows -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/padding.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -enum class FFTNormMode : int64_t { - none, // No normalization - by_sqrt_n, // Divide by sqrt(signal_size) - by_n, // Divide by signal_size -}; - -FFTNormMode get_norm_from_string(const std::string& norm, bool forward); - -// Enum representing the FFT type -enum class FFTTransformType : int64_t { - C2C = 0, // Complex-to-complex - R2C, // Real-to-complex - C2R, // Complex-to-real -}; - -// Create transform type enum from bools representing if input and output are -// complex -inline FFTTransformType GetFFTTransformType( - framework::proto::VarType::Type input_dtype, - framework::proto::VarType::Type output_dtype) { - auto complex_input = framework::IsComplexType(input_dtype); - auto complex_output = framework::IsComplexType(output_dtype); - if (complex_input && complex_output) { - return FFTTransformType::C2C; - } else if (complex_input && !complex_output) { - return FFTTransformType::C2R; - } else if (!complex_input && complex_output) { - return FFTTransformType::R2C; - } - PADDLE_THROW( - platform::errors::InvalidArgument("Real to real FFTs are not supported")); -} - -// Returns true if the transform type has complex input -inline bool has_complex_input(FFTTransformType type) { - switch (type) { - case FFTTransformType::C2C: - case FFTTransformType::C2R: - return true; - - case FFTTransformType::R2C: - return false; - } - PADDLE_THROW(platform::errors::InvalidArgument("Unknown FFTTransformType")); -} - -// Returns true if the transform type has complex output -inline bool has_complex_output(FFTTransformType type) { - switch (type) { - case FFTTransformType::C2C: - 
case FFTTransformType::R2C: - return true; - - case FFTTransformType::C2R: - return false; - } - PADDLE_THROW(platform::errors::InvalidArgument("Unknown FFTTransformType")); -} - -template -struct FFTFillConjGradFunctor { - T* input_; - const size_t axis_; - const int64_t* strides_; - const size_t double_length_; - - FFTFillConjGradFunctor(T* input, - size_t axis, - const int64_t* strides, - size_t double_length) - : input_(input), - axis_(axis), - strides_(strides), - double_length_(double_length) {} - - HOSTDEVICE void operator()(size_t index) { - size_t offtset = index; // back - size_t index_i; - for (size_t i = 0; i <= axis_; i++) { - index_i = offtset / strides_[i]; - offtset %= strides_[i]; - } - - if ((0 < index_i) && (index_i < double_length_ + 1)) { - input_[index] *= static_cast(2); - } - } -}; - -template -struct FFTC2CFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -template -struct FFTR2CFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -template -struct FFTC2RFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -// Giving a linear destination index and strides of tensor, get_idx return the -// corresponding linear position of source tensor. -// The linear index is the position of flatten tensor. -// Giving a linear destination index and strides of tensor, get_idx return the -// corresponding linear position of source tensor. -// The linear index is the position of flatten tensor. -HOSTDEVICE inline int64_t get_src_idx(const int64_t dst_idx, - const int64_t* dst_strides, - const int64_t* dst_shape, - const int64_t* src_strides, - const bool* is_fft_axis, - const bool conj, - const int64_t rank) { - int64_t src_idx = 0; - int64_t quotient = dst_idx; - int64_t remainder = 0; - - for (int64_t i = 0; i < rank; i++) { - remainder = quotient % dst_strides[i]; - quotient = quotient / dst_strides[i]; - if (conj && is_fft_axis[i]) { - src_idx += ((dst_shape[i] - quotient) % dst_shape[i]) * src_strides[i]; - } else { - src_idx += src_strides[i] * quotient; - } - quotient = remainder; - } - - return src_idx; -} - -HOSTDEVICE inline bool is_conj_part(const int64_t dst_idx, - const int64_t* dst_strides, - const int64_t last_axis, - const int64_t last_axis_size) { - int64_t quotient = dst_idx; - int64_t remainder = 0; - - for (int64_t i = 0; i < last_axis + 1; i++) { - remainder = quotient % dst_strides[i]; - quotient = quotient / dst_strides[i]; - - if ((i == last_axis) && (quotient > last_axis_size - 1)) { - return true; - } - - quotient = remainder; - } - - return false; -} - -// FFTFillConjFunctor fill the destination tensor with source tensor and -// conjugate symmetry element of source tensor . 
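For context, the conjugate fill described in the comment above relies on the Hermitian symmetry of a real signal's spectrum: X[k] == conj(X[n - k]) for k = 1..n-1, so a two-sided output can be rebuilt from the one-sided half of n/2 + 1 bins. Below is a minimal 1-D sketch of that reconstruction; it is a standalone illustration, not part of the deleted functor, which does the same thing element-wise on n-d tensors via strides.

#include <complex>
#include <cstdint>
#include <vector>

// Rebuild a full length-n spectrum from its one-sided half (n/2 + 1 bins),
// mirroring what FFTFillConjFunctor does per destination element.
std::vector<std::complex<double>> fill_conj_1d(
    const std::vector<std::complex<double>>& half, int64_t n) {
  std::vector<std::complex<double>> full(n);
  for (int64_t k = 0; k < n; ++k) {
    if (k < static_cast<int64_t>(half.size())) {
      full[k] = half[k];                 // copied (non-conjugate) part
    } else {
      full[k] = std::conj(half[n - k]);  // conjugate-symmetric part
    }
  }
  return full;
}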
-// Use framework::ForRange to iterate destination element with -// supporting different device -template -struct FFTFillConjFunctor { - FFTFillConjFunctor(const C* src_data, - C* dst_data, - const int64_t* src_strides, - const int64_t* dst_strides, - const int64_t* dst_shape, - const bool* is_fft_axis, - const int64_t last_axis, - const int64_t last_axis_size, - const int64_t rank) - : src_data_(src_data), - dst_data_(dst_data), - src_strides_(src_strides), - dst_strides_(dst_strides), - dst_shape_(dst_shape), - is_fft_axis_(is_fft_axis), - last_axis_(last_axis), - last_axis_size_(last_axis_size), - rank_(rank) {} - HOSTDEVICE void operator()(int64_t dst_idx) { - if (is_conj_part(dst_idx, dst_strides_, last_axis_, last_axis_size_)) { - const auto conj_idx = get_src_idx(dst_idx, - dst_strides_, - dst_shape_, - src_strides_, - is_fft_axis_, - true, - rank_); - auto src_value = src_data_[conj_idx]; - auto conj_value = C(src_value.real, -src_value.imag); - dst_data_[dst_idx] = conj_value; - } else { - const auto copy_idx = get_src_idx(dst_idx, - dst_strides_, - dst_shape_, - src_strides_, - is_fft_axis_, - false, - rank_); - dst_data_[dst_idx] = src_data_[copy_idx]; - } - } - - const C* src_data_; - C* dst_data_; - const int64_t* src_strides_; - const int64_t* dst_strides_; - const int64_t* dst_shape_; - const bool* is_fft_axis_; - const int64_t last_axis_; - const int64_t last_axis_size_; - const int64_t rank_; -}; - -template -void fill_conj(const DeviceContext& ctx, - const Tensor* src, - Tensor* dst, - const std::vector& axes) { - std::vector src_strides_v = - phi::vectorize(phi::stride(src->dims())); - std::vector dst_strides_v = - phi::vectorize(phi::stride(dst->dims())); - std::vector dst_shape_v = phi::vectorize(dst->dims()); - const auto src_data = src->data(); - auto dst_data = dst->data(); - const auto last_axis = axes.back(); - const auto last_axis_size = dst->dims().at(last_axis) / 2 + 1; - const int64_t rank = dst->dims().size(); - auto _is_fft_axis = std::make_unique(rank); - for (const auto i : axes) { - _is_fft_axis[i] = true; - } - -#if defined(__NVCC__) || defined(__HIPCC__) - const thrust::device_vector src_strides_g(src_strides_v); - const auto src_strides = thrust::raw_pointer_cast(src_strides_g.data()); - const thrust::device_vector dst_strides_g(dst_strides_v); - const auto dst_strides = thrust::raw_pointer_cast(dst_strides_g.data()); - const thrust::device_vector dst_shape_g(dst_shape_v); - const auto dst_shape = thrust::raw_pointer_cast(dst_shape_g.data()); - const thrust::device_vector is_fft_axis_g(_is_fft_axis.get(), - _is_fft_axis.get() + rank); - const auto p_is_fft_axis = thrust::raw_pointer_cast(is_fft_axis_g.data()); -#else - const auto src_strides = src_strides_v.data(); - const auto dst_strides = dst_strides_v.data(); - const auto dst_shape = dst_shape_v.data(); - const auto p_is_fft_axis = _is_fft_axis.get(); -#endif - platform::ForRange for_range(ctx, dst->numel()); - FFTFillConjFunctor fill_conj_functor(src_data, - dst_data, - src_strides, - dst_strides, - dst_shape, - p_is_fft_axis, - last_axis, - last_axis_size, - rank); - for_range(fill_conj_functor); -} - -template -class FFTC2CKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* x = ctx.Input("X"); - auto* 
y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2CFunctor fft_c2c_func; - fft_c2c_func(dev_ctx, x, y, axes, normalization, forward); - } -}; - -template -class FFTC2CGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - dx->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2CFunctor fft_c2c_func; - fft_c2c_func(dev_ctx, dy, dx, axes, normalization, !forward); - } -}; - -template -class FFTR2CKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const bool onesided = ctx.Attr("onesided"); - const auto* x = ctx.Input("X"); - auto* y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTR2CFunctor fft_r2c_func; - - if (onesided) { - fft_r2c_func(dev_ctx, x, y, axes, normalization, forward); - } else { - framework::DDim onesided_dims(y->dims()); - const int64_t onesided_last_axis_size = y->dims().at(axes.back()) / 2 + 1; - onesided_dims.at(axes.back()) = onesided_last_axis_size; - framework::Tensor onesided_out; - onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); - fft_r2c_func(dev_ctx, x, &onesided_out, axes, normalization, forward); - fill_conj(dev_ctx, &onesided_out, y, axes); - } - } -}; - -template -class FFTR2CGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - const auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const bool onesided = ctx.Attr("onesided"); - - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - framework::Tensor complex_dx; - complex_dx.mutable_data(dx->dims(), ctx.GetPlace()); - - auto normalization = get_norm_from_string(norm_str, forward); - FFTC2CFunctor fft_c2c_func; - - if (!onesided) { - fft_c2c_func(dev_ctx, dy, &complex_dx, axes, normalization, !forward); - } else { - framework::Tensor full_dy; - full_dy.mutable_data(dx->dims(), ctx.GetPlace()); - auto zero_length = static_cast(full_dy.dims().at(axes.back()) - - dy->dims().at(axes.back())); - auto rank = dy->dims().size(); - - std::vector pads(rank * 2, 0); - pads[axes.back() * 2 + 1] = zero_length; - - phi::funcs::PaddingFunctor( - rank, - ctx.template device_context(), - pads, - static_cast(0), - *dy, - &full_dy); - fft_c2c_func( - dev_ctx, &full_dy, &complex_dx, axes, normalization, !forward); - } - framework::TransComplexToReal( - framework::TransToProtoVarType(dx->dtype()), - framework::TransToProtoVarType(complex_dx.dtype()), - complex_dx, - dx); - } -}; - -template -class 
FFTC2RKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* x = ctx.Input("X"); - auto* y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2RFunctor fft_c2r_func; - fft_c2r_func(dev_ctx, x, y, axes, normalization, forward); - } -}; - -template -class FFTC2RGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - C* pdx = dx->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTR2CFunctor fft_r2c_func; - fft_r2c_func(dev_ctx, dy, dx, axes, normalization, !forward); - - const int64_t double_length = - dy->dims()[axes.back()] - dx->dims()[axes.back()]; - const framework::DDim strides = phi::stride(dx->dims()); - -#if defined(__NVCC__) || defined(__HIPCC__) - const thrust::device_vector strides_g(phi::vectorize(strides)); - const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data()); -#else - const int64_t* pstrides = strides.Get(); -#endif - - FFTFillConjGradFunctor func(pdx, axes.back(), pstrides, double_length); - size_t limit = dx->numel(); - platform::ForRange for_range(dev_ctx, limit); - for_range(func); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index d708abe3d0f8c16414572933e0f3b3b052e159c2..986911a1391456b270ba0ab1ce5b1b93010a14d9 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -14,8 +14,6 @@ #include "paddle/fluid/operators/stft_op.h" -#include "paddle/fluid/operators/spectral_helper.h" - namespace paddle { namespace operators { class StftOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/stft_op.cu b/paddle/fluid/operators/stft_op.cu index 9edee0f66c51428ab2481e132338a24fbed916f8..bd48112f0737eca993c482c4f413dcb9ba7058ea 100644 --- a/paddle/fluid/operators/stft_op.cu +++ b/paddle/fluid/operators/stft_op.cu @@ -13,7 +13,6 @@ // limitations under the License. 
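As a side note on the FFTC2RGradKernel shown above: the gradient of a one-sided C2R transform is computed as an R2C transform of the output gradient, after which the interior frequency bins are scaled by 2 (FFTFillConjGradFunctor), because each of them stands in for two conjugate bins of the full two-sided spectrum. A minimal 1-D sketch of that doubling step, assuming the R2C result is already available; `double_length` follows the kernel above (full length minus one-sided length):

#include <complex>
#include <cstdint>
#include <vector>

// Scale the interior bins of a one-sided gradient spectrum by 2; bin 0 (DC)
// and, for even-length signals, the Nyquist bin are left untouched.
void double_interior_bins(std::vector<std::complex<double>>* grad,
                          int64_t double_length) {
  const int64_t size = static_cast<int64_t>(grad->size());
  for (int64_t k = 1; k <= double_length && k < size; ++k) {
    (*grad)[k] *= 2.0;
  }
}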
#include "paddle/fluid/operators/stft_op.h" -#include "paddle/fluid/operators/spectral_op.cu.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h index bbd9b137699dc7612d45d478cd48e8a81f6b99f2..fb2ca31608cd70f6a2036de2db6e7a67ae60751a 100644 --- a/paddle/fluid/operators/stft_op.h +++ b/paddle/fluid/operators/stft_op.h @@ -18,8 +18,11 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/spectral_op.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" #include "paddle/phi/kernels/funcs/frame_functor.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -76,25 +79,25 @@ class StftKernel : public framework::OpKernel { ctx, &frames, window, axes.back(), MulFunctor(), &frames_w); // FFTR2C - FFTNormMode normalization; + phi::funcs::FFTNormMode normalization; if (normalized) { - normalization = get_norm_from_string("ortho", true); + normalization = phi::funcs::get_norm_from_string("ortho", true); } else { - normalization = get_norm_from_string("backward", true); + normalization = phi::funcs::get_norm_from_string("backward", true); } - FFTR2CFunctor fft_r2c_func; + phi::funcs::FFTR2CFunctor fft_r2c_func; if (onesided) { - fft_r2c_func(dev_ctx, &frames_w, out, axes, normalization, true); + fft_r2c_func(dev_ctx, frames_w, out, axes, normalization, true); } else { framework::DDim onesided_dims(out->dims()); const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1; onesided_dims.at(axes.back()) = onesided_axis_size; Tensor onesided_out; onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); - fft_r2c_func( - dev_ctx, &frames_w, &onesided_out, axes, normalization, true); - fill_conj(dev_ctx, &onesided_out, out, axes); + fft_r2c_func(dev_ctx, frames_w, &onesided_out, axes, normalization, true); + phi::funcs::FFTFillConj( + dev_ctx, &onesided_out, out, axes); } } }; @@ -131,17 +134,17 @@ class StftGradKernel : public framework::OpKernel { complex_d_frames_w.mutable_data(d_frames_dims, ctx.GetPlace()); // dy -> d_frames_w - FFTNormMode normalization; + phi::funcs::FFTNormMode normalization; if (normalized) { - normalization = get_norm_from_string("ortho", true); + normalization = phi::funcs::get_norm_from_string("ortho", true); } else { - normalization = get_norm_from_string("backward", true); + normalization = phi::funcs::get_norm_from_string("backward", true); } - FFTC2CFunctor fft_c2c_func; + phi::funcs::FFTC2CFunctor fft_c2c_func; if (!onesided) { fft_c2c_func( - dev_ctx, dy, &complex_d_frames_w, axes, normalization, false); + dev_ctx, *dy, &complex_d_frames_w, axes, normalization, false); } else { Tensor full_dy; full_dy.mutable_data(d_frames_dims, ctx.GetPlace()); @@ -153,20 +156,11 @@ class StftGradKernel : public framework::OpKernel { pads[axes.back() * 2 + 1] = zero_length; phi::funcs::PaddingFunctor( - rank, - ctx.template device_context(), - pads, - static_cast(0), - *dy, - &full_dy); + rank, dev_ctx, pads, static_cast(0), *dy, &full_dy); fft_c2c_func( - dev_ctx, &full_dy, &complex_d_frames_w, axes, normalization, false); + dev_ctx, full_dy, &complex_d_frames_w, axes, normalization, false); } - framework::TransComplexToReal( - framework::TransToProtoVarType(d_frames_w.dtype()), - 
framework::TransToProtoVarType(complex_d_frames_w.dtype()), - complex_d_frames_w, - &d_frames_w); + phi::RealKernel(dev_ctx, complex_d_frames_w, &d_frames_w); // d_frames_w -> d_frames Tensor d_frames; diff --git a/paddle/phi/api/yaml/api.yaml b/paddle/phi/api/yaml/api.yaml index 1156206ee4b51a4d9dd103f20e406a9ad69d72c4..1674ecd2c6da7dedc00ee6576980e4d334272562 100644 --- a/paddle/phi/api/yaml/api.yaml +++ b/paddle/phi/api/yaml/api.yaml @@ -98,6 +98,33 @@ func : erf backward : erf_grad +- api : fft_c2c + args : (Tensor x, int64_t[] axes, str normalization, bool forward) + output : Tensor + infer_meta : + func : FFTC2CInferMeta + kernel : + func : fft_c2c + backward : fft_c2c_grad + +- api : fft_c2r + args : (Tensor x, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size=0L) + output : Tensor + infer_meta : + func : FFTC2RInferMeta + kernel : + func : fft_c2r + backward : fft_c2r_grad + +- api : fft_r2c + args : (Tensor x, int64_t[] axes, str normalization, bool forward, bool onesided) + output : Tensor + infer_meta : + func : FFTR2CInferMeta + kernel : + func : fft_r2c + backward : fft_r2c_grad + - api : lgamma args : (Tensor x) output : Tensor(out) @@ -105,7 +132,7 @@ func : UnchangedInferMeta kernel : func : lgamma - backward : lgamma_grad + backward : lgamma_grad - api : mv args : (Tensor x, Tensor vec) diff --git a/paddle/phi/api/yaml/api_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml index 5300d551f8ef8987307f3477b2cdd9d5a9fa2ea7..afa55479b3570805170fec1cd7bb381d8558725b 100644 --- a/paddle/phi/api/yaml/api_compat.yaml +++ b/paddle/phi/api/yaml/api_compat.yaml @@ -31,6 +31,15 @@ float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, int workspace_size_MB = 512, bool exhaustive_search = false] +- api : conv2d + extra : + attrs : [bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", bool fuse_alpha = false, bool fuse_beta = false, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = 512, bool exhaustive_search = false] + - api : cross inputs : {x : X, y : Y} @@ -112,3 +121,15 @@ x : X outputs : out : Out + +- api: fft_c2c + inputs: {x: X} + outputs: {out: Out} + +- api: fft_c2r + inputs: {x: X} + outputs: {out: Out} + +- api: fft_r2c + inputs: {x: X} + outputs: {out: Out} diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 53cdc97a716d7ac928254add06162ae3e7622130..30c2c9f9e12d39daf33d76924783c36bf77a6faa 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -105,6 +105,38 @@ func : erf_grad data_type : out_grad +- backward_api : fft_c2c_grad + forward: fft_c2c(Tensor x, int64_t[] axes, str normalization, bool forward) -> Tensor(out) + args : (Tensor out_grad, int64_t[] axes, str normalization, bool forward) + output: Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : fft_c2c_grad + +- backward_api : fft_c2r_grad + forward: fft_c2r(Tensor x, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size) -> Tensor(out) + args : (Tensor out_grad, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size) + output: Tensor(x_grad) + infer_meta : + func : 
FFTC2RGradInferMeta + kernel : + func : fft_c2r_grad + data_type: out_grad + +- backward_api : fft_r2c_grad + forward: fft_r2c(Tensor x, int64_t[] axes, str normalization, bool forward, bool onesided) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int64_t[] axes, str normalization, bool forward, bool onesided) + output: Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : fft_r2c_grad + data_type: out_grad + no_need_buffer: x + - backward_api : lgamma_grad forward : lgamma(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 975d55889c7173ddb1a00fa278ba465291c7275e..9877149dc52bd8308c459f8f8f53a8f090ab57e7 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -97,4 +97,18 @@ inline DataType ToComplexType(const DataType& type) { type)); } } + +inline DataType ToRealType(const DataType& type) { + switch (type) { + case DataType::COMPLEX64: + return DataType::FLOAT32; + case DataType::COMPLEX128: + return DataType::FLOAT64; + default: + PADDLE_THROW(errors::Unimplemented( + "Can not transform data type (%s) to real type, now only support " + "complex64 and complex128 value.", + type)); + } +} } // namespace phi diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index e3898adf56c55803ab41d1e7b4853e5e644ac50c..d5c88a2f9e393990551a2f48cb5fba82cc9d79c8 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/axis_utils.h" namespace phi { @@ -285,6 +286,47 @@ void EigvalshGradInferMeta(const MetaTensor& out_v, } } +void FFTC2RGradInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL(out, + phi::errors::InvalidArgument( + "Output of fft_c2r _grad should not be null.")); + const phi::DDim x_dim = x.dims(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToComplexType(x.dtype())); + + phi::DDim out_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + if (last_dim_size > 0) { + out_dim.at(last_fft_axis) = last_dim_size / 2 + 1; + } else if (config.is_runtime) { + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; + } else { + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = + last_fft_dim_size == -1 ? 
-1 : last_fft_dim_size / 2 + 1; + } + out->set_dims(out_dim); +} + void FillDiagonalGradInferMeta(const MetaTensor& dout, float value, int offset, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 15ab16eff1c19d55619a05ddafa23865804784ba..f21b942bf952c5d14e58ea679d954c9dc3742d0a 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -137,6 +137,14 @@ void EigvalshGradInferMeta(const MetaTensor& out_v, bool is_test, MetaTensor* x_grad); +void FFTC2RGradInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig = MetaConfig()); + void FillDiagonalGradInferMeta( const MetaTensor& dout, float value, int offset, bool wrap, MetaTensor* dx); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index a7ed6e71d0503a4b5b40743dc9c82ccaee32be81..bdcef965be2585adc523835b9b18fdc645af3178 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -866,6 +866,112 @@ void FillDiagonalInferMeta( out->set_dtype(x.dtype()); } +void FFTC2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_c2c should not be null.")); + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + const phi::DDim x_dim = x.dims(); + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + out->share_meta(x); +} + +void FFTC2RInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_c2r should not be null.")); + const phi::DDim x_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + size_t signal_dims = axes.size(); + for (size_t i = 0; i < signal_dims - 1; i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToRealType(x.dtype())); + phi::DDim out_dim = x_dim; + + if (last_dim_size > 0) { + out_dim.at(last_fft_axis) = last_dim_size; + } else if (config.is_runtime) { + const int64_t input_last_dim_size = x_dim[last_fft_axis]; + const int64_t fft_n_point = (input_last_dim_size - 1) * 2; + PADDLE_ENFORCE_GT( + fft_n_point, + 0, + phi::errors::InvalidArgument("Invalid fft n-point (%d).", fft_n_point)); + out_dim.at(last_fft_axis) = fft_n_point; + } else { + const int64_t input_last_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = + input_last_dim_size == -1 ? 
-1 : (input_last_dim_size - 1) * 2; + } + out->set_dims(out_dim); +} + +void FFTR2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_r2c should not be null.")); + const phi::DDim x_dim = x.dims(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToComplexType(x.dtype())); + if (!onesided) { + out->share_dims(x); + } else { + phi::DDim out_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; + out->set_dims(out_dim); + } +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 5a8bcfb2b702fbd6dd267aac727aefe3d6e75bb3..f2bb43e952d5f8e41157b1bb48a197b42666bbd2 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -135,6 +135,29 @@ void ExpandInferMeta(const MetaTensor& x, void FillDiagonalInferMeta( const MetaTensor& x, float value, int offset, bool wrap, MetaTensor* out); +void FFTC2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + MetaTensor* out, + MetaConfig = MetaConfig()); + +void FFTC2RInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig = MetaConfig()); + +void FFTR2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + MetaTensor* out, + MetaConfig = MetaConfig()); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 7b64658d571cf8260c280f2883fb956043e5e70e..64ffdbe885356d6861407e80a3b25ff5cba93297 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -65,7 +65,8 @@ set(COMMON_KERNEL_DEPS matrix_solve phi_dynload_warpctc sequence_padding - sequence_scale) + sequence_scale + fft) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h index 0294dc950deb15bb2e453e16cdfdf38c34d7ac9d..41be3e43a303d1233f2f18bbf2cd4af6f34d31be 100644 --- a/paddle/phi/kernels/assign_kernel.h +++ b/paddle/phi/kernels/assign_kernel.h @@ -18,6 +18,7 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { @@ -26,6 +27,16 @@ void AssignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +DenseTensor Assign(const Context& dev_ctx, const DenseTensor& x) { + DenseTensor out; + MetaTensor meta_out(&out); + MetaTensor meta_x(x); + UnchangedInferMeta(meta_x, &meta_out); + AssignKernel(dev_ctx, x, &out); + return out; +} + // In order to be compatible with the `AsDispensable` input in the original // assign op maker, the input parameter here needs to be dispensable, but // 
this looks weird diff --git a/paddle/phi/kernels/cpu/fft_grad_kernel.cc b/paddle/phi/kernels/cpu/fft_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..aecaf6c5c13f8f78e99bbde895be6e219a201bf0 --- /dev/null +++ b/paddle/phi/kernels/cpu/fft_grad_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c_grad, + CPU, + ALL_LAYOUT, + phi::FFTC2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL( + fft_c2r_grad, CPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} +PD_REGISTER_KERNEL(fft_r2c_grad, + CPU, + ALL_LAYOUT, + phi::FFTR2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/fft_kernel.cc b/paddle/phi/kernels/cpu/fft_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5e3e350fc085176a8becca856d635344bde129f --- /dev/null +++ b/paddle/phi/kernels/cpu/fft_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
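To summarize the shape/dtype contract encoded by the new infer meta functions and the grad-kernel registrations above: fft_r2c maps a real tensor to a complex one and, when onesided, shrinks the last transformed axis to n/2 + 1; fft_c2r does the reverse and produces (m - 1) * 2 points unless last_dim_size overrides it; the grad kernels are keyed on the dtype of out_grad, which is why fft_c2r_grad is registered for float/double while the other two take complex types. A small self-contained check of these rules (illustration only; the helper names onesided_size/two_sided_size are made up here, while ToComplexType/ToRealType are the helpers from paddle/phi/core/utils/data_type.h in this patch):

#include <cassert>
#include <cstdint>
#include "paddle/phi/core/utils/data_type.h"

// Last-axis size rules used by FFTR2CInferMeta / FFTC2RInferMeta.
inline int64_t onesided_size(int64_t n) { return n / 2 + 1; }
inline int64_t two_sided_size(int64_t m) { return (m - 1) * 2; }

void check_fft_meta_rules() {
  assert(onesided_size(8) == 5);   // one-sided rfft of 8 real points
  assert(two_sided_size(5) == 8);  // irfft back to 8 real points
  // dtype mapping used when inferring complex/real output types
  assert(phi::ToComplexType(phi::DataType::FLOAT32) == phi::DataType::COMPLEX64);
  assert(phi::ToRealType(phi::DataType::COMPLEX128) == phi::DataType::FLOAT64);
}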
+ +#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c, + CPU, + ALL_LAYOUT, + phi::FFTC2CKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_c2r, + CPU, + ALL_LAYOUT, + phi::FFTC2RKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_r2c, CPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/scale_kernel.cc b/paddle/phi/kernels/cpu/scale_kernel.cc index e929b5bd7219b60acb226374f67a0bc511c41723..421aae270ee591c13dd205dd6909a8eb7bc3cef5 100644 --- a/paddle/phi/kernels/cpu/scale_kernel.cc +++ b/paddle/phi/kernels/cpu/scale_kernel.cc @@ -62,4 +62,6 @@ PD_REGISTER_KERNEL(scale, int8_t, int16_t, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/fft_grad_kernel.h b/paddle/phi/kernels/fft_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8f5237f1fd08c7aa8c492dbbbc32dab6ebae2fc6 --- /dev/null +++ b/paddle/phi/kernels/fft_grad_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void FFTC2CGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* x_grad); + +template +void FFTC2RGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* x_grad); + +template +void FFTR2CGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/fft_kernel.h b/paddle/phi/kernels/fft_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6105ec4d0b3ec9e72bf4842ac192c312badc48a1 --- /dev/null +++ b/paddle/phi/kernels/fft_kernel.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
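The complex64/complex128 registrations added to the scale kernel above exist so the FFT normalization step can rescale a complex spectrum. A minimal sketch of that use, assuming a CPU context and a transform over signal_numel points; this is an illustration of the idea, not the patch's own normalization helper:

#include <cstdint>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/scale_kernel.h"

template <typename C>  // C = phi::dtype::complex<float> or complex<double>
void normalize_by_n(const phi::CPUContext& ctx,
                    const phi::DenseTensor& spectrum,
                    int64_t signal_numel,
                    phi::DenseTensor* out) {
  // divide every complex bin by the number of transformed points
  phi::ScaleKernel<C, phi::CPUContext>(
      ctx, spectrum, phi::Scalar(1.0 / signal_numel), /*bias=*/0.0f,
      /*bias_after_scale=*/true, out);
}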
+ +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void FFTC2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* out); + +template +void FFTC2RKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* out); + +template +void FFTR2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 646d65cf8a63eac052a841eed1c952c1f0ceb44e..afa46f1dacaefeb28946f840d778a5969b554bed 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -16,3 +16,20 @@ math_library(pooling DEPS dense_tensor) math_library(segment_pooling) math_library(sequence2batch) math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) + +if(WITH_GPU OR WITH_ROCM) + if(MKL_FOUND AND WITH_ONEMKL) + math_library(fft spectral_op.cu DEPS dynload_cuda dynload_mklrt + dense_tensor) + target_include_directories(fft PRIVATE ${MKL_INCLUDE}) + else() + math_library(fft spectral_op.cu DEPS dynload_cuda dense_tensor pocketfft) + endif() +else() + if(MKL_FOUND AND WITH_ONEMKL) + mathp_library(fft DEPS dynload_mklrt dense_tensor) + target_include_directories(fft PRIVATE ${MKL_INCLUDE}) + else() + math_library(fft DEPS dense_tensor pocketfft) + endif() +endif() diff --git a/paddle/phi/kernels/funcs/cufft_util.h b/paddle/phi/kernels/funcs/cufft_util.h new file mode 100644 index 0000000000000000000000000000000000000000..584425c6112a5dfe3c7f0ad7b7294a0be1812402 --- /dev/null +++ b/paddle/phi/kernels/funcs/cufft_util.h @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
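For reference, the normalization strings accepted by the kernels declared above follow the usual FFT convention: "backward" applies no scaling on the forward transform, "forward" divides by n, and "ortho" divides by sqrt(n), corresponding to FFTNormMode::none / by_n / by_sqrt_n. A minimal sketch of that mapping for a forward transform, inferred from get_norm_from_string and the compute_factor helper elsewhere in this patch rather than copied from it:

#include <cmath>
#include <string>

// Scaling factor applied after a *forward* transform of signal_numel points.
double forward_norm_factor(const std::string& normalization,
                           size_t signal_numel) {
  if (normalization == "forward") {
    return 1.0 / static_cast<double>(signal_numel);             // by_n
  }
  if (normalization == "ortho") {
    return 1.0 / std::sqrt(static_cast<double>(signal_numel));  // by_sqrt_n
  }
  return 1.0;  // "backward" (the default): no scaling on the forward pass
}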
+ +#pragma once +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/dynload/cufft.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_key.h" + +namespace phi { +namespace funcs { +namespace detail { + +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + public: + CuFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cufftCreate(&handle_)); + } + + CuFFTHandle(const CuFFTHandle& other) = delete; + CuFFTHandle& operator=(const CuFFTHandle& other) = delete; + + CuFFTHandle(CuFFTHandle&& other) = delete; + CuFFTHandle& operator=(CuFFTHandle&& other) = delete; + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { phi::dynload::cufftDestroy(handle_); } + + private: + ::cufftHandle handle_; +}; + +// Returns true if the transform type has complex input +inline bool has_complex_input(FFTTransformType type) { + switch (type) { + case FFTTransformType::C2C: + case FFTTransformType::C2R: + return true; + + case FFTTransformType::R2C: + return false; + } + PADDLE_THROW(phi::errors::InvalidArgument("Unknown FFTTransformType")); +} + +// Returns true if the transform type has complex output +inline bool has_complex_output(FFTTransformType type) { + switch (type) { + case FFTTransformType::C2C: + case FFTTransformType::R2C: + return true; + + case FFTTransformType::C2R: + return false; + } + PADDLE_THROW(phi::errors::InvalidArgument("Unknown FFTTransformType")); +} + +class FFTConfig { + public: + using plan_size_type = long long int; // NOLINT (be consistent with cufft) + explicit FFTConfig(const FFTConfigKey& key) + : FFTConfig( + std::vector(key.sizes_, key.sizes_ + key.signal_ndim_ + 1), + key.fft_type_, + key.value_type_) {} + // sizes are full signal, including batch size and always two-sided + FFTConfig(const std::vector& sizes, + FFTTransformType fft_type, + DataType precison) + : fft_type_(fft_type), precision_(precison) { + const auto batch_size = static_cast(sizes[0]); + std::vector signal_sizes(sizes.cbegin() + 1, sizes.cend()); + const int signal_ndim = sizes.size() - 1; + + cudaDataType itype, otype, exec_type; + const bool complex_input = has_complex_input(fft_type); + const bool complex_output = has_complex_output(fft_type); + if (precison == DataType::FLOAT32) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (precison == DataType::FLOAT64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? 
CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Only transforms of type float32 and float64 are supported.")); + } + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftXtMakePlanMany(plan(), + signal_ndim, + signal_sizes.data(), + /* inembed */ nullptr, + /* base_istride */ 1L, + /* idist */ 1L, + itype, + /* onembed */ nullptr, + /* base_ostride */ 1L, + /* odist */ 1L, + otype, + batch_size, + &ws_size_, + exec_type)); + } + + FFTConfig(const FFTConfig& other) = delete; + FFTConfig& operator=(const FFTConfig& other) = delete; + + FFTConfig(FFTConfig&& other) = delete; + FFTConfig& operator=(FFTConfig&& other) = delete; + + const cufftHandle& plan() const { return plan_.get(); } + FFTTransformType transform_type() const { return fft_type_; } + DataType data_type() const { return precision_; } + size_t workspace_size() const { return ws_size_; } + + private: + CuFFTHandle plan_; + size_t ws_size_; // workspace size in bytes + FFTTransformType fft_type_; + DataType precision_; +}; + +// NOTE: R2C is forward-only, C2R is backward only +static void exec_plan(const FFTConfig& config, + void* in_data, + void* out_data, + bool forward) { + auto& plan = config.plan(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cufftXtExec( + plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft.cc b/paddle/phi/kernels/funcs/fft.cc new file mode 100644 index 0000000000000000000000000000000000000000..9895ff406cb89bbf6dc39b1cc7c5780e8a9ff10d --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
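A minimal usage sketch of the FFTConfig / exec_plan helpers above for a batched 1-D float32 C2C transform. This is illustration only: the stream and work-area calls are assumed from how the GPU exec_fft path uses them (FFTConfig disables cuFFT's own workspace allocation, so a framework-allocated work area of config.workspace_size() bytes must be attached before execution), and the dynload wrappers are assumed to expose cufftSetStream/cufftSetWorkArea.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/cufft_util.h"

void run_c2c_1d(const phi::GPUContext& ctx, void* in, void* out,
                long long batch, long long n) {
  // sizes include the batch dimension and describe the full two-sided signal
  phi::funcs::detail::FFTConfig config(
      {batch, n}, phi::funcs::FFTTransformType::C2C, phi::DataType::FLOAT32);
  // bind the plan to the current stream
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::cufftSetStream(config.plan(), ctx.stream()));
  // provide a work area allocated by the framework allocator
  phi::DenseTensor workspace;
  workspace.Resize({static_cast<int64_t>(config.workspace_size())});
  ctx.Alloc<uint8_t>(&workspace);
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::cufftSetWorkArea(config.plan(), workspace.data()));
  // run the transform; forward selects CUFFT_FORWARD vs CUFFT_INVERSE
  phi::funcs::detail::exec_plan(config, in, out, /*forward=*/true);
}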
+ +#include "paddle/phi/kernels/funcs/fft.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" +#if defined(PADDLE_WITH_ONEMKL) +#include "paddle/phi/kernels/funcs/mkl_fft_utils.h" +#elif defined(PADDLE_WITH_POCKETFFT) +#define POCKETFFT_CACHE_SIZE 16 +#include "extern_pocketfft/pocketfft_hdronly.h" +#endif + +namespace phi { +namespace funcs { +#if defined(PADDLE_WITH_ONEMKL) + +namespace detail { +// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) +template +void exec_fft(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + const phi::DDim& in_sizes = x.dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = axes.size(); + const int batch_ndim = ndim - signal_ndim; + const phi::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { + return !is_transformed_dim[axis]; + }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to the permutation + DenseTensor transposed_input = + Transpose(ctx, x, dim_permute); + const phi::DDim& transposed_input_shape = transposed_input.dims(); + + // batch size + int64_t batch_size = 1L; + for (int i = 0; i < batch_ndim; i++) { + batch_size *= transposed_input_shape[i]; + } + + // make an collapsed input: collapse batch axes for input + std::vector collapsed_input_shape_; + collapsed_input_shape_.reserve(1 + signal_ndim); + collapsed_input_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_input_shape_.push_back(in_sizes[axes[i]]); + } + phi::DDim collapsed_input_shape = phi::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + DenseTensor& collapsed_input = transposed_input; + + // make a collapsed output + phi::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + std::vector collapsed_output_shape_; + collapsed_output_shape_.reserve(1 + signal_ndim); + collapsed_output_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_.push_back(out_sizes[axes[i]]); + } + phi::DDim collapsed_output_shape = phi::make_ddim(collapsed_output_shape_); + DenseTensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + ctx.Alloc(&collapsed_output); + + // make a DFTI_DESCRIPTOR + std::vector signal_sizes(1 + signal_ndim); + signal_sizes[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + signal_sizes[1 + i] = + std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); + } + const phi::DDim input_stride = phi::stride(collapsed_input_shape); + const phi::DDim output_stride = phi::stride(collapsed_output_shape); + + DftiDescriptor desc = plan_mkl_fft(x.dtype(), + out->dtype(), + input_stride, + output_stride, + signal_sizes, + normalization, + forward); + // execute the transform + const FFTTransformType fft_type = GetFFTTransformType(x.dtype(), out->type()); + if (fft_type == FFTTransformType::C2R && forward) { + ConjKernel(ctx, collapsed_input, &collapsed_input); + 
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } else if (fft_type == FFTTransformType::R2C && !forward) { + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + ConjKernel(ctx, collapsed_output, &collapsed_output); + } else { + if (forward) { + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } else { + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } + } + + // resize for the collapsed output + collapsed_output.Resize(transposed_output_shape); + phi::DenseTensor& transposed_output = collapsed_output; + + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + } + TransposeKernel( + ctx, transposed_output, reverse_dim_permute, out); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (axes.size() > 1) { + DenseTensor c2c_result = EmptyLike(ctx, x); + + const std::vector c2c_dims(axes.begin(), axes.end() - 1); + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, x, &c2c_result, c2c_dims, normalization, forward); + + const std::vector new_axes{axes.back()}; + detail::exec_fft( + ctx, c2c_result, out, new_axes, normalization, forward); + } else { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } + } +}; + +#elif defined(PADDLE_WITH_POCKETFFT) +namespace detail { +template +static T compute_factor(size_t size, FFTNormMode normalization) { + constexpr auto one = static_cast(1); + switch (normalization) { + case FFTNormMode::none: + return one; + case FFTNormMode::by_n: + return one / static_cast(size); + case FFTNormMode::by_sqrt_n: + return one / std::sqrt(static_cast(size)); + } + PADDLE_THROW(phi::errors::InvalidArgument("Unsupported normalization type")); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = typename Ti::value_type; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + + const auto* in_data = reinterpret_cast(x.data()); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet factor + 
size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= in_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::c2c(in_sizes, + in_strides, + in_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = Ti; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(out_strides.begin(), + out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto* in_data = x.data(); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= in_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::r2c(in_sizes, + in_strides, + out_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = To; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(out_strides.begin(), + out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto* in_data = reinterpret_cast(x.data()); + auto* out_data = out->data(); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= out_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::c2r(out_sizes, + in_strides, + out_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; +#endif + +using complex64_t = phi::dtype::complex; +using complex128_t = phi::dtype::complex; +template struct FFTC2CFunctor; +template struct FFTC2CFunctor; +template struct FFTC2RFunctor; +template struct FFTC2RFunctor; +template struct FFTR2CFunctor; +template struct FFTR2CFunctor; +} // namespace funcs +} // namespace phi diff --git 
a/paddle/phi/kernels/funcs/fft.cu b/paddle/phi/kernels/funcs/fft.cu new file mode 100644 index 0000000000000000000000000000000000000000..edac497bc8e8b7c164108f0d856fd0aa0dcc981a --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.cu @@ -0,0 +1,346 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_cache.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/assign_kernel.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/scale_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { +namespace funcs { +namespace detail { + +// Use the optimized path to perform single R2C or C2R if transformation dim is +// supported by cuFFT +static bool use_optimized_fft_path(const std::vector& axes) { + // For performance reason, when axes starts with (0, 1), do not use the + // optimized path. + if (axes.size() > kMaxFFTNdim || + (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { + return false; + } else { + return true; + } +} + +static double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); + } + + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? 
std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); +} + +template +void exec_normalization(const phi::GPUContext& ctx, + const DenseTensor& in, + DenseTensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + const double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + ScaleKernel(ctx, in, scale, 0, true, out); + } else { + AssignKernel(ctx, in, out); + } +} + +bool has_large_prime_factor(int64_t n) { + constexpr int64_t first_large_prime = 11; + const std::array prime_radices{{2, 3, 5, 7}}; + for (auto prime : prime_radices) { + if (n < first_large_prime) { + return false; + } + while (n % prime == 0) { + n /= prime; + } + } + return n != 1; +} + +#if defined(PADDLE_WITH_CUDA) +inline bool use_cache(const int64_t* signal_size) { + bool using_cache = true; + int cufft_version; + phi::dynload::cufftGetVersion(&cufft_version); + if (10300 <= cufft_version && cufft_version <= 10400) { + using_cache = std::none_of( + signal_size + 1, signal_size + kMaxDataNdim, [](int64_t dim_size) { + return has_large_prime_factor(dim_size); + }); + } + return using_cache; +} +#elif defined(PADDLE_WITH_HIP) +inline bool use_cache(const int64_t* signal_size) { return true; } +#endif + +// up to 3d unnormalized fft transform (c2r, r2c, c2c) +template +void exec_fft(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + bool forward) { + const phi::DDim& in_sizes = x.dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = axes.size(); + const int batch_ndim = ndim - signal_ndim; + const phi::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { + return !is_transformed_dim[axis]; + }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to the permutation + DenseTensor transposed_input = + Transpose(ctx, x, dim_permute); + const phi::DDim transposed_input_shape = transposed_input.dims(); + + // batch size + int64_t batch_size = 1L; + for (int i = 0; i < batch_ndim; i++) { + batch_size *= transposed_input_shape[i]; + } + + // make an collapsed input: collapse batch axes for input + std::vector collapsed_input_shape_; + collapsed_input_shape_.reserve(1 + signal_ndim); + collapsed_input_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_input_shape_.push_back(in_sizes[axes[i]]); + } + phi::DDim collapsed_input_shape = phi::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + DenseTensor& collapsed_input = transposed_input; + + // make a collapsed output + phi::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + std::vector collapsed_output_shape_; + collapsed_output_shape_.reserve(1 + signal_ndim); + collapsed_output_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_.push_back(out_sizes[axes[i]]); + } + phi::DDim collapsed_output_shape = phi::make_ddim(collapsed_output_shape_); + DenseTensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + ctx.Alloc(&collapsed_output); + + FFTConfigKey key = + create_fft_configkey(collapsed_input, 
collapsed_output, signal_ndim); + int64_t device_id = ctx.GetPlace().GetDeviceId(); + FFTConfig* config = nullptr; + std::unique_ptr config_ = nullptr; + bool using_cache = use_cache(key.sizes_); + + if (using_cache) { + FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); + std::unique_lock guard(plan_cache.mutex, std::defer_lock); + guard.lock(); + config = &(plan_cache.lookup(key)); + } else { + config_ = std::make_unique(key); + config = config_.get(); + } + + const int64_t workspace_size = static_cast(config->workspace_size()); + DenseTensor workspace_tensor = Empty(ctx, {workspace_size}); + + // prepare cufft for execution +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetWorkArea(config->plan(), workspace_tensor.data())); +#elif defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetWorkArea(config->plan(), workspace_tensor.data())); +#endif + + // execution of fft plan + const FFTTransformType fft_type = config->transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + ConjKernel(ctx, collapsed_input, &collapsed_input); + exec_plan(*config, collapsed_input.data(), collapsed_output.data(), false); + } else if (fft_type == FFTTransformType::R2C && !forward) { + exec_plan(*config, collapsed_input.data(), collapsed_output.data(), true); + ConjKernel(ctx, collapsed_output, &collapsed_output); + } else { + exec_plan( + *config, collapsed_input.data(), collapsed_output.data(), forward); + } + + // resize for the collapsed output + collapsed_output.Resize(transposed_output_shape); + phi::DenseTensor& transposed_output = collapsed_output; + + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + } + TransposeKernel( + ctx, transposed_output, reverse_dim_permute, out); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (axes.empty()) { + AssignKernel(ctx, x, out); + return; + } + + std::vector working_axes = axes; + std::sort(working_axes.begin(), working_axes.end()); + std::vector first_dims; + size_t max_dims; + + DenseTensor working_tensor = x; // shallow copy + while (true) { + max_dims = std::min(static_cast(detail::kMaxFFTNdim), + working_axes.size()); + first_dims.assign(working_axes.end() - max_dims, working_axes.end()); + + detail::exec_fft(ctx, working_tensor, out, first_dims, forward); + working_axes.resize(working_axes.size() - max_dims); + first_dims.clear(); + + if (working_axes.empty()) { + break; + } + + if (working_tensor.IsSharedWith(x)) { + working_tensor = std::move(*out); + *out = EmptyLike(ctx, x); + } else { + std::swap(*out, working_tensor); + } + } + + std::vector out_dims = phi::vectorize(x.dims()); + detail::exec_normalization( + ctx, *out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + std::vector out_dims = phi::vectorize(out->dims()); + + if (detail::use_optimized_fft_path(axes)) { + DenseTensor x_copy = Assign(ctx, x); + 
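// A defensive copy is taken here because cuFFT/hipFFT complex-to-real
// transforms may overwrite their input buffer; running the single-shot C2R
// plan on a copy keeps the caller-visible tensor x unmodified.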
detail::exec_fft(ctx, x_copy, out, axes, forward); + } else { + DenseTensor c2c_result = EmptyLike(ctx, x); + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, + x, + &c2c_result, + {axes.begin(), axes.end() - 1}, + FFTNormMode::none, + forward); + detail::exec_fft(ctx, c2c_result, out, {axes.back()}, forward); + } + detail::exec_normalization( + ctx, *out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (detail::use_optimized_fft_path(axes)) { + detail::exec_fft(ctx, x, out, axes, forward); + } else { + DenseTensor r2c_result = EmptyLike(ctx, *out); + detail::exec_fft(ctx, x, &r2c_result, {axes.back()}, forward); + + FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, + r2c_result, + out, + {axes.begin(), axes.end() - 1}, + FFTNormMode::none, + forward); + } + + const auto in_dims = phi::vectorize(x.dims()); + detail::exec_normalization( + ctx, *out, out, normalization, in_dims, axes); + } +}; + +using complex64_t = phi::dtype::complex; +using complex128_t = phi::dtype::complex; +template struct FFTC2CFunctor; +template struct FFTC2CFunctor; +template struct FFTC2RFunctor; +template struct FFTC2RFunctor; +template struct FFTR2CFunctor; +template struct FFTR2CFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft.h b/paddle/phi/kernels/funcs/fft.h new file mode 100644 index 0000000000000000000000000000000000000000..3f9e1191ebb3e69cb9aa9176bd64abc822569d80 --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.h @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { +namespace funcs { + +enum class FFTNormMode : int8_t { + none, // No normalization + by_sqrt_n, // Divide by sqrt(signal_size) + by_n, // Divide by signal_size +}; + +inline FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { + if (norm.empty() || norm == "backward") { + return forward ? FFTNormMode::none : FFTNormMode::by_n; + } + + if (norm == "forward") { + return forward ? 
FFTNormMode::by_n : FFTNormMode::none; + } + + if (norm == "ortho") { + return FFTNormMode::by_sqrt_n; + } + + PADDLE_THROW(phi::errors::InvalidArgument( + "FFT norm string must be 'forward' or 'backward' or 'ortho', " + "received %s", + norm)); +} + +enum class FFTTransformType : int8_t { + C2C = 0, // Complex-to-complex + R2C, // Real-to-complex + C2R, // Complex-to-real +}; + +// Create transform type enum from bools representing if input and output are +// complex +inline FFTTransformType GetFFTTransformType(DataType input_dtype, + DataType output_dtype) { + auto complex_input = IsComplexType(input_dtype); + auto complex_output = IsComplexType(output_dtype); + if (complex_input && complex_output) { + return FFTTransformType::C2C; + } else if (complex_input && !complex_output) { + return FFTTransformType::C2R; + } else if (!complex_input && complex_output) { + return FFTTransformType::R2C; + } + PADDLE_THROW( + phi::errors::InvalidArgument("Real to real FFTs are not supported")); +} + +template +struct FFTC2CFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; + +template +struct FFTR2CFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; + +template +struct FFTC2RFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_cache.h b/paddle/phi/kernels/funcs/fft_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..51e90a6c0d95b5e802fa76cfc92c8b7bbf44943e --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_cache.h @@ -0,0 +1,189 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#if defined(PADDLE_WITH_CUDA) +#include "paddle/phi/kernels/funcs/cufft_util.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/kernels/funcs/hipfft_util.h" +#endif + +namespace phi { +namespace funcs { +namespace detail { + +#if CUDA_VERSION < 10000 +// Note that the max plan number for CUDA version < 10 has to be 1023 +// due to a bug that fails on the 1024th plan +constexpr size_t CUFFT_MAX_PLAN_NUM = 1023; +constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else +constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); +// The default max cache size chosen for CUDA version > 10 is arbitrary. +// This number puts a limit on how big of a plan cache should we maintain by +// default. Users can always configure it via cufft_set_plan_cache_max_size. 
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif + +static_assert(CUFFT_MAX_PLAN_NUM >= 0 && + CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && + CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +class FFTConfigCache { + public: + using kv_t = typename std::pair; + using map_t = + typename std::unordered_map, + typename std::list::iterator, + KeyHash, + KeyEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } + + FFTConfigCache(const FFTConfigCache& other) = delete; + FFTConfigCache& operator=(const FFTConfigCache& other) = delete; + + FFTConfigCache(FFTConfigCache&& other) noexcept + : _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. + FFTConfig& lookup(FFTConfigKey params) { + PADDLE_ENFORCE_GT(_max_size, + 0, + phi::errors::InvalidArgument( + "The max size of FFTConfigCache must be great than 0," + "But received is [%d]", + _max_size)); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + + private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. 
+ PADDLE_ENFORCE_GE( + new_size, + 0, + phi::errors::InvalidArgument( + "cuFFT plan cache size must be non-negative, But received is [%d]", + new_size)); + PADDLE_ENFORCE_LE(new_size, + CUFFT_MAX_PLAN_NUM, + phi::errors::InvalidArgument( + "cuFFT plan cache size can not be larger than [%d], " + "But received is [%d]", + CUFFT_MAX_PLAN_NUM, + new_size)); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +static std::vector> plan_caches; +static std::mutex plan_caches_mutex; + +static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { + std::lock_guard guard(plan_caches_mutex); + + if (device_index >= plan_caches.size()) { + plan_caches.resize(device_index + 1); + } + + if (!plan_caches[device_index]) { + plan_caches[device_index] = std::make_unique(); + } + + return *plan_caches[device_index]; +} +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_fill_conj.h b/paddle/phi/kernels/funcs/fft_fill_conj.h new file mode 100644 index 0000000000000000000000000000000000000000..91d859020f88b93d9a656accea4525c25e62d485 --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_fill_conj.h @@ -0,0 +1,219 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "thrust/device_vector.h" +#endif + +namespace phi { +namespace funcs { + +// Giving a linear destination index and strides of tensor, get_idx return the +// corresponding linear position of source tensor. +// The linear index is the position of flatten tensor. +// Giving a linear destination index and strides of tensor, get_idx return the +// corresponding linear position of source tensor. +// The linear index is the position of flatten tensor. 
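// Illustrative example (hypothetical shapes): for a row-major destination of
// shape [4, 3] with strides [3, 1] and the FFT axis being axis 0, destination
// index 10 decodes to coordinates (3, 1). Reflecting the FFT axis gives
// (4 - 3) % 4 = 1, so in the conjugate branch the value is read from source
// coordinates (1, 1) and conjugated, which is the Hermitian-symmetry relation
// X[n] = conj(X[N - n]) used to rebuild the full spectrum from the onesided
// half.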
+HOSTDEVICE inline int64_t get_src_idx(const int64_t dst_idx, + const int64_t* dst_strides, + const int64_t* dst_shape, + const int64_t* src_strides, + const bool* is_fft_axis, + const bool conj, + const int64_t rank) { + int64_t src_idx = 0; + int64_t quotient = dst_idx; + int64_t remainder = 0; + + for (int64_t i = 0; i < rank; i++) { + remainder = quotient % dst_strides[i]; + quotient = quotient / dst_strides[i]; + if (conj && is_fft_axis[i]) { + src_idx += ((dst_shape[i] - quotient) % dst_shape[i]) * src_strides[i]; + } else { + src_idx += src_strides[i] * quotient; + } + quotient = remainder; + } + + return src_idx; +} + +HOSTDEVICE inline bool is_conj_part(const int64_t dst_idx, + const int64_t* dst_strides, + const int64_t last_axis, + const int64_t last_axis_size) { + int64_t quotient = dst_idx; + int64_t remainder = 0; + + for (int64_t i = 0; i < last_axis + 1; i++) { + remainder = quotient % dst_strides[i]; + quotient = quotient / dst_strides[i]; + + if ((i == last_axis) && (quotient > last_axis_size - 1)) { + return true; + } + + quotient = remainder; + } + + return false; +} + +// FFTFillConjFunctor fill the destination tensor with source tensor and +// conjugate symmetry element of source tensor . +// Use framework::ForRange to iterate destination element with +// supporting different device +template +struct FFTFillConjFunctor { + FFTFillConjFunctor(const C* src_data, + C* dst_data, + const int64_t* src_strides, + const int64_t* dst_strides, + const int64_t* dst_shape, + const bool* is_fft_axis, + const int64_t last_axis, + const int64_t last_axis_size, + const int64_t rank) + : src_data_(src_data), + dst_data_(dst_data), + src_strides_(src_strides), + dst_strides_(dst_strides), + dst_shape_(dst_shape), + is_fft_axis_(is_fft_axis), + last_axis_(last_axis), + last_axis_size_(last_axis_size), + rank_(rank) {} + HOSTDEVICE void operator()(int64_t dst_idx) { + if (is_conj_part(dst_idx, dst_strides_, last_axis_, last_axis_size_)) { + const auto conj_idx = get_src_idx(dst_idx, + dst_strides_, + dst_shape_, + src_strides_, + is_fft_axis_, + true, + rank_); + auto src_value = src_data_[conj_idx]; + auto conj_value = C(src_value.real, -src_value.imag); + dst_data_[dst_idx] = conj_value; + } else { + const auto copy_idx = get_src_idx(dst_idx, + dst_strides_, + dst_shape_, + src_strides_, + is_fft_axis_, + false, + rank_); + dst_data_[dst_idx] = src_data_[copy_idx]; + } + } + + const C* src_data_; + C* dst_data_; + const int64_t* src_strides_; + const int64_t* dst_strides_; + const int64_t* dst_shape_; + const bool* is_fft_axis_; + const int64_t last_axis_; + const int64_t last_axis_size_; + const int64_t rank_; +}; + +template +void FFTFillConj(const DeviceContext& ctx, + const DenseTensor* src, + DenseTensor* dst, + const std::vector& axes) { + std::vector src_strides_v = + phi::vectorize(phi::stride(src->dims())); + std::vector dst_strides_v = + phi::vectorize(phi::stride(dst->dims())); + std::vector dst_shape_v = phi::vectorize(dst->dims()); + const auto src_data = src->data(); + auto dst_data = dst->data(); + const auto last_axis = axes.back(); + const auto last_axis_size = dst->dims().at(last_axis) / 2 + 1; + const int64_t rank = dst->dims().size(); + auto _is_fft_axis = std::make_unique(rank); + for (const auto i : axes) { + _is_fft_axis[i] = true; + } + +#if defined(__NVCC__) || defined(__HIPCC__) + const thrust::device_vector src_strides_g(src_strides_v); + const auto src_strides = thrust::raw_pointer_cast(src_strides_g.data()); + const thrust::device_vector 
dst_strides_g(dst_strides_v); + const auto dst_strides = thrust::raw_pointer_cast(dst_strides_g.data()); + const thrust::device_vector dst_shape_g(dst_shape_v); + const auto dst_shape = thrust::raw_pointer_cast(dst_shape_g.data()); + const thrust::device_vector is_fft_axis_g(_is_fft_axis.get(), + _is_fft_axis.get() + rank); + const auto p_is_fft_axis = thrust::raw_pointer_cast(is_fft_axis_g.data()); +#else + const auto src_strides = src_strides_v.data(); + const auto dst_strides = dst_strides_v.data(); + const auto dst_shape = dst_shape_v.data(); + const auto p_is_fft_axis = _is_fft_axis.get(); +#endif + ForRange for_range(ctx, dst->numel()); + FFTFillConjFunctor fill_conj_functor(src_data, + dst_data, + src_strides, + dst_strides, + dst_shape, + p_is_fft_axis, + last_axis, + last_axis_size, + rank); + for_range(fill_conj_functor); +} + +template +struct FFTFillConjGradFunctor { + T* input_; + const size_t axis_; + const int64_t* strides_; + const size_t double_length_; + + FFTFillConjGradFunctor(T* input, + size_t axis, + const int64_t* strides, + size_t double_length) + : input_(input), + axis_(axis), + strides_(strides), + double_length_(double_length) {} + + HOSTDEVICE void operator()(size_t index) { + size_t offtset = index; // back + size_t index_i; + for (size_t i = 0; i <= axis_; i++) { + index_i = offtset / strides_[i]; + offtset %= strides_[i]; + } + + if ((0 < index_i) && (index_i < double_length_ + 1)) { + input_[index] *= static_cast(2); + } + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_key.h b/paddle/phi/kernels/funcs/fft_key.h new file mode 100644 index 0000000000000000000000000000000000000000..5893cfc6ba019f3b90c5e84f2d8c3482c530fc17 --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_key.h @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/fft.h" + +namespace phi { +namespace funcs { +namespace detail { + +const int64_t kMaxFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxFFTNdim + 1; + +struct FFTConfigKey { + int signal_ndim_; // 1 <= signal_ndim <= kMaxFFTNdim + // These include additional batch dimension as well. 
+ int64_t sizes_[kMaxDataNdim]; + int64_t input_shape_[kMaxDataNdim]; + int64_t output_shape_[kMaxDataNdim]; + FFTTransformType fft_type_; + DataType value_type_; + + using shape_t = std::vector; + FFTConfigKey() = default; + + FFTConfigKey(const shape_t& in_shape, + const shape_t& out_shape, + const shape_t& signal_size, + FFTTransformType fft_type, + DataType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_size.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); + std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); + std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); + } +}; + +// Hashing machinery for Key +// Fowler–Noll–Vo hash function +// see +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct KeyHash { + // Key must be a POD because we read out its memory + // contenst as char* when hashing + static_assert(std::is_pod::value, "Key must be plain old data type"); + + size_t operator()(const Key& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (int i = 0; i < static_cast(sizeof(Key)); ++i) { + value ^= ptr[i]; + value *= 0x01000193; + } + return static_cast(value); + } +}; + +template +struct KeyEqual { + // Key must be a POD because we read out its memory + // contenst as char* when comparing + static_assert(std::is_pod::value, "Key must be plain old data type"); + + bool operator()(const Key& a, const Key& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Key)) == 0; + } +}; + +static FFTConfigKey create_fft_configkey(const DenseTensor& input, + const DenseTensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + DataType input_dtype = input.dtype(); + const auto value_type = + IsComplexType(input_dtype) ? ToRealType(input_dtype) : input_dtype; + const auto fft_type = GetFFTTransformType(input.dtype(), output.dtype()); + // signal sizes + std::vector signal_size(signal_ndim + 1); + + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + FFTConfigKey key(phi::vectorize(input.dims()), + phi::vectorize(output.dims()), + signal_size, + fft_type, + value_type); + return key; +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/hipfft_util.h b/paddle/phi/kernels/funcs/hipfft_util.h new file mode 100644 index 0000000000000000000000000000000000000000..6583a97f17a1d8b358e190ca81f52eb60d5f14d6 --- /dev/null +++ b/paddle/phi/kernels/funcs/hipfft_util.h @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
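// The KeyHash in fft_key.h above hashes the raw bytes of the zero-initialized
// POD FFTConfigKey with 32-bit FNV-1a. A minimal standalone sketch of that
// byte-wise hash, applied to a hypothetical POD key (DemoKey and hash_key are
// illustrative names, not Paddle APIs):

#include <cstddef>
#include <cstdint>
#include <cstring>

// 32-bit FNV-1a over an arbitrary byte range.
inline uint32_t fnv1a(const void* data, std::size_t n) {
  const unsigned char* p = static_cast<const unsigned char*>(data);
  uint32_t h = 0x811C9DC5u;  // FNV offset basis
  for (std::size_t i = 0; i < n; ++i) {
    h ^= p[i];
    h *= 0x01000193u;  // FNV prime
  }
  return h;
}

struct DemoKey {  // hypothetical key, stands in for FFTConfigKey
  int ndim;
  int64_t sizes[3];
};

inline uint32_t hash_key(const DemoKey& k) {
  // As with FFTConfigKey, padding bytes must have been zeroed (memset) when
  // the key was built, so that equal keys hash to equal values.
  return fnv1a(&k, sizeof(k));
}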
+ +#pragma once +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/dynload/hipfft.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_key.h" + +namespace phi { +namespace funcs { +namespace detail { + +// An RAII encapsulation of hipFFTHandle +class HIPFFTHandle { + public: + HIPFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftCreate(&handle_)); + } + + HIPFFTHandle(const HIPFFTHandle& other) = delete; + HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; + + HIPFFTHandle(HIPFFTHandle&& other) = delete; + HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; + + ::hipfftHandle& get() { return handle_; } + const ::hipfftHandle& get() const { return handle_; } + + ~HIPFFTHandle() { phi::dynload::hipfftDestroy(handle_); } + + private: + ::hipfftHandle handle_; +}; + +class FFTConfig { + public: + using plan_size_type = int; + explicit FFTConfig(const FFTConfigKey& key) + : FFTConfig( + std::vector(key.sizes_, key.sizes_ + key.signal_ndim_ + 1), + key.fft_type_, + key.value_type_) {} + FFTConfig(const std::vector& sizes, + FFTTransformType fft_type, + DataType precision) + : fft_type_(fft_type), precision_(precision) { + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + const auto batch_size = static_cast(sizes[0]); + const int signal_ndim = sizes.size() - 1; + + hipfftType exec_type = [&]() { + if (precision == DataType::FLOAT32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (precision == DataType::FLOAT64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(phi::errors::InvalidArgument( + "Only transforms of type float32 and float64 are supported.")); + }(); + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftMakePlanMany(plan(), + signal_ndim, + signal_sizes.data(), + /* inembed */ nullptr, + /* base_istride */ 1, + /* idist */ 1, + /* onembed */ nullptr, + /* base_ostride */ 1, + /* odist */ 1, + exec_type, + batch_size, + &ws_size_)); + } + + const hipfftHandle& plan() const { return plan_.get(); } + FFTTransformType transform_type() const { return fft_type_; } + DataType data_type() const { return precision_; } + size_t workspace_size() const { return ws_size_; } + + private: + HIPFFTHandle plan_; + size_t ws_size_; // workspace size in bytes + FFTTransformType fft_type_; + DataType precision_; +}; + +// NOTE: R2C is forward-only, C2R is backward only +static void exec_plan(const FFTConfig& config, + void* in_data, + void* out_data, + bool forward) { + const hipfftHandle& plan = config.plan(); + + DataType value_type = config.data_type(); + if (value_type == DataType::FLOAT32) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecC2C( + plan, + static_cast(in_data), + static_cast(out_data), + forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftExecR2C(plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftExecC2R(plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + } + } else if (value_type == DataType::FLOAT64) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecZ2Z( + plan, + static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecD2Z( + plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecZ2D( + plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + } + } + PADDLE_THROW(phi::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/mkl_fft_utils.h b/paddle/phi/kernels/funcs/mkl_fft_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..dbc0678ab7ae5a0e23725a5145d86dba04485c1b --- /dev/null +++ b/paddle/phi/kernels/funcs/mkl_fft_utils.h @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
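// A simplified, self-contained sketch of the plan-cache scheme used by
// FFTConfigCache in fft_cache.h above: a lookup hit moves the entry to the
// front of a usage list, and a miss evicts the least recently used plan once
// the cache is full. Names are illustrative and V is default-constructed for
// brevity; the real cache instead constructs the FFTConfig from the key.

#include <cstddef>
#include <list>
#include <unordered_map>
#include <utility>

template <typename K, typename V>
class LruMap {
 public:
  explicit LruMap(std::size_t max_size) : max_size_(max_size) {}

  V& lookup_or_create(const K& key) {
    auto it = index_.find(key);
    if (it != index_.end()) {
      // Hit: splice the node to the front; list iterators stay valid.
      items_.splice(items_.begin(), items_, it->second);
      return it->second->second;
    }
    // Miss: evict the least recently used entry when the cache is full.
    if (!items_.empty() && items_.size() >= max_size_) {
      index_.erase(items_.back().first);
      items_.pop_back();
    }
    items_.emplace_front(key, V{});
    index_[key] = items_.begin();
    return items_.front().second;
  }

 private:
  std::list<std::pair<K, V>> items_;
  std::unordered_map<K, typename std::list<std::pair<K, V>>::iterator> index_;
  std::size_t max_size_;
};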
+ +#pragma once +#include +#include +#include "paddle/phi/backends/dynload/mklrt.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/kernels/funcs/fft.h" + +namespace phi { +namespace funcs { +namespace detail { + +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + phi::errors::External(phi::dynload::DftiErrorMessage(status))); \ + } while (0); + +struct DftiDescriptorDeleter { + void operator()(DFTI_DESCRIPTOR_HANDLE handle) { + if (handle != nullptr) { + MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); + } + } +}; + +// A RAII wrapper for MKL_DESCRIPTOR* +class DftiDescriptor { + public: + void init(DFTI_CONFIG_VALUE precision, + DFTI_CONFIG_VALUE signal_type, + MKL_LONG signal_ndim, + MKL_LONG* sizes) { + PADDLE_ENFORCE_EQ(desc_.get(), + nullptr, + phi::errors::AlreadyExists( + "DftiDescriptor has already been initialized.")); + + DFTI_DESCRIPTOR* raw_desc; + MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( + &raw_desc, precision, signal_type, signal_ndim, sizes)); + desc_.reset(raw_desc); + } + + DFTI_DESCRIPTOR* get() const { + DFTI_DESCRIPTOR* raw_desc = desc_.get(); + PADDLE_ENFORCE_NOT_NULL(raw_desc, + phi::errors::PreconditionNotMet( + "DFTI DESCRIPTOR has not been initialized.")); + return raw_desc; + } + + private: + std::unique_ptr desc_; +}; + +static DftiDescriptor plan_mkl_fft(const DataType in_dtype, + const DataType out_dtype, + const phi::DDim& in_strides, + const phi::DDim& out_strides, + const std::vector& signal_sizes, + FFTNormMode normalization, + bool forward) { + const DFTI_CONFIG_VALUE precision = [&] { + switch (in_dtype) { + case DataType::FLOAT32: + return DFTI_SINGLE; + case DataType::COMPLEX64: + return DFTI_SINGLE; + case DataType::FLOAT64: + return DFTI_DOUBLE; + case DataType::COMPLEX128: + return DFTI_DOUBLE; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Invalid input datatype (%s), input data type should be FP32, " + "FP64, COMPLEX64 or COMPLEX128.", + in_dtype)); + } + }(); + + // C2C, R2C, C2R + const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); + const DFTI_CONFIG_VALUE domain = + (fft_type == FFTTransformType::C2C) ? 
DFTI_COMPLEX : DFTI_REAL; + + DftiDescriptor descriptor; + std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); + const MKL_LONG signal_ndim = fft_sizes.size() - 1; + descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); + + // placement inplace or not inplace + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + + // number of transformations + const MKL_LONG batch_size = fft_sizes[0]; + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); + + // input & output distance + const MKL_LONG idist = in_strides[0]; + const MKL_LONG odist = out_strides[0]; + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); + + // input & output stride + std::vector mkl_in_stride(1 + signal_ndim, 0); + std::vector mkl_out_stride(1 + signal_ndim, 0); + for (MKL_LONG i = 1; i <= signal_ndim; i++) { + mkl_in_stride[i] = in_strides[i]; + mkl_out_stride[i] = out_strides[i]; + } + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); + + // conjugate even storage + if (!(fft_type == FFTTransformType::C2C)) { + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); + } + + MKL_LONG signal_numel = std::accumulate(fft_sizes.cbegin() + 1, + fft_sizes.cend(), + 1UL, + std::multiplies()); + if (normalization != FFTNormMode::none) { + const double scale = + ((normalization == FFTNormMode::by_sqrt_n) + ? 1.0 / std::sqrt(static_cast(signal_numel)) + : 1.0 / static_cast(signal_numel)); + const auto scale_direction = [&]() { + if (fft_type == FFTTransformType::R2C || + (fft_type == FFTTransformType::C2C && forward)) { + return DFTI_FORWARD_SCALE; + } else { + // (fft_type == FFTTransformType::C2R || + // (fft_type == FFTTransformType::C2C && !forward)) + return DFTI_BACKWARD_SCALE; + } + }(); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); + } + + // commit the descriptor + MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); + return descriptor; +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/fft_grad_kernel.cu b/paddle/phi/kernels/gpu/fft_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..69a95cffc3ee0e5ba71c4d96f35ad1b91ae6ccf6 --- /dev/null +++ b/paddle/phi/kernels/gpu/fft_grad_kernel.cu @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
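// get_norm_from_string in fft.h above, together with the DFTI scale chosen in
// plan_mkl_fft, implements the usual FFT normalization conventions:
// "backward" scales only the inverse transform by 1/n, "forward" scales only
// the forward transform by 1/n, and "ortho" scales both directions by
// 1/sqrt(n). A compact standalone restatement of that mapping (names here are
// illustrative, not Paddle APIs):

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <string>

enum class NormMode { none, by_sqrt_n, by_n };

NormMode norm_from_string(const std::string& norm, bool forward) {
  if (norm.empty() || norm == "backward")
    return forward ? NormMode::none : NormMode::by_n;
  if (norm == "forward")
    return forward ? NormMode::by_n : NormMode::none;
  if (norm == "ortho") return NormMode::by_sqrt_n;
  throw std::invalid_argument("norm must be 'forward', 'backward' or 'ortho'");
}

// Scale applied to the transformed result; n is the product of the
// transformed (signal) dimension sizes.
double norm_factor(NormMode mode, int64_t n) {
  switch (mode) {
    case NormMode::none:
      return 1.0;
    case NormMode::by_n:
      return 1.0 / static_cast<double>(n);
    case NormMode::by_sqrt_n:
      return 1.0 / std::sqrt(static_cast<double>(n));
  }
  return 1.0;
}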
+ +#include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c_grad, + GPU, + ALL_LAYOUT, + phi::FFTC2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL( + fft_c2r_grad, GPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} +PD_REGISTER_KERNEL(fft_r2c_grad, + GPU, + ALL_LAYOUT, + phi::FFTR2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/fft_kernel.cu b/paddle/phi/kernels/gpu/fft_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..aaa1ed0c225c600e94bacd352cc1bd17432d7c6c --- /dev/null +++ b/paddle/phi/kernels/gpu/fft_kernel.cu @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c, + GPU, + ALL_LAYOUT, + phi::FFTC2CKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_c2r, + GPU, + ALL_LAYOUT, + phi::FFTC2RKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_r2c, GPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index 6f96a697b2f2db6c2097640f34c30142939f80e0..1a574c05494fdaa14f3d8ca0f148d2f60c3964f5 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -74,4 +74,6 @@ PD_REGISTER_KERNEL(scale, int8_t, int16_t, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..02a8fbd14bca94efefddcf7ec593583d0c095971 --- /dev/null +++ b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/phi/kernels/fft_grad_kernel.h" + +#include +#include + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/pad_kernel.h" + +namespace phi { +template +void FFTC2CGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, out_grad, x_grad, axes, norm_type, !forward); +} + +template +void FFTR2CGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* x_grad) { + using R = typename T::value_type; + DenseTensor complex_x_grad = EmptyLike(ctx, x); + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + + if (!onesided) { + fft_c2c_func(ctx, out_grad, &complex_x_grad, axes, norm_type, !forward); + } else { + DenseTensor full_dy; + DenseTensorMeta full_dy_meta(out_grad.type(), x_grad->dims()); + full_dy.set_meta(full_dy_meta); + auto zero_length = static_cast(full_dy.dims().at(axes.back()) - + out_grad.dims().at(axes.back())); + auto rank = out_grad.dims().size(); + std::vector pads(rank * 2, 0); + pads[axes.back() * 2 + 1] = zero_length; + PadKernel(ctx, out_grad, pads, static_cast(0.0), &full_dy); + fft_c2c_func(ctx, full_dy, &complex_x_grad, axes, norm_type, !forward); + } + RealKernel(ctx, complex_x_grad, x_grad); +} + +template +void FFTC2RGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* x_grad) { + using C = phi::dtype::complex; + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + + funcs::FFTR2CFunctor fft_r2c_func; + fft_r2c_func(ctx, out_grad, x_grad, axes, norm_type, !forward); + + const int64_t double_length = + out_grad.dims()[axes.back()] - x_grad->dims()[axes.back()]; + const phi::DDim strides = phi::stride(x_grad->dims()); + +#if defined(__NVCC__) || defined(__HIPCC__) + const thrust::device_vector strides_g(phi::vectorize(strides)); + const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data()); +#else + const int64_t* pstrides = strides.Get(); +#endif + + funcs::FFTFillConjGradFunctor func( + x_grad->data(), axes.back(), pstrides, double_length); + size_t limit = x_grad->numel(); + funcs::ForRange for_range(ctx, limit); + for_range(func); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/fft_kernel_impl.h b/paddle/phi/kernels/impl/fft_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..d441093db07c9802e139eba4048da3d01d07ff09 --- /dev/null +++ b/paddle/phi/kernels/impl/fft_kernel_impl.h @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/kernels/fft_kernel.h" + +#include +#include + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" + +namespace phi { +template +void FFTC2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* out) { + ctx.template Alloc(out); + const auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, x, out, axes, norm_type, forward); +} + +template +void FFTC2RKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* out) { + using R = typename T::value_type; // get real type + ctx.template Alloc(out); + const auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2RFunctor fft_c2r_func; + fft_c2r_func(ctx, x, out, axes, norm_type, forward); +} + +template +void FFTR2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* out) { + using C = phi::dtype::complex; + ctx.template Alloc(out); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTR2CFunctor fft_r2c_func; + + if (onesided) { + fft_r2c_func(ctx, x, out, axes, norm_type, forward); + } else { + phi::DDim onesided_out_shape = x.dims(); + const int64_t last_fft_axis = axes.back(); + const int64_t onesided_last_axis_size = + out->dims().at(last_fft_axis) / 2 + 1; + onesided_out_shape[last_fft_axis] = onesided_last_axis_size; + DenseTensor onesided_out = + Empty(ctx, phi::vectorize(onesided_out_shape)); + fft_r2c_func(ctx, x, &onesided_out, axes, norm_type, forward); + funcs::FFTFillConj(ctx, &onesided_out, out, axes); + } +} +} // namespace phi diff --git a/python/paddle/fft.py b/python/paddle/fft.py index f44111cb766187a726dd0c94e4b1890049d6aebb..4b6a93edc447b9a4cdc758135dec53eb2bbdba44 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -17,7 +17,7 @@ import numpy as np import paddle from .tensor.attribute import is_complex, is_floating_point, is_integer from .tensor.creation import _real_to_complex_dtype, _complex_to_real_dtype -from .fluid.framework import _non_static_mode +from .fluid.framework import _in_legacy_dygraph, in_dygraph_mode from . 
import _C_ops from .fluid.data_feeder import check_variable_and_dtype from .fluid.layer_helper import LayerHelper @@ -1392,7 +1392,9 @@ def fft_c2c(x, n, axis, norm, forward, name): op_type = 'fft_c2c' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_c2c(x, axes, norm, forward) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_C_ops, op_type)(x, *attrs) else: @@ -1426,7 +1428,9 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): op_type = 'fft_r2c' check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'onesided', onesided) out = getattr(_C_ops, op_type)(x, *attrs) @@ -1469,7 +1473,12 @@ def fft_c2r(x, n, axis, norm, forward, name): op_type = 'fft_c2r' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + if n is not None: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, n) + else: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, 0) + elif _in_legacy_dygraph(): if n is not None: attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'last_dim_size', n) @@ -1528,7 +1537,9 @@ def fftn_c2c(x, s, axes, norm, forward, name): op_type = 'fft_c2c' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_c2c(x, axes, norm, forward) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_C_ops, op_type)(x, *attrs) else: @@ -1579,7 +1590,9 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): op_type = 'fft_r2c' check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'onesided', onesided) out = getattr(_C_ops, op_type)(x, *attrs) @@ -1642,7 +1655,12 @@ def fftn_c2r(x, s, axes, norm, forward, name): op_type = 'fft_c2r' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + if s is not None: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, s[-1]) + else: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, 0) + elif _in_legacy_dygraph(): if s: attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'last_dim_size', s[-1]) diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py index 3c48c99af34b585d481e3783c3364986873df8a7..756dd7889977b4f6a243d541ae8b76486ea41cf8 100644 --- a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py +++ b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py @@ -12,20 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
index 3c48c99af34b585d481e3783c3364986873df8a7..756dd7889977b4f6a243d541ae8b76486ea41cf8 100644
--- a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
+++ b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
@@ -12,20 +12,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import enum
 import numpy as np
 from functools import partial
 from numpy import asarray
 from numpy.fft._pocketfft import _raw_fft, _raw_fftnd, _get_forward_norm, _get_backward_norm, _cook_nd_args
 
 
+class NormMode(enum.Enum):
+    none = 1
+    by_sqrt_n = 2
+    by_n = 3
+
+
+def _get_norm_mode(norm, forward):
+    if norm == "ortho":
+        return NormMode.by_sqrt_n
+    if norm is None or norm == "backward":
+        return NormMode.none if forward else NormMode.by_n
+    return NormMode.by_n if forward else NormMode.none
+
+
+def _get_inv_norm(n, norm_mode):
+    assert isinstance(norm_mode,
+                      NormMode), "invalid norm_type {}".format(norm_mode)
+    if norm_mode == NormMode.none:
+        return 1.0
+    if norm_mode == NormMode.by_sqrt_n:
+        return np.sqrt(n)
+    return n
+
+
+# 1d transforms
 def _fftc2c(a, n=None, axis=-1, norm=None, forward=None):
     a = asarray(a)
     if n is None:
         n = a.shape[axis]
-    if forward:
-        inv_norm = _get_forward_norm(n, norm)
-    else:
-        inv_norm = _get_backward_norm(n, norm)
+    inv_norm = _get_inv_norm(n, norm)
     output = _raw_fft(a, n, axis, False, forward, inv_norm)
     return output
 
@@ -34,10 +57,7 @@ def _fftr2c(a, n=None, axis=-1, norm=None, forward=None):
     a = asarray(a)
     if n is None:
         n = a.shape[axis]
-    if forward:
-        inv_norm = _get_forward_norm(n, norm)
-    else:
-        inv_norm = _get_backward_norm(n, norm)
+    inv_norm = _get_inv_norm(n, norm)
     output = _raw_fft(a, n, axis, True, True, inv_norm)
     if not forward:
         output = output.conj()
@@ -48,43 +68,67 @@ def _fftc2r(a, n=None, axis=-1, norm=None, forward=None):
     a = asarray(a)
     if n is None:
         n = (a.shape[axis] - 1) * 2
-    if forward:
-        inv_norm = _get_forward_norm(n, norm)
-    else:
-        inv_norm = _get_backward_norm(n, norm)
+    inv_norm = _get_inv_norm(n, norm)
     output = _raw_fft(a.conj() if forward else a, n, axis, True, False,
                       inv_norm)
     return output
 
 
-def fft_c2c(x, axes, normalization, forward):
+# general fft functors
+def _fft_c2c_nd(x, axes, norm_mode, forward):
     f = partial(_fftc2c, forward=forward)
-    y = _raw_fftnd(x, s=None, axes=axes, function=f, norm=normalization)
+    y = _raw_fftnd(x, s=None, axes=axes, function=f, norm=norm_mode)
     return y
 
 
-def fft_c2c_backward(dy, axes, normalization, forward):
-    f = partial(_fftc2c, forward=forward)
-    dx = _raw_fftnd(dy, s=None, axes=axes, function=f, norm=normalization)
-    return dx
-
-
-def fft_r2c(x, axes, normalization, forward, onesided):
+def _fft_r2c_nd(x, axes, norm_mode, forward, onesided):
     a = asarray(x)
     s, axes = _cook_nd_args(a, axes=axes)
     if onesided:
-        a = _fftr2c(a, s[-1], axes[-1], normalization, forward)
-        for ii in range(len(axes) - 1):
-            a = _fftc2c(a, s[ii], axes[ii], normalization, forward)
+        a = _fftr2c(a, s[-1], axes[-1], norm_mode, forward)
+        a = _fft_c2c_nd(a, axes[:-1], norm_mode, forward)
     else:
-        a = fft_c2c(x, axes, normalization, forward)
+        a = _fft_c2c_nd(x, axes, norm_mode, forward)
+    return a
+
+
+def _fft_c2r_nd(x, axes, norm_mode, forward, last_dim_size):
+    a = asarray(x)
+    s, axes = _cook_nd_args(a, axes=axes, invreal=1)
+    if last_dim_size is not None:
+        s[-1] = last_dim_size
+    a = _fft_c2c_nd(a, axes[:-1], norm_mode, forward)
+    a = _fftc2r(a, s[-1], axes[-1], norm_mode, forward)
     return a
 
 
-def fft_r2c_backward(dy, x, axes, normalization, forward, onesided):
+# kernels
+def fft_c2c(x, axes, normalization, forward):
+    norm_mode = _get_norm_mode(normalization, forward)
+    return _fft_c2c_nd(x, axes, norm_mode, forward)
+
+
+def fft_c2r(x, axes, normalization, forward, last_dim_size):
+    norm_mode = _get_norm_mode(normalization, forward)
+    return _fft_c2r_nd(x, axes, norm_mode, forward, last_dim_size)
+
+
+def fft_r2c(x, axes, normalization, forward, onesided):
+    norm_mode = _get_norm_mode(normalization, forward)
+    return _fft_r2c_nd(x, axes, norm_mode, forward, onesided)
+
+
+# backward kernel
+def fft_c2c_backward(dy, axes, normalization, forward):
+    norm_mode = _get_norm_mode(normalization, forward)
+    dx = _fft_c2c_nd(dy, axes, norm_mode, not forward)
+    return dx
+
+
+def fft_r2c_backward(x, dy, axes, normalization, forward, onesided):
     a = dy
     if not onesided:
-        a = fft_c2c_backward(a, axes, normalization, forward).real
+        a = fft_c2c_backward(a, axes, normalization, forward)
     else:
         pad_widths = [(0, 0)] * a.ndim
         last_axis = axes[-1]
@@ -93,16 +137,25 @@ def fft_r2c_backward(dy, x, axes, normalization, forward, onesided):
         last_dim_size = a.shape[last_axis]
         pad_widths[last_axis] = (0, x.shape[last_axis] - last_dim_size)
         a = np.pad(a, pad_width=pad_widths)
-        a = fft_c2c_backward(a, axes, normalization, forward).real
-    return a
+        a = fft_c2c_backward(a, axes, normalization, forward)
+    return a.real
 
 
-def fft_c2r(x, axes, normalization, forward, last_dim_size):
-    a = asarray(x)
-    s, axes = _cook_nd_args(a, axes=axes, invreal=1)
-    if last_dim_size is not None:
-        s[-1] = last_dim_size
-    for ii in range(len(axes) - 1):
-        a = _fftc2c(a, s[ii], axes[ii], normalization, forward)
-    a = _fftc2r(a, s[-1], axes[-1], normalization, forward)
+def _fft_fill_conj_grad(x, axes, length_to_double):
+    last_fft_axis = axes[-1]
+    shape = x.shape
+    for multi_index in np.ndindex(*shape):
+        if 0 < multi_index[last_fft_axis] and multi_index[
+                last_fft_axis] <= length_to_double:
+            x[multi_index] *= 2
+    return x
+
+
+def fft_c2r_backward(x, dy, axes, normalization, forward, last_dim_size):
+    norm_mode = _get_norm_mode(normalization, forward)
+    a = dy
+    a = _fft_r2c_nd(dy, axes, norm_mode, not forward, True)
+    last_fft_axis = axes[-1]
+    length_to_double = dy.shape[last_fft_axis] - x.shape[last_fft_axis]
+    a = _fft_fill_conj_grad(a, axes, length_to_double)
     return a
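NormMode above records which direction of the transform absorbs the scaling (none, 1/sqrt(n), or 1/n), and _get_norm_mode maps the user-facing strings onto it per direction. The same convention can be checked against numpy's named norm modes (assumes numpy >= 1.20, which accepts 'forward' and 'backward'):

import numpy as np

x = np.random.randn(8) + 1j * np.random.randn(8)
n = x.shape[0]
raw = np.fft.fft(x, norm="backward")                                # forward pass, no scaling
assert np.allclose(np.fft.fft(x, norm="forward"), raw / n)          # by_n on the forward pass
assert np.allclose(np.fft.fft(x, norm="ortho"), raw / np.sqrt(n))   # by_sqrt_n on both passes
assert np.allclose(np.fft.ifft(raw, norm="backward"), x)            # by_n moves to the inverse pass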
diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py
index f386fdc9c34608c0909119f6cd1bddabb5f37e56..f7cc9fbf4a130f5f377568efcf0e43c385bf68af 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_fft.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py
@@ -473,7 +473,7 @@ class TestIrfft2(unittest.TestCase):
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
     ('test_bool_input',
      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
-         np.bool_), None, -1, 'backward', NotImplementedError),
+         np.bool_), None, -1, 'backward', RuntimeError),
     ('test_n_nagative',
      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1,
      'backward', ValueError),
     ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
@@ -543,7 +543,7 @@ class TestIrfftException(unittest.TestCase):
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [('test_bool_input',
       (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
-          np.bool_), None, (-2, -1), 'backward', NotImplementedError),
+          np.bool_), None, (-2, -1), 'backward', RuntimeError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
@@ -625,7 +625,7 @@ class TestIrfft2Exception(unittest.TestCase):
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [('test_bool_input',
       (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
-          np.bool_), None, (-2, -1), 'backward', NotImplementedError),
+          np.bool_), None, (-2, -1), 'backward', RuntimeError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
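The operator tests below replace the old no_need_check_grad shortcut with user_defined_grads computed by the numpy references above; for the C2C op the reference gradient is simply the transform run in the opposite direction under the same NormMode. A self-contained numpy sketch of the simplest case (norm='backward', forward=True), where that amounts to an unscaled inverse DFT; the names are illustrative only:

import numpy as np

dy = np.random.randn(8) + 1j * np.random.randn(8)    # incoming gradient w.r.t. Out
# NormMode.none plus the reversed direction is an inverse DFT without the usual 1/n factor
x_grad_ref = np.fft.ifft(dy) * dy.shape[0]
# equivalently, the conjugate of the forward DFT of the conjugated gradient
assert np.allclose(x_grad_ref, np.fft.fft(dy.conj()).conj())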
diff --git a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
index ba4092965920bc3f1863132a1adce85f4c75f3f0..066869750ed0742af7c529ce6a4b29d2d6aca012 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
@@ -20,12 +20,13 @@ import paddle
 import re
 import sys
 
-from spectral_op_np import fft_c2c, fft_r2c, fft_c2r
+from spectral_op_np import fft_c2c, fft_r2c, fft_c2r, fft_c2c_backward, fft_r2c_backward, fft_c2r_backward
 import paddle.fluid.core as core
 import paddle.fluid.dygraph as dg
 import paddle.static as static
 from numpy.random import random as rand
 from paddle.fluid import Program, program_guard
+from paddle import _C_ops
 
 sys.path.append("../")
 from op_test import OpTest
@@ -73,14 +74,26 @@ def class_name(cls, num, params_dict):
     return "{}_{}{}".format(cls.__name__, num, suffix and "_" + suffix)
 
 
+def fft_c2c_python_api(x, axes, norm, forward):
+    return _C_ops.final_state_fft_c2c(x, axes, norm, forward)
+
+
+def fft_r2c_python_api(x, axes, norm, forward, onesided):
+    return _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided)
+
+
+def fft_c2r_python_api(x, axes, norm, forward, last_dim_size=0):
+    return _C_ops.final_state_fft_c2r(x, axes, norm, forward, last_dim_size)
+
+
 @parameterize(
     (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward'),
     [('test_axes_is_sqe_type', (np.random.random(
         (12, 14)) + 1j * np.random.random(
             (12, 14))).astype(np.complex128), [0, 1], 'forward', True),
      ('test_axis_not_last', (np.random.random(
-        (4, 4, 4)) + 1j * np.random.random(
-            (4, 4, 4))).astype(np.complex128), (0, 1), "backward", False),
+        (4, 8, 4)) + 1j * np.random.random(
+            (4, 8, 4))).astype(np.complex128), (0, 1), "backward", False),
      ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (0, ), "forward", False),
      ('test_norm_backward', (np.random.random((12, 14)) + 1j * np.random.random(
@@ -88,11 +101,11 @@
      ('test_norm_ortho', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (1, ), "ortho", True)])
 class TestFFTC2COp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check.
-    no_need_check_grad = True
 
     def setUp(self):
         self.op_type = "fft_c2c"
+        self.dtype = self.x.dtype
+        self.python_api = fft_c2c_python_api
 
         out = fft_c2c(self.x, self.axes, self.norm, self.forward)
 
@@ -104,8 +117,21 @@
         }
         self.outputs = {'Out': out}
 
+        self.out_grad = (np.random.random(self.x.shape) +
+                         1j * np.random.random(self.x.shape)).astype(
+                             self.x.dtype)
+        self.x_grad = fft_c2c_backward(self.out_grad, self.axes, self.norm,
+                                       self.forward)
+
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
+
+    def test_check_grad(self):
+        self.check_grad("X",
+                        "Out",
+                        user_defined_grads=[self.x_grad],
+                        user_defined_grad_outputs=[self.out_grad],
+                        check_eager=True)
 
 
 @parameterize(
@@ -114,7 +140,7 @@
     (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'last_dim_size'),
     [('test_axes_is_sqe_type', (np.random.random(
         (12, 14)) + 1j * np.random.random(
             (12, 14))).astype(np.complex128), [0, 1], 'forward', True, 26),
      ('test_axis_not_last', (np.random.random(
-        (4, 4, 4)) + 1j * np.random.random((4, 4, 4))).astype(np.complex128),
+        (4, 7, 4)) + 1j * np.random.random((4, 7, 4))).astype(np.complex128),
      (0, 1), "backward", False, None),
      ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (0, ), "forward", False, 22),
@@ -123,11 +149,11 @@
      ('test_norm_ortho', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (1, ), "ortho", True, 26)])
 class TestFFTC2ROp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check.
-    no_need_check_grad = True
 
     def setUp(self):
         self.op_type = "fft_c2r"
+        self.dtype = self.x.dtype
+        self.python_api = fft_c2r_python_api
 
         out = fft_c2r(self.x, self.axes, self.norm, self.forward,
                       self.last_dim_size)
 
@@ -141,28 +167,40 @@
         }
         self.outputs = {'Out': out}
 
+        self.out_grad = np.random.random(out.shape).astype(out.dtype)
+        self.x_grad = fft_c2r_backward(self.x, self.out_grad, self.axes,
+                                       self.norm, self.forward,
+                                       self.last_dim_size)
+
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
+
+    def test_check_grad(self):
+        self.check_grad(["X"],
+                        "Out",
+                        user_defined_grads=[self.x_grad],
+                        user_defined_grad_outputs=[self.out_grad],
+                        check_eager=True)
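The C2R reference gradient used here (fft_c2r_backward above) runs the opposite-direction one-sided transform and then doubles the interior bins via _fft_fill_conj_grad, since each interior bin of a one-sided spectrum stands for a conjugate pair in the full spectrum. A standalone numpy check of that pairing for an even-length signal (illustration only, not the OpTest machinery):

import numpy as np

x = np.random.randn(8)
full = np.fft.fft(x)
half = np.fft.rfft(x)                         # bins 0 .. n//2
weights = np.ones(half.shape)
weights[1:-1] = 2.0                           # interior bins appear twice in the full spectrum
# DC and (for even n) the Nyquist bin appear once, so they keep weight 1
assert np.allclose(np.sum(np.abs(full) ** 2),
                   np.sum(weights * np.abs(half) ** 2))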
 
 
 @parameterize(
     (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'onesided'),
-    [('test_axes_is_sqe_type', np.random.randn(12, 14).astype(np.float64),
+    [('test_axes_is_sqe_type', np.random.randn(12, 18).astype(np.float64),
      (0, 1), 'forward', True, True),
-     ('test_axis_not_last', np.random.randn(4, 4, 4).astype(np.float64),
+     ('test_axis_not_last', np.random.randn(4, 8, 4).astype(np.float64),
      (0, 1), "backward", False, True),
-     ('test_norm_forward', np.random.randn(12, 14).astype(np.float64),
+     ('test_norm_forward', np.random.randn(12, 18).astype(np.float64),
      (0, 1), "forward", False, False),
-     ('test_norm_backward', np.random.randn(12, 14).astype(np.float64),
+     ('test_norm_backward', np.random.randn(12, 18).astype(np.float64),
      (0, ), "backward", True, False),
-     ('test_norm_ortho', np.random.randn(12, 14).astype(np.float64),
+     ('test_norm_ortho', np.random.randn(12, 18).astype(np.float64),
      (1, ), "ortho", True, False)])
 class TestFFTR2COp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check.
-    no_need_check_grad = True
 
     def setUp(self):
         self.op_type = "fft_r2c"
+        self.dtype = self.x.dtype
+        self.python_api = fft_r2c_python_api
 
         out = fft_r2c(self.x, self.axes, self.norm, self.forward,
                       self.onesided)
 
@@ -175,5 +213,16 @@ class TestFFTR2COp(OpTest):
         }
         self.outputs = {'Out': out}
 
+        self.out_grad = np.random.random(out.shape).astype(out.dtype)
+        self.x_grad = fft_r2c_backward(self.x, self.out_grad, self.axes,
+                                       self.norm, self.forward, self.onesided)
+
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
+
+    def test_check_grad(self):
+        self.check_grad("X",
+                        "Out",
+                        user_defined_grads=[self.x_grad],
+                        user_defined_grad_outputs=[self.out_grad],
+                        check_eager=True)
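For the onesided cases exercised by TestFFTR2COp, fft_r2c_backward zero-pads the output gradient back to the full last-axis length, applies the C2C backward transform, and keeps the real part. A numpy sketch of that recipe for the simplest normalization (norm='backward', forward=True); the variable names are illustrative and not part of the test:

import numpy as np

x = np.random.randn(8)                              # forward input (real)
dy = np.random.randn(5) + 1j * np.random.randn(5)   # grad w.r.t. the one-sided output
padded = np.pad(dy, (0, x.shape[0] - dy.shape[0]))  # restore the dropped bins with zeros
# opposite-direction transform with no scaling, then keep the real part
x_grad_ref = (np.fft.ifft(padded) * padded.shape[0]).real
assert x_grad_ref.shape == x.shape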