From 319f95d03471fce306a2e1af9d22ca81ae1d2e65 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 23 Mar 2022 13:18:59 +0800 Subject: [PATCH] Add complex type compatibility for stft api and stft op. (#40113) * Add stft_op. * Add stft_grad_op. * Add stft_op unittest. * [DLTP-45176] Add complex compatibility in static mode for stft api. * [DLTP-45176] Add complex compatibility in static mode for stft api. * Add doc. * Update unitests of stft op. * Update spectral helper. * fix coding style. --- paddle/fluid/operators/frame_op.cc | 20 +- paddle/fluid/operators/mean_op.cc | 12 +- paddle/fluid/operators/mean_op.cu | 11 +- paddle/fluid/operators/overlap_add_op.cc | 23 +- paddle/fluid/operators/spectral_helper.h | 806 +++++++-------- paddle/fluid/operators/spectral_op.cc | 482 +-------- paddle/fluid/operators/spectral_op.cu | 489 +-------- paddle/fluid/operators/spectral_op.cu.h | 944 ++++++++++++++++++ paddle/fluid/operators/spectral_op.h | 5 + paddle/fluid/operators/stft_op.cc | 154 +++ paddle/fluid/operators/stft_op.cu | 26 + paddle/fluid/operators/stft_op.h | 157 +++ paddle/phi/kernels/cpu/pad3d_kernel.cc | 13 +- paddle/phi/kernels/gpu/pad3d_kernel.cu | 5 +- .../fluid/layers/layer_function_generator.py | 7 +- python/paddle/fluid/layers/nn.py | 27 +- python/paddle/fluid/layers/tensor.py | 2 +- .../fluid/tests/unittests/test_stft_op.py | 84 ++ python/paddle/signal.py | 49 +- python/paddle/tensor/linalg.py | 4 +- 20 files changed, 1901 insertions(+), 1419 deletions(-) create mode 100644 paddle/fluid/operators/spectral_op.cu.h create mode 100644 paddle/fluid/operators/stft_op.cc create mode 100644 paddle/fluid/operators/stft_op.cu create mode 100644 paddle/fluid/operators/stft_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_stft_op.py diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc index 9aa71e0944..2ff9beb36f 100644 --- a/paddle/fluid/operators/frame_op.cc +++ b/paddle/fluid/operators/frame_op.cc @@ -64,18 +64,26 @@ class FrameOp : public framework::OperatorWithKernel { end_axis = x_rank - 2; } - PADDLE_ENFORCE_LE(frame_length, seq_length, - platform::errors::InvalidArgument( - "Attribute(frame_length) of FrameOp should be less " - "equal than sequence length, but got (%s) > (%s).", - frame_length, seq_length)); + bool contain_unknown_dim = phi::contain_unknown_dim(x_dims); + bool check = ctx->IsRuntime() || !contain_unknown_dim; + if (check) { + PADDLE_ENFORCE_LE(frame_length, seq_length, + platform::errors::InvalidArgument( + "Attribute(frame_length) of FrameOp should be less " + "equal than sequence length, but got (%s) > (%s).", + frame_length, seq_length)); + } // It won't go into for loop when x_rank == 1U. for (int i = start_axis; i <= end_axis; i++) { output_shape.push_back(x_dims[i]); } - n_frames = 1 + (seq_length - frame_length) / hop_length; + if (seq_length == -1) { + n_frames = -1; + } else { + n_frames = 1 + (seq_length - frame_length) / hop_length; + } if (axis == 0) { // (n_frames, frame_length, ...) 
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 83fe1aa6dd..785b16ae28 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -98,9 +98,17 @@ REGISTER_OP_CPU_KERNEL( mean, ops::MeanKernel, ops::MeanKernel, ops::MeanKernel); + paddle::platform::bfloat16>, + ops::MeanKernel>, + ops::MeanKernel>); REGISTER_OP_CPU_KERNEL( mean_grad, ops::MeanGradKernel, ops::MeanGradKernel, ops::MeanGradKernel); + paddle::platform::bfloat16>, + ops::MeanGradKernel>, + ops::MeanGradKernel>); diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 01a5632a96..e8964765ec 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -102,10 +102,17 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( mean, ops::MeanCUDAKernel, ops::MeanCUDAKernel, - ops::MeanCUDAKernel); + ops::MeanCUDAKernel, + ops::MeanCUDAKernel>, + ops::MeanCUDAKernel>); REGISTER_OP_CUDA_KERNEL( mean_grad, ops::MeanCUDAGradKernel, ops::MeanCUDAGradKernel, + ops::MeanCUDAGradKernel, ops::MeanCUDAGradKernel); + paddle::platform::complex>, + ops::MeanCUDAGradKernel>); diff --git a/paddle/fluid/operators/overlap_add_op.cc b/paddle/fluid/operators/overlap_add_op.cc index adae2c8f8a..0e6f0f8422 100644 --- a/paddle/fluid/operators/overlap_add_op.cc +++ b/paddle/fluid/operators/overlap_add_op.cc @@ -54,6 +54,7 @@ class OverlapAddOp : public framework::OperatorWithKernel { std::vector output_shape; int n_frames; int frame_length; + int seq_length; int start_axis; int end_axis; @@ -69,14 +70,22 @@ class OverlapAddOp : public framework::OperatorWithKernel { end_axis = x_rank - 3; } - PADDLE_ENFORCE_LE( - hop_length, frame_length, - platform::errors::InvalidArgument( - "Attribute(hop_length) of OverlapAddOp should be less or equal " - "than frame_length, but got hop_length(%s) > frame_length(%s).", - hop_length, frame_length)); + bool contain_unknown_dim = phi::contain_unknown_dim(x_dims); + bool check = ctx->IsRuntime() || !contain_unknown_dim; + if (check) { + PADDLE_ENFORCE_LE( + hop_length, frame_length, + platform::errors::InvalidArgument( + "Attribute(hop_length) of OverlapAddOp should be less or equal " + "than frame_length, but got hop_length(%s) > frame_length(%s).", + hop_length, frame_length)); + } - const int seq_length = (n_frames - 1) * hop_length + frame_length; + if (n_frames == -1) { + seq_length = -1; + } else { + seq_length = (n_frames - 1) * hop_length + frame_length; + } // It won't go into for loop when x_rank == 2U. for (int i = start_axis; i <= end_axis; i++) { diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h index 3963976824..c9889ad539 100644 --- a/paddle/fluid/operators/spectral_helper.h +++ b/paddle/fluid/operators/spectral_helper.h @@ -16,451 +16,469 @@ #include "paddle/fluid/operators/spectral_op.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/hipfft.h" -#endif - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/dynload/cufft.h" +#if defined(PADDLE_WITH_ONEMKL) +#include "paddle/phi/backends/dynload/mklrt.h" +#elif defined(PADDLE_WITH_POCKETFFT) +#include "extern_pocketfft/pocketfft_hdronly.h" #endif namespace paddle { namespace operators { -using ScalarType = framework::proto::VarType::Type; -const int64_t kMaxFFTNdim = 3; -const int64_t kMaxDataNdim = kMaxFFTNdim + 1; -// This struct is used to easily compute hashes of the -// parameters. It will be the **key** to the plan cache. 
-struct FFTConfigKey { - // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3 - int64_t signal_ndim_; - // These include additional batch dimension as well. - int64_t sizes_[kMaxDataNdim]; - int64_t input_shape_[kMaxDataNdim]; - int64_t output_shape_[kMaxDataNdim]; - FFTTransformType fft_type_; - ScalarType value_type_; - - FFTConfigKey() = default; - - FFTConfigKey(const std::vector& in_shape, - const std::vector& out_shape, - const std::vector& signal_size, - FFTTransformType fft_type, ScalarType value_type) { - // Padding bits must be zeroed for hashing - memset(this, 0, sizeof(*this)); - signal_ndim_ = signal_size.size() - 1; - fft_type_ = fft_type; - value_type_ = value_type; - - std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); - std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); - std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); - } -}; - -#if defined(PADDLE_WITH_CUDA) -// An RAII encapsulation of cuFFTHandle -class CuFFTHandle { - ::cufftHandle handle_; - public: - CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); - } - - CuFFTHandle(const CuFFTHandle& other) = delete; - CuFFTHandle& operator=(const CuFFTHandle& other) = delete; +using Tensor = framework::Tensor; - CuFFTHandle(CuFFTHandle&& other) = delete; - CuFFTHandle& operator=(CuFFTHandle&& other) = delete; +// FFT Functors +#if defined(PADDLE_WITH_ONEMKL) - ::cufftHandle& get() { return handle_; } - const ::cufftHandle& get() const { return handle_; } +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ + } while (0); - ~CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); +struct DftiDescriptorDeleter { + void operator()(DFTI_DESCRIPTOR_HANDLE handle) { + if (handle != nullptr) { + MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); + } } }; -using plan_size_type = long long int; // NOLINT -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { +// A RAII wrapper for MKL_DESCRIPTOR* +class DftiDescriptor { public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
- explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, const int64_t signal_ndim, - FFTTransformType fft_type, ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, sizes.size() - 1)); - - cudaDataType itype, otype, exec_type; - const auto complex_input = has_complex_input(fft_type); - const auto complex_output = has_complex_output(fft_type); - if (dtype == framework::proto::VarType::FP32) { - itype = complex_input ? CUDA_C_32F : CUDA_R_32F; - otype = complex_output ? CUDA_C_32F : CUDA_R_32F; - exec_type = CUDA_C_32F; - } else if (dtype == framework::proto::VarType::FP64) { - itype = complex_input ? CUDA_C_64F : CUDA_R_64F; - otype = complex_output ? CUDA_C_64F : CUDA_R_64F; - exec_type = CUDA_C_64F; - } else if (dtype == framework::proto::VarType::FP16) { - itype = complex_input ? CUDA_C_16F : CUDA_R_16F; - otype = complex_output ? CUDA_C_16F : CUDA_R_16F; - exec_type = CUDA_C_16F; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "cuFFT only support transforms of type float16, float32 and " - "float64")); - } - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, - batch, &ws_size_t, exec_type)); - - ws_size = ws_size_t; + void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, + MKL_LONG signal_ndim, MKL_LONG* sizes) { + PADDLE_ENFORCE_EQ(desc_.get(), nullptr, + platform::errors::AlreadyExists( + "DftiDescriptor has already been initialized.")); + + DFTI_DESCRIPTOR* raw_desc; + MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( + &raw_desc, precision, signal_type, signal_ndim, sizes)); + desc_.reset(raw_desc); } - FFTConfig(const FFTConfig& other) = delete; - FFTConfig& operator=(const FFTConfig& other) = delete; - - FFTConfig(FFTConfig&& other) = delete; - FFTConfig& operator=(FFTConfig&& other) = delete; - - const cufftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } + DFTI_DESCRIPTOR* get() const { + DFTI_DESCRIPTOR* raw_desc = desc_.get(); + PADDLE_ENFORCE_NOT_NULL(raw_desc, + platform::errors::PreconditionNotMet( + "DFTI DESCRIPTOR has not been initialized.")); + return raw_desc; + } private: - CuFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; + std::unique_ptr desc_; }; -#elif defined(PADDLE_WITH_HIP) -// An RAII encapsulation of cuFFTHandle 
-class HIPFFTHandle { - ::hipfftHandle handle_; - - public: - HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); +static DftiDescriptor _plan_mkl_fft( + const framework::proto::VarType::Type& in_dtype, + const framework::proto::VarType::Type& out_dtype, + const framework::DDim& in_strides, const framework::DDim& out_strides, + const std::vector& signal_sizes, FFTNormMode normalization, + bool forward) { + const DFTI_CONFIG_VALUE precision = [&] { + switch (in_dtype) { + case framework::proto::VarType::FP32: + return DFTI_SINGLE; + case framework::proto::VarType::COMPLEX64: + return DFTI_SINGLE; + case framework::proto::VarType::FP64: + return DFTI_DOUBLE; + case framework::proto::VarType::COMPLEX128: + return DFTI_DOUBLE; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid input datatype (%s), input data type should be FP32, " + "FP64, COMPLEX64 or COMPLEX128.", + framework::DataTypeToString(in_dtype))); + } + }(); + + // C2C, R2C, C2R + const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); + const DFTI_CONFIG_VALUE domain = + (fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL; + + DftiDescriptor descriptor; + std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); + const MKL_LONG signal_ndim = fft_sizes.size() - 1; + descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); + + // placement inplace or not inplace + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT, + DFTI_NOT_INPLACE)); + + // number of transformations + const MKL_LONG batch_size = fft_sizes[0]; + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); + + // input & output distance + const MKL_LONG idist = in_strides[0]; + const MKL_LONG odist = out_strides[0]; + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), + DFTI_OUTPUT_DISTANCE, odist)); + + // input & output stride + std::vector mkl_in_stride(1 + signal_ndim, 0); + std::vector mkl_out_stride(1 + signal_ndim, 0); + for (MKL_LONG i = 1; i <= signal_ndim; i++) { + mkl_in_stride[i] = in_strides[i]; + mkl_out_stride[i] = out_strides[i]; } - - HIPFFTHandle(const HIPFFTHandle& other) = delete; - HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; - - HIPFFTHandle(HIPFFTHandle&& other) = delete; - HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; - - ::hipfftHandle& get() { return handle_; } - const ::hipfftHandle& get() const { return handle_; } - - ~HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); + + // conjugate even storage + if (!(fft_type == FFTTransformType::C2C)) { + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); } -}; -using plan_size_type = int; -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
- explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, const int64_t signal_ndim, - FFTTransformType fft_type, ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, sizes.size() - 1)); - - hipfftType exec_type = [&] { - if (dtype == framework::proto::VarType::FP32) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_C2C; - case FFTTransformType::R2C: - return HIPFFT_R2C; - case FFTTransformType::C2R: - return HIPFFT_C2R; - } - } else if (dtype == framework::proto::VarType::FP64) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_Z2Z; - case FFTTransformType::R2C: - return HIPFFT_D2Z; - case FFTTransformType::C2R: - return HIPFFT_Z2D; - } + + MKL_LONG signal_numel = + std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL, + std::multiplies()); + if (normalization != FFTNormMode::none) { + const double scale = + ((normalization == FFTNormMode::by_sqrt_n) + ? 1.0 / std::sqrt(static_cast(signal_numel)) + : 1.0 / static_cast(signal_numel)); + const auto scale_direction = [&]() { + if (fft_type == FFTTransformType::R2C || + (fft_type == FFTTransformType::C2C && forward)) { + return DFTI_FORWARD_SCALE; + } else { + // (fft_type == FFTTransformType::C2R || + // (fft_type == FFTTransformType::C2C && !forward)) + return DFTI_BACKWARD_SCALE; } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); }(); - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, - batch, &ws_size_t)); - - ws_size = ws_size_t; + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); } - const hipfftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } + // commit the descriptor + MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); + return descriptor; +} - private: - HIPFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; -#endif +// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) +template +void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, + const std::vector& axes, FFTNormMode normalization, + bool forward) { + const framework::DDim& in_sizes = x->dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = 
axes.size(); + const int batch_ndim = ndim - signal_ndim; + const framework::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), + [&](size_t axis) { return !is_transformed_dim[axis]; }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to that permutation + framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute); + std::vector transposed_input_shape_ = + phi::vectorize(transposed_input_shape); + framework::Tensor transposed_input; + transposed_input.Resize(transposed_input_shape); + const auto place = ctx.GetPlace(); + transposed_input.mutable_data(place); + TransCompute(ndim, ctx, *x, &transposed_input, + dim_permute); + + // make an collapsed input: collapse batch axes for input + const int batch_size = std::accumulate( + transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim, + 1L, std::multiplies()); + std::vector collapsed_input_shape_(1 + signal_ndim); + collapsed_input_shape_[0] = batch_size; + std::copy(transposed_input_shape_.begin() + batch_ndim, + transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1); + const framework::DDim collapsed_input_shape = + phi::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + framework::Tensor& collapsed_input = transposed_input; + + // make a collapsed output + std::vector collapsed_output_shape_(1 + signal_ndim); + collapsed_output_shape_[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_[1 + i] = out_sizes[axes[i]]; + } + const framework::DDim collapsed_output_shape = + phi::make_ddim(collapsed_output_shape_); + framework::Tensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + collapsed_output.mutable_data(place, out->type()); + + // signal sizes + std::vector signal_sizes(1 + signal_ndim); + signal_sizes[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + signal_sizes[1 + i] = + std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); + } -// Hashing machinery for Key -// Fowler–Noll–Vo hash function -// see -// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function -template -struct KeyHash { - // Key must be a POD because we read out its memory - // contenst as char* when hashing - static_assert(std::is_pod::value, "Key must be plain old data type"); - - size_t operator()(const Key& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (int i = 0; i < static_cast(sizeof(Key)); ++i) { - value ^= ptr[i]; - value *= 0x01000193; + // input & output stride + const framework::DDim input_stride = phi::stride(collapsed_input_shape); + const framework::DDim output_stride = phi::stride(collapsed_output_shape); + + // make a DFTI_DESCRIPTOR + DftiDescriptor desc = + _plan_mkl_fft(framework::TransToProtoVarType(x->dtype()), + framework::TransToProtoVarType(out->dtype()), input_stride, + output_stride, signal_sizes, normalization, forward); + + const FFTTransformType fft_type = + GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), + framework::TransToProtoVarType(out->type())); + if (fft_type == FFTTransformType::C2R && forward) { + framework::Tensor collapsed_input_conj(collapsed_input.dtype()); + 
collapsed_input_conj.mutable_data(collapsed_input.dims(), + ctx.GetPlace()); + // conjugate the input + platform::ForRange for_range(ctx, collapsed_input.numel()); + phi::funcs::ConjFunctor functor(collapsed_input.data(), + collapsed_input.numel(), + collapsed_input_conj.data()); + for_range(functor); + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input_conj.data(), collapsed_output.data())); + } else if (fft_type == FFTTransformType::R2C && !forward) { + framework::Tensor collapsed_output_conj(collapsed_output.dtype()); + collapsed_output_conj.mutable_data(collapsed_output.dims(), + ctx.GetPlace()); + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output_conj.data())); + // conjugate the output + platform::ForRange for_range(ctx, collapsed_output.numel()); + phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), + collapsed_output.numel(), + collapsed_output.data()); + for_range(functor); + } else { + if (forward) { + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } else { + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), collapsed_output.data())); } - return static_cast(value); } -}; -template -struct KeyEqual { - // Key must be a POD because we read out its memory - // contenst as char* when comparing - static_assert(std::is_pod::value, "Key must be plain old data type"); + // resize for the collapsed output + framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + collapsed_output.Resize(transposed_output_shape); + framework::Tensor& transposed_output = collapsed_output; - bool operator()(const Key& a, const Key& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(Key)) == 0; + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; } -}; - -#if CUDA_VERSION < 10000 -// Note that the max plan number for CUDA version < 10 has to be 1023 -// due to a bug that fails on the 1024th plan -constexpr size_t CUFFT_MAX_PLAN_NUM = 1023; -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; -#else -constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); -// The default max cache size chosen for CUDA version > 10 is arbitrary. -// This number puts a limit on how big of a plan cache should we maintain by -// default. Users can always configure it via cufft_set_plan_cache_max_size. -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; -#endif -static_assert(CUFFT_MAX_PLAN_NUM >= 0 && - CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), - "CUFFT_MAX_PLAN_NUM not in size_t range"); -static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && - CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, - "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); - -// This cache assumes that the mapping from key to value never changes. -// This is **NOT** thread-safe. Please use a mutex when using it **AND** the -// value returned from try_emplace_value. -// The contract of using this cache is that try_emplace_value should only be -// used when the max_size is positive. 
-class FFTConfigCache { - public: - using kv_t = typename std::pair; - using map_t = typename std::unordered_map< - std::reference_wrapper, typename std::list::iterator, - KeyHash, KeyEqual>; - using map_kkv_iter_t = typename map_t::iterator; - - FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} - - explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } - - FFTConfigCache(const FFTConfigCache& other) = delete; - FFTConfigCache& operator=(const FFTConfigCache& other) = delete; - - FFTConfigCache(FFTConfigCache&& other) noexcept - : _usage_list(std::move(other._usage_list)), - _cache_map(std::move(other._cache_map)), - _max_size(other._max_size) {} + TransCompute(ndim, ctx, transposed_output, + out, reverse_dim_permute); +} - FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { - _usage_list = std::move(other._usage_list); - _cache_map = std::move(other._cache_map); - _max_size = other._max_size; - return *this; +template +struct FFTC2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + exec_fft(ctx, x, out, axes, + normalization, forward); } +}; - // If key is in this cache, return the cached config. Otherwise, emplace the - // config in this cache and return it. - FFTConfig& lookup(FFTConfigKey params) { - PADDLE_ENFORCE_GT(_max_size, 0, - platform::errors::InvalidArgument( - "The max size of FFTConfigCache must be great than 0," - "But received is [%d]", - _max_size)); - - map_kkv_iter_t map_it = _cache_map.find(params); - // Hit, put to list front - if (map_it != _cache_map.end()) { - _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); - return map_it->second->second; - } +template +struct FFTR2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + exec_fft(ctx, x, out, axes, + normalization, forward); + } +}; - // Miss - // remove if needed - if (_usage_list.size() >= _max_size) { - auto last = _usage_list.end(); - last--; - _cache_map.erase(last->first); - _usage_list.pop_back(); +template +struct FFTC2RFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + if (axes.size() > 1) { + const std::vector c2c_dims(axes.begin(), axes.end() - 1); + Tensor temp; + temp.mutable_data(x->dims(), ctx.GetPlace()); + + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); + + const std::vector new_axes{axes.back()}; + exec_fft(ctx, &temp, out, new_axes, + normalization, forward); + } else { + exec_fft(ctx, x, out, axes, + normalization, forward); } - - // construct new plan at list front, then insert into _cache_map - _usage_list.emplace_front(std::piecewise_construct, - std::forward_as_tuple(params), - std::forward_as_tuple(params)); - auto kv_it = _usage_list.begin(); - _cache_map.emplace(std::piecewise_construct, - std::forward_as_tuple(kv_it->first), - std::forward_as_tuple(kv_it)); - return kv_it->second; } - - void clear() { - _cache_map.clear(); - _usage_list.clear(); +}; +#elif defined(PADDLE_WITH_POCKETFFT) + +template +T compute_factor(int64_t size, FFTNormMode normalization) { + constexpr auto one = static_cast(1); + switch (normalization) { + case FFTNormMode::none: + return one; + case FFTNormMode::by_n: + return one / static_cast(size); + case 
FFTNormMode::by_sqrt_n: + return one / std::sqrt(static_cast(size)); } + PADDLE_THROW( + platform::errors::InvalidArgument("Unsupported normalization type")); +} - void resize(int64_t new_size) { - _set_max_size(new_size); - auto cur_size = _usage_list.size(); - if (cur_size > _max_size) { - auto delete_it = _usage_list.end(); - for (size_t i = 0; i < cur_size - _max_size; i++) { - delete_it--; - _cache_map.erase(delete_it->first); - } - _usage_list.erase(delete_it, _usage_list.end()); +template +struct FFTC2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = typename Ti::value_type; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + + const auto* in_data = reinterpret_cast(x->data()); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= in_sizes[i]; } + R factor = compute_factor(signal_numel, normalization); + pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data, + out_data, factor); } +}; - size_t size() const { return _cache_map.size(); } - - size_t max_size() const noexcept { return _max_size; } +template +struct FFTR2CFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = Ti; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } - std::mutex mutex; + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(out_strides.begin(), out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } - private: - // Only sets size and does value check. Does not resize the data structures. - void _set_max_size(int64_t new_size) { - // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since - // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check - // first. 
- PADDLE_ENFORCE_GE( - new_size, 0, - platform::errors::InvalidArgument( - "cuFFT plan cache size must be non-negative, But received is [%d]", - new_size)); - PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM, - platform::errors::InvalidArgument( - "cuFFT plan cache size can not be larger than [%d], " - "But received is [%d]", - CUFFT_MAX_PLAN_NUM, new_size)); - _max_size = static_cast(new_size); + const auto* in_data = x->data(); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= in_sizes[i]; + } + R factor = compute_factor(signal_numel, normalization); + pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data, + out_data, factor); } - - std::list _usage_list; - map_t _cache_map; - size_t _max_size; }; -static std::vector> plan_caches; -static std::mutex plan_caches_mutex; - -static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { - std::lock_guard guard(plan_caches_mutex); +template +struct FFTC2RFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + using R = To; + using C = std::complex; + + const auto& input_dim = x->dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } - if (device_index >= plan_caches.size()) { - plan_caches.resize(device_index + 1); - } + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(out_strides.begin(), out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } - if (!plan_caches[device_index]) { - plan_caches[device_index] = std::make_unique(); + const auto* in_data = reinterpret_cast(x->data()); + auto* out_data = out->data(); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + int64_t signal_numel = 1; + for (auto i : axes) { + signal_numel *= out_sizes[i]; + } + R factor = compute_factor(signal_numel, normalization); + pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data, + out_data, factor); } +}; - return *plan_caches[device_index]; -} +#endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index db3dc214bf..0270f7e057 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -13,28 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/spectral_op.h" - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -#if defined(PADDLE_WITH_ONEMKL) -#include "paddle/phi/backends/dynload/mklrt.h" -#elif defined(PADDLE_WITH_POCKETFFT) -#include "extern_pocketfft/pocketfft_hdronly.h" -#endif - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/operators/spectral_helper.h" namespace paddle { namespace operators { @@ -355,465 +334,6 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { norm)); } -// FFT Functors -#if defined(PADDLE_WITH_ONEMKL) - -#define MKL_DFTI_CHECK(expr) \ - do { \ - MKL_LONG status = (expr); \ - if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW( \ - platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ - } while (0); - -namespace { - -struct DftiDescriptorDeleter { - void operator()(DFTI_DESCRIPTOR_HANDLE handle) { - if (handle != nullptr) { - MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); - } - } -}; - -// A RAII wrapper for MKL_DESCRIPTOR* -class DftiDescriptor { - public: - void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, - MKL_LONG signal_ndim, MKL_LONG* sizes) { - PADDLE_ENFORCE_EQ(desc_.get(), nullptr, - platform::errors::AlreadyExists( - "DftiDescriptor has already been initialized.")); - - DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( - &raw_desc, precision, signal_type, signal_ndim, sizes)); - desc_.reset(raw_desc); - } - - DFTI_DESCRIPTOR* get() const { - DFTI_DESCRIPTOR* raw_desc = desc_.get(); - PADDLE_ENFORCE_NOT_NULL(raw_desc, - platform::errors::PreconditionNotMet( - "DFTI DESCRIPTOR has not been initialized.")); - return raw_desc; - } - - private: - std::unique_ptr desc_; -}; - -DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, - const framework::proto::VarType::Type& out_dtype, - const framework::DDim& in_strides, - const framework::DDim& out_strides, - const std::vector& signal_sizes, - FFTNormMode normalization, bool forward) { - const DFTI_CONFIG_VALUE precision = [&] { - switch (in_dtype) { - case framework::proto::VarType::FP32: - return DFTI_SINGLE; - case framework::proto::VarType::COMPLEX64: - return DFTI_SINGLE; - case framework::proto::VarType::FP64: - return DFTI_DOUBLE; - case framework::proto::VarType::COMPLEX128: - return DFTI_DOUBLE; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input datatype (%s), input data type should be FP32, " - "FP64, COMPLEX64 or COMPLEX128.", - framework::DataTypeToString(in_dtype))); - } - }(); - - // C2C, R2C, C2R - const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); - const DFTI_CONFIG_VALUE domain = - (fft_type == FFTTransformType::C2C) ? 
DFTI_COMPLEX : DFTI_REAL; - - DftiDescriptor descriptor; - std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); - const MKL_LONG signal_ndim = fft_sizes.size() - 1; - descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); - - // placement inplace or not inplace - MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT, - DFTI_NOT_INPLACE)); - - // number of transformations - const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); - - // input & output distance - const MKL_LONG idist = in_strides[0]; - const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), - DFTI_OUTPUT_DISTANCE, odist)); - - // input & output stride - std::vector mkl_in_stride(1 + signal_ndim, 0); - std::vector mkl_out_stride(1 + signal_ndim, 0); - for (MKL_LONG i = 1; i <= signal_ndim; i++) { - mkl_in_stride[i] = in_strides[i]; - mkl_out_stride[i] = out_strides[i]; - } - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); - - // conjugate even storage - if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); - } - - MKL_LONG signal_numel = - std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL, - std::multiplies()); - if (normalization != FFTNormMode::none) { - const double scale = - ((normalization == FFTNormMode::by_sqrt_n) - ? 
1.0 / std::sqrt(static_cast(signal_numel)) - : 1.0 / static_cast(signal_numel)); - const auto scale_direction = [&]() { - if (fft_type == FFTTransformType::R2C || - (fft_type == FFTTransformType::C2C && forward)) { - return DFTI_FORWARD_SCALE; - } else { - // (fft_type == FFTTransformType::C2R || - // (fft_type == FFTTransformType::C2C && !forward)) - return DFTI_BACKWARD_SCALE; - } - }(); - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); - } - - // commit the descriptor - MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); - return descriptor; -} - -// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) -template -void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, - const std::vector& axes, FFTNormMode normalization, - bool forward) { - const framework::DDim& in_sizes = x->dims(); - const int ndim = in_sizes.size(); - const int signal_ndim = axes.size(); - const int batch_ndim = ndim - signal_ndim; - const framework::DDim& out_sizes = out->dims(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), 0); - std::vector is_transformed_dim(ndim, false); - for (const auto& d : axes) { - is_transformed_dim[d] = true; - } - const auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), - [&](size_t axis) { return !is_transformed_dim[axis]; }); - std::copy(axes.cbegin(), axes.cend(), batch_end); - - // transpose input according to that permutation - framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute); - std::vector transposed_input_shape_ = - phi::vectorize(transposed_input_shape); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - const auto place = ctx.GetPlace(); - transposed_input.mutable_data(place); - TransCompute(ndim, ctx, *x, &transposed_input, - dim_permute); - - // make an collapsed input: collapse batch axes for input - const int batch_size = std::accumulate( - transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim, - 1L, std::multiplies()); - std::vector collapsed_input_shape_(1 + signal_ndim); - collapsed_input_shape_[0] = batch_size; - std::copy(transposed_input_shape_.begin() + batch_ndim, - transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1); - const framework::DDim collapsed_input_shape = - phi::make_ddim(collapsed_input_shape_); - transposed_input.Resize(collapsed_input_shape); - framework::Tensor& collapsed_input = transposed_input; - - // make a collapsed output - std::vector collapsed_output_shape_(1 + signal_ndim); - collapsed_output_shape_[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - collapsed_output_shape_[1 + i] = out_sizes[axes[i]]; - } - const framework::DDim collapsed_output_shape = - phi::make_ddim(collapsed_output_shape_); - framework::Tensor collapsed_output; - collapsed_output.Resize(collapsed_output_shape); - collapsed_output.mutable_data(place, out->type()); - - // signal sizes - std::vector signal_sizes(1 + signal_ndim); - signal_sizes[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - signal_sizes[1 + i] = - std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); - } - - // input & output stride - const framework::DDim input_stride = phi::stride(collapsed_input_shape); - const framework::DDim output_stride = phi::stride(collapsed_output_shape); - - // make a DFTI_DESCRIPTOR - DftiDescriptor desc = - 
_plan_mkl_fft(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->dtype()), input_stride, - output_stride, signal_sizes, normalization, forward); - - const FFTTransformType fft_type = - GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->type())); - if (fft_type == FFTTransformType::C2R && forward) { - framework::Tensor collapsed_input_conj(collapsed_input.dtype()); - collapsed_input_conj.mutable_data(collapsed_input.dims(), - ctx.GetPlace()); - // conjugate the input - platform::ForRange for_range(ctx, collapsed_input.numel()); - phi::funcs::ConjFunctor functor(collapsed_input.data(), - collapsed_input.numel(), - collapsed_input_conj.data()); - for_range(functor); - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input_conj.data(), collapsed_output.data())); - } else if (fft_type == FFTTransformType::R2C && !forward) { - framework::Tensor collapsed_output_conj(collapsed_output.dtype()); - collapsed_output_conj.mutable_data(collapsed_output.dims(), - ctx.GetPlace()); - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output_conj.data())); - // conjugate the output - platform::ForRange for_range(ctx, collapsed_output.numel()); - phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), - collapsed_output.numel(), - collapsed_output.data()); - for_range(functor); - } else { - if (forward) { - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } else { - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } - } - - // resize for the collapsed output - framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute); - collapsed_output.Resize(transposed_output_shape); - framework::Tensor& transposed_output = collapsed_output; - - // reverse the transposition - std::vector reverse_dim_permute(ndim); - for (int i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - TransCompute(ndim, ctx, transposed_output, - out, reverse_dim_permute); -} -} // anonymous namespace - -template -struct FFTC2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - exec_fft(ctx, x, out, axes, - normalization, forward); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - exec_fft(ctx, x, out, axes, - normalization, forward); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - if (axes.size() > 1) { - const std::vector c2c_dims(axes.begin(), axes.end() - 1); - Tensor temp; - temp.mutable_data(x->dims(), ctx.GetPlace()); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); - - const std::vector new_axes{axes.back()}; - exec_fft(ctx, &temp, out, new_axes, - normalization, forward); - } else { - exec_fft(ctx, x, out, axes, - normalization, forward); - } - } -}; - -#elif defined(PADDLE_WITH_POCKETFFT) - -namespace { -template -T compute_factor(int64_t size, FFTNormMode normalization) { - constexpr auto one = static_cast(1); - switch (normalization) { - case 
FFTNormMode::none: - return one; - case FFTNormMode::by_n: - return one / static_cast(size); - case FFTNormMode::by_sqrt_n: - return one / std::sqrt(static_cast(size)); - } - PADDLE_THROW( - platform::errors::InvalidArgument("Unsupported normalization type")); -} -} // anonymous namespace - -template -struct FFTC2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - using R = typename Ti::value_type; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data, - out_data, factor); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - using R = Ti; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(R); - std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(out_strides.begin(), out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = x->data(); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data, - out_data, factor); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - using R = To; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); 
- { - const int64_t data_size = sizeof(R); - std::transform(out_strides.begin(), out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = out->data(); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= out_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data, - out_data, factor); - } -}; - -#endif - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index b7b6b5302a..b7fb83d9d5 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -8,496 +8,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/conj_op.h" -#include "paddle/fluid/operators/spectral_helper.h" +#include "paddle/fluid/operators/spectral_op.cu.h" #include "paddle/fluid/operators/spectral_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -namespace { - -// Calculates the normalization constant -double fft_normalization_scale(FFTNormMode normalization, - const std::vector& sizes, - const std::vector& dims) { - // auto norm = static_cast(normalization); - if (normalization == FFTNormMode::none) { - return static_cast(1.0); - } - - int64_t signal_numel = 1; - for (auto dim : dims) { - signal_numel *= sizes[dim]; - } - const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) - ? std::sqrt(signal_numel) - : static_cast(signal_numel); - return static_cast(1.0 / scale_denom); -} - -template -void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, - FFTNormMode normalization, - const std::vector& sizes, - const std::vector& axes) { - double scale = fft_normalization_scale(normalization, sizes, axes); - if (scale != 1.0) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto dev = ctx.eigen_device(); - EigenScale::Eval(*dev, eigen_out, eigen_in, - static_cast(scale), - static_cast(0), false); - } else { - framework::TensorCopy(*in, ctx.GetPlace(), out); - } -} - -#if defined(PADDLE_WITH_CUDA) -FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.dtype())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()), - signal_size, fft_type, value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data, - void* out_data, bool forward) { - auto& plan = config.plan(); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( - plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -} - -template -void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, - framework::Tensor* input, framework::Tensor* output, - bool forward) { - // execute transform plan - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor(input->data(), input->numel(), - input_conj.data()); - for_range(functor); - exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor(out_conj.data(), output->numel(), - output->data()); - for_range(functor); - } else { - exec_cufft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#elif defined(PADDLE_WITH_HIP) - -FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.type())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()), - signal_size, fft_type, value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, - void* out_data, bool forward) { - auto& plan = config.plan(); - - auto value_type = config.data_type(); - if (value_type == framework::proto::VarType::FP32) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( - plan, static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( - plan, static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( - plan, static_cast(in_data), - static_cast(out_data))); - return; - } - } - } else if (value_type == framework::proto::VarType::FP64) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( - plan, static_cast(in_data), - static_cast(out_data), - forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( - plan, static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( - plan, static_cast(in_data), - static_cast(out_data))); - return; - } - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); -} - -template -void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, - framework::Tensor* input, framework::Tensor* output, - bool forward) { - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor(input->data(), input->numel(), - input_conj.data()); - for_range(functor); - exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor(out_conj.data(), output->numel(), - output->data()); - for_range(functor); - } else { - exec_hipfft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#endif - -// Execute a general unnormalized fft operation (can be c2c, onesided r2c or -// onesided c2r) -template -void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, - const std::vector& dim, bool forward) { - const auto x_dims = phi::vectorize(X->dims()); - const int64_t ndim = static_cast(X->dims().size()); - auto tensor_place = ctx.GetPlace(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), int{0}); - std::vector is_transformed_dim(ndim); - for (const auto& d : dim) { - is_transformed_dim[d] = true; - } - auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), - [&](int64_t d) { return !is_transformed_dim[d]; }); - std::sort(dim_permute.begin(), batch_end); - std::copy(dim.cbegin(), dim.cend(), batch_end); - - // transpose input according to dim permutation - auto transposed_input_shape = X->dims().transpose(dim_permute); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - transposed_input.mutable_data(tensor_place); - TransCompute(ndim, ctx, *X, &transposed_input, - dim_permute); - - // Reshape batch dimensions into a single dimension - const int64_t signal_ndim = static_cast(dim.size()); - std::vector collapsed_input_shape(signal_ndim + 1); - - auto transposed_input_shape_ = phi::vectorize(transposed_input_shape); - const int64_t batch_dims = ndim - signal_ndim; - auto batch_size = - std::accumulate(transposed_input_shape_.begin(), - transposed_input_shape_.begin() + batch_dims, - static_cast(1), std::multiplies()); - collapsed_input_shape[0] = batch_size; - - std::copy(transposed_input_shape_.begin() + batch_dims, - transposed_input_shape_.end(), collapsed_input_shape.begin() + 1); - - framework::Tensor& collapsed_input = transposed_input; - collapsed_input.Resize(phi::make_ddim(collapsed_input_shape)); - - // make a collpased output 
- const auto out_dims = phi::vectorize(out->dims()); - std::vector collapsed_output_shape(1 + signal_ndim); - collapsed_output_shape[0] = batch_size; - for (size_t i = 0; i < dim.size(); ++i) { - collapsed_output_shape[i + 1] = out_dims[dim[i]]; - } - framework::Tensor collapsed_output; - collapsed_output.Resize(phi::make_ddim(collapsed_output_shape)); - collapsed_output.mutable_data(tensor_place); - - FFTConfig* config = nullptr; - -#if defined(PADDLE_WITH_CUDA) - std::unique_ptr config_ = nullptr; - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - bool using_cache = false; -#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200) - using_cache = true; -#endif - - if (using_cache) { - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - } else { - config_ = std::make_unique(key); - config = config_.get(); - } - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cufftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_cufft_plan(ctx, *config, &collapsed_input, - &collapsed_output, forward); - -#elif defined(PADDLE_WITH_HIP) - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_hipfft_plan(ctx, *config, &collapsed_input, - &collapsed_output, forward); -#endif - - // Inverting output by reshape and transpose to original batch and dimension - auto transposed_out_shape = out->dims().transpose(dim_permute); - - collapsed_output.Resize(transposed_out_shape); - auto& transposed_output = collapsed_output; - - std::vector reverse_dim_permute(ndim); - for (size_t i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - - TransCompute(ndim, ctx, transposed_output, out, - reverse_dim_permute); -} - -} // anonymous namespace - -// Use the optimized path to perform single R2C or C2R if transformation dim is -// supported by cuFFT -bool use_optimized_fft_path(const std::vector& axes) { - // For performance reason, when axes starts with (0, 1), do not use the - // optimized path. 
- if (axes.size() > kMaxFFTNdim || - (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { - return false; - } else { - return true; - } -} - -template -struct FFTC2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - if (axes.empty()) { - framework::TensorCopy(*X, ctx.GetPlace(), out); - return; - } - - framework::Tensor* p_out = out; - std::vector out_dims = phi::vectorize(X->dims()); - std::vector working_axes(axes.begin(), axes.end()); - std::vector first_dims; - size_t max_dims; - framework::Tensor working_tensor; - working_tensor.mutable_data(X->dims(), ctx.GetPlace()); - framework::Tensor* p_working_tensor = &working_tensor; - framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor); - - while (true) { - max_dims = - std::min(static_cast(kMaxFFTNdim), working_axes.size()); - first_dims.assign(working_axes.end() - max_dims, working_axes.end()); - - exec_fft(ctx, p_working_tensor, - p_out, first_dims, forward); - working_axes.resize(working_axes.size() - max_dims); - first_dims.clear(); - - if (working_axes.empty()) { - break; - } - - std::swap(p_out, p_working_tensor); - } - exec_normalization( - ctx, p_out, out, normalization, out_dims, axes); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - std::vector in_dims = phi::vectorize(X->dims()); - std::vector out_dims = phi::vectorize(out->dims()); - - if (use_optimized_fft_path(axes)) { - framework::Tensor x_copy(X->type()); - x_copy.mutable_data(X->dims(), ctx.GetPlace()); - framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); - exec_fft(ctx, &x_copy, out, axes, - forward); - } else { - framework::Tensor temp_tensor; - temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); - const std::vector dims(axes.begin(), axes.end() - 1); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); - - exec_fft(ctx, &temp_tensor, out, - {axes.back()}, forward); - } - exec_normalization( - ctx, out, out, normalization, out_dims, axes); - } -}; - -// n dimension real to complex FFT use cufft lib -template -struct FFTR2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, - Tensor* out, const std::vector& axes, - FFTNormMode normalization, bool forward) { - // Step1: R2C transform on the last dimension - framework::Tensor* r2c_out = out; - const std::vector last_dim{axes.back()}; - std::vector out_dims = phi::vectorize(out->dims()); - exec_fft(ctx, X, r2c_out, last_dim, - forward); - - // Step2: C2C transform on the remaining dimension - framework::Tensor c2c_out; - if (axes.size() > 1) { - c2c_out.mutable_data(out->dims(), ctx.GetPlace()); - std::vector remain_dim(axes.begin(), axes.end() - 1); - FFTC2CFunctor fft_c2c_func; - fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, - forward); - } - - const auto in_sizes = phi::vectorize(X->dims()); - framework::Tensor* norm_tensor = axes.size() > 1 ? 
&c2c_out : r2c_out; - exec_normalization( - ctx, norm_tensor, out, normalization, in_sizes, axes); - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h new file mode 100644 index 0000000000..fdb0e0d284 --- /dev/null +++ b/paddle/fluid/operators/spectral_op.cu.h @@ -0,0 +1,944 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/conj_op.h" +#include "paddle/fluid/operators/spectral_op.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hipfft.h" +#endif + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cufft.h" +#endif + +namespace paddle { +namespace operators { + +using ScalarType = framework::proto::VarType::Type; +const int64_t kMaxFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxFFTNdim + 1; +// This struct is used to easily compute hashes of the +// parameters. It will be the **key** to the plan cache. +struct FFTConfigKey { + // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3 + int64_t signal_ndim_; + // These include additional batch dimension as well. 
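+  // sizes_[0] is the batch size; sizes_[1 .. signal_ndim_] are the signal sizes.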
+ int64_t sizes_[kMaxDataNdim]; + int64_t input_shape_[kMaxDataNdim]; + int64_t output_shape_[kMaxDataNdim]; + FFTTransformType fft_type_; + ScalarType value_type_; + + FFTConfigKey() = default; + + FFTConfigKey(const std::vector& in_shape, + const std::vector& out_shape, + const std::vector& signal_size, + FFTTransformType fft_type, ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_size.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); + std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); + std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); + } +}; + +#if defined(PADDLE_WITH_CUDA) +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + ::cufftHandle handle_; + + public: + CuFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); + } + + CuFFTHandle(const CuFFTHandle& other) = delete; + CuFFTHandle& operator=(const CuFFTHandle& other) = delete; + + CuFFTHandle(CuFFTHandle&& other) = delete; + CuFFTHandle& operator=(CuFFTHandle&& other) = delete; + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); + } +}; + +using plan_size_type = long long int; // NOLINT +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class FFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. + explicit FFTConfig(const FFTConfigKey& plan_key) + : FFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + FFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + cudaDataType itype, otype, exec_type; + const auto complex_input = has_complex_input(fft_type); + const auto complex_output = has_complex_output(fft_type); + if (dtype == framework::proto::VarType::FP32) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == framework::proto::VarType::FP64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == framework::proto::VarType::FP16) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? 
CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "cuFFT only support transforms of type float16, float32 and " + "float64")); + } + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + + ws_size = ws_size_t; + } + + FFTConfig(const FFTConfig& other) = delete; + FFTConfig& operator=(const FFTConfig& other) = delete; + + FFTConfig(FFTConfig&& other) = delete; + FFTConfig& operator=(FFTConfig&& other) = delete; + + const cufftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + CuFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; + +#elif defined(PADDLE_WITH_HIP) +// An RAII encapsulation of cuFFTHandle +class HIPFFTHandle { + ::hipfftHandle handle_; + + public: + HIPFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); + } + + HIPFFTHandle(const HIPFFTHandle& other) = delete; + HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; + + HIPFFTHandle(HIPFFTHandle&& other) = delete; + HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; + + ::hipfftHandle& get() { return handle_; } + const ::hipfftHandle& get() const { return handle_; } + + ~HIPFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + } +}; +using plan_size_type = int; +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class FFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
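+  // Re-expands the sizes stored in the cache key and delegates to the main constructor below.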
+ explicit FFTConfig(const FFTConfigKey& plan_key) + : FFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + FFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + hipfftType exec_type = [&] { + if (dtype == framework::proto::VarType::FP32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (dtype == framework::proto::VarType::FP64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); + }(); + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, + batch, &ws_size_t)); + + ws_size = ws_size_t; + } + + const hipfftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + HIPFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; +#endif + +// Hashing machinery for Key +// Fowler–Noll–Vo hash function +// see +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct KeyHash { + // Key must be a POD because we read out its memory + // contenst as char* when hashing + static_assert(std::is_pod::value, "Key must be plain old data type"); + + size_t operator()(const Key& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (int i = 0; i < static_cast(sizeof(Key)); ++i) { + value ^= ptr[i]; + value *= 0x01000193; + } + return static_cast(value); + } +}; + +template +struct KeyEqual { + // Key must be a POD because we read out its memory + // contenst as char* when comparing + static_assert(std::is_pod::value, "Key must be plain old data type"); + + bool operator()(const Key& a, const Key& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Key)) == 0; + } +}; + +#if CUDA_VERSION < 10000 +// Note that the max plan number for CUDA version < 10 has to be 1023 +// due to a bug that fails on the 1024th plan +constexpr size_t CUFFT_MAX_PLAN_NUM = 
1023; +constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else +constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); +// The default max cache size chosen for CUDA version > 10 is arbitrary. +// This number puts a limit on how big of a plan cache should we maintain by +// default. Users can always configure it via cufft_set_plan_cache_max_size. +constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif +static_assert(CUFFT_MAX_PLAN_NUM >= 0 && + CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && + CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +// This cache assumes that the mapping from key to value never changes. +// This is **NOT** thread-safe. Please use a mutex when using it **AND** the +// value returned from try_emplace_value. +// The contract of using this cache is that try_emplace_value should only be +// used when the max_size is positive. +class FFTConfigCache { + public: + using kv_t = typename std::pair; + using map_t = typename std::unordered_map< + std::reference_wrapper, typename std::list::iterator, + KeyHash, KeyEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } + + FFTConfigCache(const FFTConfigCache& other) = delete; + FFTConfigCache& operator=(const FFTConfigCache& other) = delete; + + FFTConfigCache(FFTConfigCache&& other) noexcept + : _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. 
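+  // The cache is LRU: a hit moves the entry to the front of _usage_list, and a miss evicts the
+  // least recently used plan once max_size is reached before emplacing the new one.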
+ FFTConfig& lookup(FFTConfigKey params) { + PADDLE_ENFORCE_GT(_max_size, 0, + platform::errors::InvalidArgument( + "The max size of FFTConfigCache must be great than 0," + "But received is [%d]", + _max_size)); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + + private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. + PADDLE_ENFORCE_GE( + new_size, 0, + platform::errors::InvalidArgument( + "cuFFT plan cache size must be non-negative, But received is [%d]", + new_size)); + PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM, + platform::errors::InvalidArgument( + "cuFFT plan cache size can not be larger than [%d], " + "But received is [%d]", + CUFFT_MAX_PLAN_NUM, new_size)); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +static std::vector> plan_caches; +static std::mutex plan_caches_mutex; + +static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { + std::lock_guard guard(plan_caches_mutex); + + if (device_index >= plan_caches.size()) { + plan_caches.resize(device_index + 1); + } + + if (!plan_caches[device_index]) { + plan_caches[device_index] = std::make_unique(); + } + + return *plan_caches[device_index]; +} + +// Calculates the normalization constant +static double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); + } + + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? 
std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); +} + +template +void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto dev = ctx.eigen_device(); + EigenScale::Eval(*dev, eigen_out, eigen_in, + static_cast(scale), + static_cast(0), false); + } else { + framework::TensorCopy(*in, ctx.GetPlace(), out); + } +} + +#if defined(PADDLE_WITH_CUDA) +static FFTConfigKey create_fft_configkey(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = + framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) + ? framework::ToRealType(framework::TransToProtoVarType(input.dtype())) + : framework::TransToProtoVarType(input.dtype()); + auto fft_type = + GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), + framework::TransToProtoVarType(output.dtype())); + // signal sizes + std::vector signal_size(signal_ndim + 1); + + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()), + signal_size, fft_type, value_type); + return key; +} + +// Execute a pre-planned transform +static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data, + void* out_data, bool forward) { + auto& plan = config.plan(); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( + plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); +} + +template +void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + // execute transform plan + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + phi::funcs::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); + + platform::ForRange for_range(ctx, output->numel()); + phi::funcs::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_cufft_plan_raw(config, input->data(), output->data(), forward); + } +} + +#elif defined(PADDLE_WITH_HIP) + +static FFTConfigKey create_fft_configkey(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = + framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) + ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) + : framework::TransToProtoVarType(input.dtype()); + auto fft_type = + GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), + framework::TransToProtoVarType(output.type())); + // signal sizes + std::vector signal_size(signal_ndim + 1); + + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()), + signal_size, fft_type, value_type); + return key; +} + +// Execute a pre-planned transform +static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, + void* out_data, bool forward) { + auto& plan = config.plan(); + + auto value_type = config.data_type(); + if (value_type == framework::proto::VarType::FP32) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( + plan, static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( + plan, static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( + plan, static_cast(in_data), + static_cast(out_data))); + return; + } + } + } else if (value_type == framework::proto::VarType::FP64) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( + plan, static_cast(in_data), + static_cast(out_data), + forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( + plan, static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( + plan, static_cast(in_data), + static_cast(out_data))); + return; + } + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); +} + +template +void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + phi::funcs::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); + + platform::ForRange for_range(ctx, output->numel()); + phi::funcs::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_hipfft_plan_raw(config, input->data(), output->data(), forward); + } +} + +#endif + +// Execute a general unnormalized fft operation (can be c2c, onesided r2c or +// onesided c2r) +template +void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, + const std::vector& dim, bool forward) { + const auto x_dims = phi::vectorize(X->dims()); + const int64_t ndim = static_cast(X->dims().size()); + auto tensor_place = ctx.GetPlace(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), int{0}); + std::vector is_transformed_dim(ndim); + for (const auto& d : dim) { + is_transformed_dim[d] = true; + } + auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), + [&](int64_t d) { return !is_transformed_dim[d]; }); + std::sort(dim_permute.begin(), batch_end); + std::copy(dim.cbegin(), dim.cend(), batch_end); + + // transpose input according to dim permutation + auto transposed_input_shape = X->dims().transpose(dim_permute); + framework::Tensor transposed_input; + transposed_input.Resize(transposed_input_shape); + transposed_input.mutable_data(tensor_place); + TransCompute(ndim, ctx, *X, &transposed_input, + dim_permute); + + // Reshape batch dimensions into a single dimension + const int64_t signal_ndim = static_cast(dim.size()); + std::vector collapsed_input_shape(signal_ndim + 1); + + auto transposed_input_shape_ = phi::vectorize(transposed_input_shape); + const int64_t batch_dims = ndim - signal_ndim; + auto batch_size = + std::accumulate(transposed_input_shape_.begin(), + transposed_input_shape_.begin() + batch_dims, + static_cast(1), std::multiplies()); + collapsed_input_shape[0] = batch_size; + + std::copy(transposed_input_shape_.begin() + batch_dims, + transposed_input_shape_.end(), collapsed_input_shape.begin() + 1); + + framework::Tensor& collapsed_input = transposed_input; + collapsed_input.Resize(phi::make_ddim(collapsed_input_shape)); + + // make a collpased output 
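+  // It keeps the flattened batch dimension; every transformed axis takes its size from the
+  // (possibly one-sided) output tensor.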
+ const auto out_dims = phi::vectorize(out->dims()); + std::vector collapsed_output_shape(1 + signal_ndim); + collapsed_output_shape[0] = batch_size; + for (size_t i = 0; i < dim.size(); ++i) { + collapsed_output_shape[i + 1] = out_dims[dim[i]]; + } + framework::Tensor collapsed_output; + collapsed_output.Resize(phi::make_ddim(collapsed_output_shape)); + collapsed_output.mutable_data(tensor_place); + + FFTConfig* config = nullptr; + +#if defined(PADDLE_WITH_CUDA) + std::unique_ptr config_ = nullptr; + // create plan + FFTConfigKey key = + create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); + bool using_cache = false; +#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200) + using_cache = true; +#endif + + if (using_cache) { + const int64_t device_id = static_cast( + reinterpret_cast(&collapsed_input.place()) + ->GetDeviceId()); + FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); + std::unique_lock guard(plan_cache.mutex, std::defer_lock); + guard.lock(); + config = &(plan_cache.lookup(key)); + } else { + config_ = std::make_unique(key); + config = config_.get(); + } + + // prepare cufft for execution + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cufftSetStream(config->plan(), ctx.stream())); + framework::Tensor workspace_tensor; + workspace_tensor.mutable_data(tensor_place, config->workspace_size()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( + config->plan(), workspace_tensor.data())); + // execute transform plan + exec_cufft_plan(ctx, *config, &collapsed_input, + &collapsed_output, forward); + +#elif defined(PADDLE_WITH_HIP) + // create plan + FFTConfigKey key = + create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); + const int64_t device_id = static_cast( + reinterpret_cast(&collapsed_input.place()) + ->GetDeviceId()); + FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); + std::unique_lock guard(plan_cache.mutex, std::defer_lock); + guard.lock(); + config = &(plan_cache.lookup(key)); + + // prepare cufft for execution + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); + framework::Tensor workspace_tensor; + workspace_tensor.mutable_data(tensor_place, config->workspace_size()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( + config->plan(), workspace_tensor.data())); + // execute transform plan + exec_hipfft_plan(ctx, *config, &collapsed_input, + &collapsed_output, forward); +#endif + + // Inverting output by reshape and transpose to original batch and dimension + auto transposed_out_shape = out->dims().transpose(dim_permute); + + collapsed_output.Resize(transposed_out_shape); + auto& transposed_output = collapsed_output; + + std::vector reverse_dim_permute(ndim); + for (size_t i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + } + + TransCompute(ndim, ctx, transposed_output, out, + reverse_dim_permute); +} + +// Use the optimized path to perform single R2C or C2R if transformation dim is +// supported by cuFFT +static bool use_optimized_fft_path(const std::vector& axes) { + // For performance reason, when axes starts with (0, 1), do not use the + // optimized path. 
+ if (axes.size() > kMaxFFTNdim || + (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { + return false; + } else { + return true; + } +} + +template +struct FFTC2CFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + if (axes.empty()) { + framework::TensorCopy(*X, ctx.GetPlace(), out); + return; + } + + framework::Tensor* p_out = out; + std::vector out_dims = phi::vectorize(X->dims()); + std::vector working_axes(axes.begin(), axes.end()); + std::vector first_dims; + size_t max_dims; + framework::Tensor working_tensor; + working_tensor.mutable_data(X->dims(), ctx.GetPlace()); + framework::Tensor* p_working_tensor = &working_tensor; + framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor); + + while (true) { + max_dims = + std::min(static_cast(kMaxFFTNdim), working_axes.size()); + first_dims.assign(working_axes.end() - max_dims, working_axes.end()); + + exec_fft(ctx, p_working_tensor, + p_out, first_dims, forward); + working_axes.resize(working_axes.size() - max_dims); + first_dims.clear(); + + if (working_axes.empty()) { + break; + } + + std::swap(p_out, p_working_tensor); + } + exec_normalization( + ctx, p_out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + std::vector in_dims = phi::vectorize(X->dims()); + std::vector out_dims = phi::vectorize(out->dims()); + + if (use_optimized_fft_path(axes)) { + framework::Tensor x_copy(X->type()); + x_copy.mutable_data(X->dims(), ctx.GetPlace()); + framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); + exec_fft(ctx, &x_copy, out, axes, + forward); + } else { + framework::Tensor temp_tensor; + temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); + const std::vector dims(axes.begin(), axes.end() - 1); + + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); + + exec_fft(ctx, &temp_tensor, out, + {axes.back()}, forward); + } + exec_normalization( + ctx, out, out, normalization, out_dims, axes); + } +}; + +// n dimension real to complex FFT use cufft lib +template +struct FFTR2CFunctor { + void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X, + Tensor* out, const std::vector& axes, + FFTNormMode normalization, bool forward) { + // Step1: R2C transform on the last dimension + framework::Tensor* r2c_out = out; + const std::vector last_dim{axes.back()}; + std::vector out_dims = phi::vectorize(out->dims()); + exec_fft(ctx, X, r2c_out, last_dim, + forward); + + // Step2: C2C transform on the remaining dimension + framework::Tensor c2c_out; + if (axes.size() > 1) { + c2c_out.mutable_data(out->dims(), ctx.GetPlace()); + std::vector remain_dim(axes.begin(), axes.end() - 1); + FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, + forward); + } + + const auto in_sizes = phi::vectorize(X->dims()); + framework::Tensor* norm_tensor = axes.size() > 1 ? 
&c2c_out : r2c_out; + exec_normalization( + ctx, norm_tensor, out, normalization, in_sizes, axes); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index a60ec5a4df..71b54caf5e 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -11,8 +11,11 @@ #pragma once #define NOMINMAX // to use std::min std::max correctly on windows +#include +#include #include #include +#include #include #include #include "paddle/fluid/framework/convert_utils.h" @@ -23,8 +26,10 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/padding.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "thrust/device_vector.h" diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc new file mode 100644 index 0000000000..ecbd9edd87 --- /dev/null +++ b/paddle/fluid/operators/stft_op.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
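+
+// StftOp splits each input waveform of shape (N, T) into overlapping frames of length n_fft
+// (advanced by hop_length samples) and runs a real-to-complex FFT on every frame; see
+// StftKernel/StftGradKernel in stft_op.h for the kernel implementations.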
+ +#include "paddle/fluid/operators/stft_op.h" +#include "paddle/fluid/operators/spectral_helper.h" + +namespace paddle { +namespace operators { +class StftOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "frame"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "frame"); + + const int n_fft = ctx->Attrs().Get("n_fft"); + const int hop_length = ctx->Attrs().Get("hop_length"); + + const auto x_dims = ctx->GetInputDim("X"); + const int x_rank = x_dims.size(); + const bool onesided = ctx->Attrs().Get("onesided"); + + PADDLE_ENFORCE_EQ( + x_rank, 2, + platform::errors::InvalidArgument( + "Input(X) of StftOp should be a tensor with shape [N, T], " + "but got rank %s.", + x_rank)); + PADDLE_ENFORCE_GT( + hop_length, 0, + platform::errors::InvalidArgument( + "Attribute(hop_length) should be greater than 0, but got %s.", + hop_length)); + + int seq_length = x_dims[x_rank - 1]; + int n_frames = 1 + (seq_length - n_fft) / hop_length; + + PADDLE_ENFORCE_LE(n_fft, seq_length, + platform::errors::InvalidArgument( + "Attribute(frame_length) should be less equal than " + "sequence length, but got (%s) > (%s).", + n_fft, seq_length)); + + std::vector output_shape; + output_shape.push_back(x_dims[0]); + if (onesided) { + output_shape.push_back(n_fft / 2 + 1); + } else { + output_shape.push_back(n_fft); + } + output_shape.push_back(n_frames); + + ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(in_dtype, ctx.GetPlace()); + } +}; + +class StftOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input waveforms with shape (N, T)"); + AddOutput("Out", + "The complex STFT output tensor with shape (N, n_fft, " + "num_frames) or (N, n_fft/2 + 1, num_frames)"); + AddAttr("n_fft", "The number of input samples to perform FFT"); + AddAttr("hop_length", "Number of samples between adjacent frames"); + AddAttr("normalized", + "Control whether to scale the output by 1/sqrt(n_fft)"); + AddAttr("onesided", + "Control whether to return half of the FFT output"); + AddComment(R"DOC( + Short-time Fourier transform (STFT). 
+ )DOC"); + } +}; + +template +class StftGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("stft_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class StftGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + const auto out_grad_name = framework::GradVarName("Out"); + OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, + "stft_grad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "stft_grad"); + + const auto x_grad_name = framework::GradVarName("X"); + OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, + "stft_grad"); + + ctx->ShareDim("X", /*->*/ x_grad_name); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + const auto in_dtype = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + const auto kernel_dtype = framework::ToRealType(in_dtype); + return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(stft, ops::StftOp, ops::StftOpMaker, + ops::StftGradOpMaker, + ops::StftGradOpMaker); + +REGISTER_OPERATOR(stft_grad, ops::StftGradOp); + +REGISTER_OP_CPU_KERNEL( + stft, ops::StftKernel, + ops::StftKernel); + +REGISTER_OP_CPU_KERNEL( + stft_grad, ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/stft_op.cu b/paddle/fluid/operators/stft_op.cu new file mode 100644 index 0000000000..5272be29c0 --- /dev/null +++ b/paddle/fluid/operators/stft_op.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/spectral_op.cu.h" +#include "paddle/fluid/operators/stft_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + stft, ops::StftKernel, + ops::StftKernel); + +REGISTER_OP_CUDA_KERNEL( + stft_grad, ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h new file mode 100644 index 0000000000..4f0746ee14 --- /dev/null +++ b/paddle/fluid/operators/stft_op.h @@ -0,0 +1,157 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/frame_op.h" +#include "paddle/fluid/operators/spectral_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class StftKernel : public framework::OpKernel { + public: + /* + Batch Signals (N, T) -> Frames (N, n_fft, num_frames) -> FFTR2C -> (N, + n_fft/2 + 1, num_frames) or (N, n_fft, num_frames) + */ + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + const Tensor* x = ctx.Input("X"); + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const size_t x_rank = x->dims().size(); + const size_t out_rank = out->dims().size(); + + const int n_fft = ctx.Attr("n_fft"); + const int hop_length = ctx.Attr("hop_length"); + const bool normalized = ctx.Attr("normalized"); + const bool onesided = ctx.Attr("onesided"); + + const int n_frames = out->dims()[out_rank - 1]; + const int seq_length = x->dims()[x_rank - 1]; + + auto& dev_ctx = ctx.device_context(); + + std::vector axes = {1}; + + // Frame + Tensor frames; + framework::DDim frames_dims(out->dims()); + frames_dims.at(axes.back()) = n_fft; + frames.mutable_data(frames_dims, ctx.GetPlace()); + FrameFunctor()(dev_ctx, x, &frames, seq_length, n_fft, + n_frames, hop_length, /*is_grad*/ false); + + // FFTR2C + FFTNormMode normalization; + if (normalized) { + normalization = get_norm_from_string("ortho", true); + } else { + normalization = get_norm_from_string("backward", true); + } + FFTR2CFunctor fft_r2c_func; + + if (onesided) { + fft_r2c_func(dev_ctx, &frames, out, axes, normalization, true); + } else { + framework::DDim onesided_dims(out->dims()); + const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1; + onesided_dims.at(axes.back()) = onesided_axis_size; + Tensor onesided_out; + onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); + fft_r2c_func(dev_ctx, &frames, &onesided_out, axes, normalization, true); + fill_conj(dev_ctx, &onesided_out, out, axes); + } + } +}; + +template +class StftGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using C = paddle::platform::complex; + auto& dev_ctx = ctx.device_context(); + + const auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + const size_t dy_rank = dy->dims().size(); + const size_t dx_rank = dx->dims().size(); + + const int n_fft = ctx.Attr("n_fft"); + const int hop_length = ctx.Attr("hop_length"); + const bool normalized = ctx.Attr("normalized"); + const bool onesided = ctx.Attr("onesided"); + const int n_frames = dy->dims()[dy_rank - 1]; + const int seq_length = dx->dims()[dx_rank - 1]; + + std::vector axes = {1}; + Tensor d_frames; + framework::DDim d_frames_dims(dy->dims()); + d_frames_dims.at(axes.back()) = n_fft; + d_frames.mutable_data(d_frames_dims, ctx.GetPlace()); + 
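+    // Frame gradients are computed in the complex domain first and cast back to real
+    // before being overlap-added into dx.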
+ Tensor complex_d_frames; + complex_d_frames.mutable_data(d_frames_dims, ctx.GetPlace()); + + // dy -> d_frames + FFTNormMode normalization; + if (normalized) { + normalization = get_norm_from_string("ortho", true); + } else { + normalization = get_norm_from_string("backward", true); + } + FFTC2CFunctor fft_c2c_func; + + if (!onesided) { + fft_c2c_func(dev_ctx, dy, &complex_d_frames, axes, normalization, false); + } else { + Tensor full_dy; + full_dy.mutable_data(d_frames_dims, ctx.GetPlace()); + auto zero_length = static_cast(full_dy.dims().at(axes.back()) - + dy->dims().at(axes.back())); + auto rank = dy->dims().size(); + + std::vector pads(rank * 2, 0); + pads[axes.back() * 2 + 1] = zero_length; + + phi::funcs::PaddingFunctor( + rank, ctx.template device_context(), pads, + static_cast(0), *dy, &full_dy); + fft_c2c_func(dev_ctx, &full_dy, &complex_d_frames, axes, normalization, + false); + } + framework::TransComplexToReal( + framework::TransToProtoVarType(d_frames.dtype()), + framework::TransToProtoVarType(complex_d_frames.dtype()), + complex_d_frames, &d_frames); + + // d_frames -> dx + FrameFunctor()(dev_ctx, &d_frames, dx, seq_length, n_fft, + n_frames, hop_length, /*is_grad*/ true); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/phi/kernels/cpu/pad3d_kernel.cc b/paddle/phi/kernels/cpu/pad3d_kernel.cc index 0dc01f485f..68bd921683 100644 --- a/paddle/phi/kernels/cpu/pad3d_kernel.cc +++ b/paddle/phi/kernels/cpu/pad3d_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/pad3d_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -574,5 +575,13 @@ void Pad3dKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - pad3d, CPU, ALL_LAYOUT, phi::Pad3dKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(pad3d, + CPU, + ALL_LAYOUT, + phi::Pad3dKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu index 2cef77cc0e..8f7cf716e7 100644 --- a/paddle/phi/kernels/gpu/pad3d_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -19,6 +19,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -585,4 +586,6 @@ PD_REGISTER_KERNEL(pad3d, float, double, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 676ee3e3c7..148f4d95c6 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -266,9 +266,10 @@ def generate_activation_fn(op_type): op_type) else: # abs exp square ops support dtype(int32, int64, float16, float32, float64) - check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], - op_type) + check_variable_and_dtype(x, 'x', [ + 'int32', 'int64', 'float16', 'float32', 'float64', 'complex64', + 'complex128' + ], op_type) helper = LayerHelper(op_type, **locals()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d836db5bb9..c72d51af8a 100755 --- a/python/paddle/fluid/layers/nn.py +++ 
b/python/paddle/fluid/layers/nn.py @@ -5616,9 +5616,10 @@ def transpose(x, perm, name=None): out, _ = _C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'transpose') + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'transpose') check_type(perm, 'perm', (list, tuple), 'transpose') if isinstance(perm, tuple): perm = list(perm) @@ -6410,10 +6411,10 @@ def squeeze(input, axes, name=None): return out helper = LayerHelper("squeeze", **locals()) - check_variable_and_dtype( - input, 'input', - ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'], - 'squeeze') + check_variable_and_dtype(input, 'input', [ + 'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64', + 'complex64', 'complex128' + ], 'squeeze') check_type(axes, 'axis/axes', (list, tuple), 'squeeze') out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -6471,8 +6472,16 @@ def unsqueeze(input, axes, name=None): check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') check_variable_and_dtype(input, 'input', [ - 'float16', 'float32', 'float64', 'bool', 'int8', 'int16', 'int32', - 'int64' + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'complex64', + 'complex128', ], 'unsqueeze') helper = LayerHelper("unsqueeze2", **locals()) inputs = {"X": input} diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index c63ad42288..683bf2bc81 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -756,7 +756,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): check_shape(shape) check_dtype(dtype, 'dtype', [ 'bool', 'float16', 'float32', 'float64', 'uint8', 'int16', 'int32', - 'int64' + 'int64', 'complex64', 'complex128' ], 'fill_constant') check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant') diff --git a/python/paddle/fluid/tests/unittests/test_stft_op.py b/python/paddle/fluid/tests/unittests/test_stft_op.py new file mode 100644 index 0000000000..64b8084a16 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_stft_op.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
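+# The tests below compare the stft op with a NumPy reference: the signal is
+# sliced into overlapping frames through an as_strided view (librosa-style
+# framing), and np.fft.rfft is taken along the frame axis to form the
+# expected one-sided spectrum.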
+ +import numpy as np +from numpy.lib.stride_tricks import as_strided +import paddle +import unittest + +from op_test import OpTest + + +def frame_from_librosa(x, frame_length, hop_length, axis=-1): + if axis == -1 and not x.flags["C_CONTIGUOUS"]: + x = np.ascontiguousarray(x) + elif axis == 0 and not x.flags["F_CONTIGUOUS"]: + x = np.asfortranarray(x) + + n_frames = 1 + (x.shape[axis] - frame_length) // hop_length + strides = np.asarray(x.strides) + + if axis == -1: + shape = list(x.shape)[:-1] + [frame_length, n_frames] + strides = list(strides) + [hop_length * x.itemsize] + + elif axis == 0: + shape = [n_frames, frame_length] + list(x.shape)[1:] + strides = [hop_length * x.itemsize] + list(strides) + + else: + raise ValueError("Frame axis={} must be either 0 or -1".format(axis)) + + return as_strided(x, shape=shape, strides=strides) + + +def stft_np(x, n_fft, hop_length, **kwargs): + frames = frame_from_librosa(x, n_fft, hop_length) + res = np.fft.rfft(frames, axis=1) + return res + + +class TestStftOp(OpTest): + def setUp(self): + self.op_type = "stft" + self.shape, self.type, self.attrs = self.initTestCase() + self.inputs = { + 'X': np.random.random(size=self.shape).astype(self.type), + } + self.outputs = {'Out': stft_np(x=self.inputs['X'], **self.attrs)} + + def initTestCase(self): + input_shape = (2, 100) + input_type = 'float64' + attrs = { + 'n_fft': 50, + 'hop_length': 15, + 'normalized': False, + 'onesided': True, + } + return input_shape, input_type, attrs + + def test_check_output(self): + paddle.enable_static() + self.check_output() + paddle.disable_static() + + def test_check_grad_normal(self): + paddle.enable_static() + self.check_grad(['X'], 'Out') + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/signal.py b/python/paddle/signal.py index cd8ba2b58a..f5b225bc6d 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -119,10 +119,11 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): f'Unexpected hop_length: {hop_length}. It should be an positive integer.' ) - if frame_length > x.shape[axis]: - raise ValueError( - f'Attribute frame_length should be less equal than sequence length, ' - f'but got ({frame_length}) > ({x.shape[axis]}).') + if in_dygraph_mode(): + if frame_length > x.shape[axis]: + raise ValueError( + f'Attribute frame_length should be less equal than sequence length, ' + f'but got ({frame_length}) > ({x.shape[axis]}).') op_type = 'frame' @@ -306,8 +307,7 @@ def stft(x, y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372] """ check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'complex64', 'complex128'], - 'stft') + x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft') x_rank = len(x.shape) assert x_rank in [1, 2], \ @@ -325,8 +325,9 @@ def stft(x, if win_length is None: win_length = n_fft - assert 0 < n_fft <= x.shape[-1], \ - f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' + if in_dygraph_mode(): + assert 0 < n_fft <= x.shape[-1], \ + f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' assert 0 < win_length <= n_fft, \ f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' 
@@ -359,7 +360,7 @@ def stft(x, x_frames = x_frames.transpose( perm=[0, 2, 1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) - x_frames = x_frames * window + x_frames = paddle.multiply(x_frames, window) norm = 'ortho' if normalized else 'backward' if is_complex(x_frames): @@ -495,18 +496,22 @@ def istft(x, n_frames = x.shape[-1] fft_size = x.shape[-2] - if onesided: - assert (fft_size == n_fft // 2 + 1), \ - 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size) - else: - assert (fft_size == n_fft), \ - 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size) + if in_dygraph_mode(): + if onesided: + assert (fft_size == n_fft // 2 + 1), \ + 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size) + else: + assert (fft_size == n_fft), \ + 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size) if window is not None: assert len(window.shape) == 1 and len(window) == win_length, \ 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape) else: - window = paddle.ones(shape=(win_length, )) + window_dtype = paddle.float32 if x.dtype in [ + paddle.float32, paddle.complex64 + ] else paddle.float64 + window = paddle.ones(shape=(win_length, ), dtype=window_dtype) if win_length < n_fft: pad_left = (n_fft - win_length) // 2 @@ -534,15 +539,15 @@ def istft(x, x = x[:, :, :n_fft // 2 + 1] out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) + out = paddle.multiply(out, window).transpose( + perm=[0, 2, 1]) # (batch, n_fft, num_frames) out = overlap_add( - x=(out * window).transpose( - perm=[0, 2, 1]), # (batch, n_fft, num_frames) - hop_length=hop_length, - axis=-1) # (batch, seq_length) + x=out, hop_length=hop_length, axis=-1) # (batch, seq_length) window_envelop = overlap_add( x=paddle.tile( - x=window * window, repeat_times=[n_frames, 1]).transpose( + x=paddle.multiply(window, window).unsqueeze(0), + repeat_times=[n_frames, 1]).transpose( perm=[1, 0]), # (n_fft, num_frames) hop_length=hop_length, axis=-1) # (seq_length, ) @@ -561,7 +566,7 @@ def istft(x, window_envelop = window_envelop[start:start + length] # Check whether the Nonzero Overlap Add (NOLA) constraint is met. - if window_envelop.abs().min().item() < 1e-11: + if in_dygraph_mode() and window_envelop.abs().min().item() < 1e-11: raise ValueError( 'Abort istft because Nonzero Overlap Add (NOLA) condition failed. For more information about NOLA constraint please see `scipy.signal.check_NOLA`(https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.check_NOLA.html).' ) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 1a0e636124..6c82539ec6 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -147,7 +147,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): var_names = {'x': x, 'y': y} for name, val in var_names.items(): check_variable_and_dtype( - val, name, ['float16', 'float32', 'float64'], 'matmul') + val, name, + ['float16', 'float32', 'float64', 'complex64', 'complex128'], + 'matmul') __check_input(x, y) -- GitLab
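A minimal usage sketch of the paddle.signal.stft/istft entry points touched above, assuming dygraph (eager) mode; the batch size, signal length, n_fft and hop_length below are illustrative assumptions, not values taken from this patch:

import paddle

# Real-valued input: a one-sided complex spectrum is returned by default.
x = paddle.randn([8, 48000], dtype='float64')
y = paddle.signal.stft(x, n_fft=512, hop_length=160)  # complex128, shape [8, 257, num_frames]

# Inverse transform with matching parameters recovers the time-domain signal.
x_rec = paddle.signal.istft(y, n_fft=512, hop_length=160)

# Complex input is also accepted; onesided must then be False so the full
# spectrum of size n_fft is produced.
xc = paddle.complex(x, paddle.zeros_like(x))
yc = paddle.signal.stft(xc, n_fft=512, hop_length=160, onesided=False)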