diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 893f7d51140a70550b18ac440ff81266c47e0f39..a25c54bae2a436415bc23d5e661f200f21280e7f 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -104,7 +104,7 @@ endif()
 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta)
 
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
-    recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
+    recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
 target_link_libraries(run_program_op cuda_graph_with_memory_pool)
@@ -129,22 +129,6 @@ else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
 
-if (WITH_GPU OR WITH_ROCM)
-  if (MKL_FOUND AND WITH_ONEMKL)
-    op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS})
-    target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
-  else()
-    op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS})
-  endif()
-else()
-  if (MKL_FOUND AND WITH_ONEMKL)
-    op_library(spectral_op SRCS spectral_op.cc DEPS dynload_mklrt ${OP_HEADER_DEPS})
-    target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE})
-  else()
-    op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS})
-  endif()
-endif()
-
 if (WITH_ASCEND_CL)
   op_library(sync_batch_norm_op)
 endif()
diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h
deleted file mode 100644
index f69573e18927e1623e3e80e0167e104b3be176b4..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/spectral_helper.h
+++ /dev/null
@@ -1,545 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#pragma once - -#include "paddle/fluid/operators/spectral_op.h" - -#if defined(PADDLE_WITH_ONEMKL) -#include "paddle/phi/backends/dynload/mklrt.h" -#elif defined(PADDLE_WITH_POCKETFFT) -#include "extern_pocketfft/pocketfft_hdronly.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// FFT Functors -#if defined(PADDLE_WITH_ONEMKL) - -#define MKL_DFTI_CHECK(expr) \ - do { \ - MKL_LONG status = (expr); \ - if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW( \ - platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ - } while (0); - -struct DftiDescriptorDeleter { - void operator()(DFTI_DESCRIPTOR_HANDLE handle) { - if (handle != nullptr) { - MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); - } - } -}; - -// A RAII wrapper for MKL_DESCRIPTOR* -class DftiDescriptor { - public: - void init(DFTI_CONFIG_VALUE precision, - DFTI_CONFIG_VALUE signal_type, - MKL_LONG signal_ndim, - MKL_LONG* sizes) { - PADDLE_ENFORCE_EQ(desc_.get(), - nullptr, - platform::errors::AlreadyExists( - "DftiDescriptor has already been initialized.")); - - DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( - &raw_desc, precision, signal_type, signal_ndim, sizes)); - desc_.reset(raw_desc); - } - - DFTI_DESCRIPTOR* get() const { - DFTI_DESCRIPTOR* raw_desc = desc_.get(); - PADDLE_ENFORCE_NOT_NULL(raw_desc, - platform::errors::PreconditionNotMet( - "DFTI DESCRIPTOR has not been initialized.")); - return raw_desc; - } - - private: - std::unique_ptr desc_; -}; - -static DftiDescriptor _plan_mkl_fft( - const framework::proto::VarType::Type& in_dtype, - const framework::proto::VarType::Type& out_dtype, - const framework::DDim& in_strides, - const framework::DDim& out_strides, - const std::vector& signal_sizes, - FFTNormMode normalization, - bool forward) { - const DFTI_CONFIG_VALUE precision = [&] { - switch (in_dtype) { - case framework::proto::VarType::FP32: - return DFTI_SINGLE; - case framework::proto::VarType::COMPLEX64: - return DFTI_SINGLE; - case framework::proto::VarType::FP64: - return DFTI_DOUBLE; - case framework::proto::VarType::COMPLEX128: - return DFTI_DOUBLE; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input datatype (%s), input data type should be FP32, " - "FP64, COMPLEX64 or COMPLEX128.", - framework::DataTypeToString(in_dtype))); - } - }(); - - // C2C, R2C, C2R - const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); - const DFTI_CONFIG_VALUE domain = - (fft_type == FFTTransformType::C2C) ? 
DFTI_COMPLEX : DFTI_REAL; - - DftiDescriptor descriptor; - std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); - const MKL_LONG signal_ndim = fft_sizes.size() - 1; - descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); - - // placement inplace or not inplace - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); - - // number of transformations - const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); - - // input & output distance - const MKL_LONG idist = in_strides[0]; - const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); - - // input & output stride - std::vector mkl_in_stride(1 + signal_ndim, 0); - std::vector mkl_out_stride(1 + signal_ndim, 0); - for (MKL_LONG i = 1; i <= signal_ndim; i++) { - mkl_in_stride[i] = in_strides[i]; - mkl_out_stride[i] = out_strides[i]; - } - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); - - // conjugate even storage - if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); - } - - MKL_LONG signal_numel = std::accumulate(fft_sizes.cbegin() + 1, - fft_sizes.cend(), - 1UL, - std::multiplies()); - if (normalization != FFTNormMode::none) { - const double scale = - ((normalization == FFTNormMode::by_sqrt_n) - ? 
1.0 / std::sqrt(static_cast(signal_numel)) - : 1.0 / static_cast(signal_numel)); - const auto scale_direction = [&]() { - if (fft_type == FFTTransformType::R2C || - (fft_type == FFTTransformType::C2C && forward)) { - return DFTI_FORWARD_SCALE; - } else { - // (fft_type == FFTTransformType::C2R || - // (fft_type == FFTTransformType::C2C && !forward)) - return DFTI_BACKWARD_SCALE; - } - }(); - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); - } - - // commit the descriptor - MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); - return descriptor; -} - -// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) -template -void exec_fft(const DeviceContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - const framework::DDim& in_sizes = x->dims(); - const int ndim = in_sizes.size(); - const int signal_ndim = axes.size(); - const int batch_ndim = ndim - signal_ndim; - const framework::DDim& out_sizes = out->dims(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), 0); - std::vector is_transformed_dim(ndim, false); - for (const auto& d : axes) { - is_transformed_dim[d] = true; - } - const auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { - return !is_transformed_dim[axis]; - }); - std::copy(axes.cbegin(), axes.cend(), batch_end); - - // transpose input according to that permutation - framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute); - std::vector transposed_input_shape_ = - phi::vectorize(transposed_input_shape); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - const auto place = ctx.GetPlace(); - transposed_input.mutable_data(place); - TransCompute( - ndim, ctx, *x, &transposed_input, dim_permute); - - // make an collapsed input: collapse batch axes for input - const int batch_size = - std::accumulate(transposed_input_shape.Get(), - transposed_input_shape.Get() + batch_ndim, - 1L, - std::multiplies()); - std::vector collapsed_input_shape_(1 + signal_ndim); - collapsed_input_shape_[0] = batch_size; - std::copy(transposed_input_shape_.begin() + batch_ndim, - transposed_input_shape_.end(), - collapsed_input_shape_.begin() + 1); - const framework::DDim collapsed_input_shape = - phi::make_ddim(collapsed_input_shape_); - transposed_input.Resize(collapsed_input_shape); - framework::Tensor& collapsed_input = transposed_input; - - // make a collapsed output - std::vector collapsed_output_shape_(1 + signal_ndim); - collapsed_output_shape_[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - collapsed_output_shape_[1 + i] = out_sizes[axes[i]]; - } - const framework::DDim collapsed_output_shape = - phi::make_ddim(collapsed_output_shape_); - framework::Tensor collapsed_output; - collapsed_output.Resize(collapsed_output_shape); - collapsed_output.mutable_data(place, out->type()); - - // signal sizes - std::vector signal_sizes(1 + signal_ndim); - signal_sizes[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - signal_sizes[1 + i] = - std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); - } - - // input & output stride - const framework::DDim input_stride = phi::stride(collapsed_input_shape); - const framework::DDim output_stride = phi::stride(collapsed_output_shape); - - // make a DFTI_DESCRIPTOR - DftiDescriptor desc = - 
_plan_mkl_fft(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->dtype()), - input_stride, - output_stride, - signal_sizes, - normalization, - forward); - - const FFTTransformType fft_type = - GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->type())); - if (fft_type == FFTTransformType::C2R && forward) { - framework::Tensor collapsed_input_conj(collapsed_input.dtype()); - collapsed_input_conj.mutable_data(collapsed_input.dims(), - ctx.GetPlace()); - // conjugate the input - platform::ForRange for_range(ctx, collapsed_input.numel()); - phi::funcs::ConjFunctor functor(collapsed_input.data(), - collapsed_input.numel(), - collapsed_input_conj.data()); - for_range(functor); - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input_conj.data(), collapsed_output.data())); - } else if (fft_type == FFTTransformType::R2C && !forward) { - framework::Tensor collapsed_output_conj(collapsed_output.dtype()); - collapsed_output_conj.mutable_data(collapsed_output.dims(), - ctx.GetPlace()); - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output_conj.data())); - // conjugate the output - platform::ForRange for_range(ctx, collapsed_output.numel()); - phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), - collapsed_output.numel(), - collapsed_output.data()); - for_range(functor); - } else { - if (forward) { - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } else { - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } - } - - // resize for the collapsed output - framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute); - collapsed_output.Resize(transposed_output_shape); - framework::Tensor& transposed_output = collapsed_output; - - // reverse the transposition - std::vector reverse_dim_permute(ndim); - for (int i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - TransCompute( - ndim, ctx, transposed_output, out, reverse_dim_permute); -} - -template -struct FFTC2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - exec_fft( - ctx, x, out, axes, normalization, forward); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - exec_fft( - ctx, x, out, axes, normalization, forward); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - if (axes.size() > 1) { - const std::vector c2c_dims(axes.begin(), axes.end() - 1); - Tensor temp; - temp.mutable_data(x->dims(), ctx.GetPlace()); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); - - const std::vector new_axes{axes.back()}; - exec_fft( - ctx, &temp, out, new_axes, normalization, forward); - } else { - exec_fft( - ctx, x, out, axes, normalization, forward); - } - } -}; -#elif defined(PADDLE_WITH_POCKETFFT) - -template -T compute_factor(int64_t size, FFTNormMode normalization) { - constexpr auto one = static_cast(1); - switch (normalization) { - case FFTNormMode::none: - return one; - case 
FFTNormMode::by_n: - return one / static_cast(size); - case FFTNormMode::by_sqrt_n: - return one / std::sqrt(static_cast(size)); - } - PADDLE_THROW( - platform::errors::InvalidArgument("Unsupported normalization type")); -} - -template -struct FFTC2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = typename Ti::value_type; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2c(in_sizes, - in_strides, - in_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = Ti; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(R); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(out_strides.begin(), - out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = x->data(); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::r2c(in_sizes, - in_strides, - out_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = To; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); - { - const int64_t data_size = sizeof(R); 
- std::transform(out_strides.begin(), - out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = out->data(); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= out_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2r(out_sizes, - in_strides, - out_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc deleted file mode 100644 index 91e3880dff0049b324bed7188a90962224a4e25b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/spectral_op.cc +++ /dev/null @@ -1,389 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/spectral_op.h" - -#include "paddle/fluid/operators/spectral_helper.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// FFTC2C -class FFTC2COpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_c2c op."); - AddOutput("Out", "(Tensor), the output tensor of fft_c2c op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddComment(R"DOC( - Compute complex to complex FFT. 
- )DOC"); - } -}; - -class FFTC2COp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_c2c"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2c"); - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size(); i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - ctx->ShareDim("X", /*->*/ "Out"); // only for c2c - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -template -class FFTC2CGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_c2c_grad"); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTC2CGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_c2c_grad"); - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_c2c_grad"); - - ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -// FFTR2C -class FFTR2COpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_r2c op."); - AddOutput("Out", "(Tensor), the output tensor of fft_r2c op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddAttr("onesided", "bool, perform onesided fft."); - AddComment(R"DOC( - Compute real to complex FFT. 
- )DOC"); - } -}; - -class FFTR2COp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_r2c"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_r2c"); - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size() - 1L; i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - - const bool onesided = ctx->Attrs().Get("onesided"); - if (!onesided) { - ctx->ShareDim("X", /*->*/ "Out"); - } else { - framework::DDim out_dim(ctx->GetInputDim("X")); - const int64_t last_fft_axis = axes.back(); - const int64_t last_fft_dim_size = out_dim.at(last_fft_axis); - out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; - ctx->SetOutputDim("Out", out_dim); - } - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(in_dtype, ctx.GetPlace()); - } -}; - -template -class FFTR2CGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_r2c_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTR2CGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_r2c_grad"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_r2c_grad"); - - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_r2c_grad"); - - ctx->ShareDim("X", /*->*/ x_grad_name); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -// FFTC2R -class FFTC2ROpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_c2r op."); - AddOutput("Out", "(Tensor), the output tensor of fft_c2r op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddAttr( - "last_dim_size", - "int", - "Length of the transformed " - "axis of the output. For n output points, last_dim_size//2 + 1 input" - " points are necessary. If the input is longer than this," - " it is cropped. If it is shorter than this, it is padded" - " with zeros. 
If last_dim_size is not given, it is taken to be 2*(m-1)" - " where m is the length of the input along the axis " - "specified by axis.") - .SetDefault(0L); - AddComment(R"DOC( - Compute complex to complex FFT. - )DOC"); - } -}; - -class FFTC2ROp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_c2r"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2r"); - - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size() - 1L; i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - - const int64_t last_dim_size = ctx->Attrs().Get("last_dim_size"); - framework::DDim out_dim(ctx->GetInputDim("X")); - const int64_t last_fft_axis = axes.back(); - if (last_dim_size == 0) { - const int64_t last_fft_dim_size = out_dim.at(last_fft_axis); - const int64_t fft_n_point = (last_fft_dim_size - 1) * 2; - PADDLE_ENFORCE_GT(fft_n_point, - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", fft_n_point)); - out_dim.at(last_fft_axis) = fft_n_point; - } else { - PADDLE_ENFORCE_GT(last_dim_size, - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", last_dim_size)); - out_dim.at(last_fft_axis) = last_dim_size; - } - ctx->SetOutputDim("Out", out_dim); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -template -class FFTC2RGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_c2r_grad"); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTC2RGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_c2r_grad"); - - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_c2r_grad"); - - const auto axes = ctx->Attrs().Get>("axes"); - - const auto out_grad_dim = ctx->GetInputDim(out_grad_name); - framework::DDim x_grad_dim(out_grad_dim); - const int64_t last_fft_axis = axes.back(); - const int64_t last_fft_dim_size = x_grad_dim.at(last_fft_axis); - x_grad_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; - ctx->SetOutputDim(x_grad_name, x_grad_dim); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return framework::OpKernelType(in_dtype, ctx.GetPlace()); - } -}; - -// common functions -FFTNormMode get_norm_from_string(const 
std::string& norm, bool forward) { - if (norm.empty() || norm == "backward") { - return forward ? FFTNormMode::none : FFTNormMode::by_n; - } - - if (norm == "forward") { - return forward ? FFTNormMode::by_n : FFTNormMode::none; - } - - if (norm == "ortho") { - return FFTNormMode::by_sqrt_n; - } - - PADDLE_THROW(platform::errors::InvalidArgument( - "FFT norm string must be 'forward' or 'backward' or 'ortho', " - "received %s", - norm)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(fft_c2c, - ops::FFTC2COp, - ops::FFTC2COpMaker, - ops::FFTC2CGradOpMaker, - ops::FFTC2CGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); - -REGISTER_OPERATOR(fft_c2c_grad, ops::FFTC2CGradOp); -REGISTER_OP_CPU_KERNEL(fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); - -REGISTER_OPERATOR(fft_r2c, - ops::FFTR2COp, - ops::FFTR2COpMaker, - ops::FFTR2CGradOpMaker, - ops::FFTR2CGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); - -REGISTER_OPERATOR(fft_r2c_grad, ops::FFTR2CGradOp); -REGISTER_OP_CPU_KERNEL(fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); - -REGISTER_OPERATOR(fft_c2r, - ops::FFTC2ROp, - ops::FFTC2ROpMaker, - ops::FFTC2RGradOpMaker, - ops::FFTC2RGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); - -REGISTER_OPERATOR(fft_c2r_grad, ops::FFTC2RGradOp); -REGISTER_OP_CPU_KERNEL(fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu deleted file mode 100644 index 661fcc83771f54fd3290499584d5896e76ba8c5b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/spectral_op.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/operators/spectral_op.h" -#include "paddle/fluid/operators/spectral_op.cu.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); - -REGISTER_OP_CUDA_KERNEL(fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); - -REGISTER_OP_CUDA_KERNEL(fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h deleted file mode 100644 index 5ced67691ee0764d38a5457ea261e1f44b48c830..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/spectral_op.cu.h +++ /dev/null @@ -1,1018 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/spectral_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/hipfft.h" -#endif - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/dynload/cufft.h" -#endif - -namespace paddle { -namespace operators { - -using ScalarType = framework::proto::VarType::Type; -const int64_t kMaxFFTNdim = 3; -const int64_t kMaxDataNdim = kMaxFFTNdim + 1; -// This struct is used to easily compute hashes of the -// parameters. It will be the **key** to the plan cache. -struct FFTConfigKey { - // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3 - int64_t signal_ndim_; - // These include additional batch dimension as well. - int64_t sizes_[kMaxDataNdim]; - int64_t input_shape_[kMaxDataNdim]; - int64_t output_shape_[kMaxDataNdim]; - FFTTransformType fft_type_; - ScalarType value_type_; - - FFTConfigKey() = default; - - FFTConfigKey(const std::vector& in_shape, - const std::vector& out_shape, - const std::vector& signal_size, - FFTTransformType fft_type, - ScalarType value_type) { - // Padding bits must be zeroed for hashing - memset(this, 0, sizeof(*this)); - signal_ndim_ = signal_size.size() - 1; - fft_type_ = fft_type; - value_type_ = value_type; - - std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); - std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); - std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); - } -}; - -#if defined(PADDLE_WITH_CUDA) -// An RAII encapsulation of cuFFTHandle -class CuFFTHandle { - ::cufftHandle handle_; - - public: - CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); - } - - CuFFTHandle(const CuFFTHandle& other) = delete; - CuFFTHandle& operator=(const CuFFTHandle& other) = delete; - - CuFFTHandle(CuFFTHandle&& other) = delete; - CuFFTHandle& operator=(CuFFTHandle&& other) = delete; - - ::cufftHandle& get() { return handle_; } - const ::cufftHandle& get() const { return handle_; } - - ~CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); - } -}; - -using plan_size_type = long long int; // NOLINT -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
- explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, - plan_key.fft_type_, - plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, - const int64_t signal_ndim, - FFTTransformType fft_type, - ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, - sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, - sizes.size() - 1)); - - cudaDataType itype, otype, exec_type; - const auto complex_input = has_complex_input(fft_type); - const auto complex_output = has_complex_output(fft_type); - if (dtype == framework::proto::VarType::FP32) { - itype = complex_input ? CUDA_C_32F : CUDA_R_32F; - otype = complex_output ? CUDA_C_32F : CUDA_R_32F; - exec_type = CUDA_C_32F; - } else if (dtype == framework::proto::VarType::FP64) { - itype = complex_input ? CUDA_C_64F : CUDA_R_64F; - otype = complex_output ? CUDA_C_64F : CUDA_R_64F; - exec_type = CUDA_C_64F; - } else if (dtype == framework::proto::VarType::FP16) { - itype = complex_input ? CUDA_C_16F : CUDA_R_16F; - otype = complex_output ? CUDA_C_16F : CUDA_R_16F; - exec_type = CUDA_C_16F; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "cuFFT only support transforms of type float16, float32 and " - "float64")); - } - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cufftXtMakePlanMany(plan(), - signal_ndim, - signal_sizes.data(), - /* inembed */ nullptr, - /* base_istride */ 1, - /* idist */ 1, - itype, - /* onembed */ nullptr, - /* base_ostride */ 1, - /* odist */ 1, - otype, - batch, - &ws_size_t, - exec_type)); - - ws_size = ws_size_t; - } - - FFTConfig(const FFTConfig& other) = delete; - FFTConfig& operator=(const FFTConfig& other) = delete; - - FFTConfig(FFTConfig&& other) = delete; - FFTConfig& operator=(FFTConfig&& other) = delete; - - const cufftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } - - private: - CuFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; - -#elif defined(PADDLE_WITH_HIP) -// An RAII encapsulation of cuFFTHandle -class HIPFFTHandle { - ::hipfftHandle handle_; - - public: - HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); - } - - HIPFFTHandle(const HIPFFTHandle& other) = delete; - HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; - - HIPFFTHandle(HIPFFTHandle&& other) = delete; - HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; - - ::hipfftHandle& get() { return handle_; } - const ::hipfftHandle& get() const { return handle_; } - - ~HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); - } -}; -using plan_size_type = int; -// 
This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. - explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, - plan_key.fft_type_, - plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, - const int64_t signal_ndim, - FFTTransformType fft_type, - ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, - sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, - sizes.size() - 1)); - - hipfftType exec_type = [&] { - if (dtype == framework::proto::VarType::FP32) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_C2C; - case FFTTransformType::R2C: - return HIPFFT_R2C; - case FFTTransformType::C2R: - return HIPFFT_C2R; - } - } else if (dtype == framework::proto::VarType::FP64) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_Z2Z; - case FFTTransformType::R2C: - return HIPFFT_D2Z; - case FFTTransformType::C2R: - return HIPFFT_Z2D; - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); - }(); - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::hipfftMakePlanMany(plan(), - signal_ndim, - signal_sizes.data(), - /* inembed */ nullptr, - /* base_istride */ 1, - /* idist */ 1, - /* onembed */ nullptr, - /* base_ostride */ 1, - /* odist */ 1, - exec_type, - batch, - &ws_size_t)); - - ws_size = ws_size_t; - } - - const hipfftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } - - private: - HIPFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; -#endif - -// Hashing machinery for Key -// Fowler–Noll–Vo hash function -// see -// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function -template -struct KeyHash { - // Key must be a POD because we read out its memory - // contenst as char* when hashing - static_assert(std::is_pod::value, "Key must be plain old data type"); - - size_t operator()(const Key& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (int i = 0; i < static_cast(sizeof(Key)); ++i) { - value ^= ptr[i]; - value *= 0x01000193; - } - return static_cast(value); - } -}; - -template -struct KeyEqual { - // Key must be a POD because we read out its memory - // contenst as char* when comparing - static_assert(std::is_pod::value, "Key must be plain 
old data type"); - - bool operator()(const Key& a, const Key& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(Key)) == 0; - } -}; - -#if CUDA_VERSION < 10000 -// Note that the max plan number for CUDA version < 10 has to be 1023 -// due to a bug that fails on the 1024th plan -constexpr size_t CUFFT_MAX_PLAN_NUM = 1023; -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; -#else -constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); -// The default max cache size chosen for CUDA version > 10 is arbitrary. -// This number puts a limit on how big of a plan cache should we maintain by -// default. Users can always configure it via cufft_set_plan_cache_max_size. -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; -#endif -static_assert(CUFFT_MAX_PLAN_NUM >= 0 && - CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), - "CUFFT_MAX_PLAN_NUM not in size_t range"); -static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && - CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, - "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); - -// This cache assumes that the mapping from key to value never changes. -// This is **NOT** thread-safe. Please use a mutex when using it **AND** the -// value returned from try_emplace_value. -// The contract of using this cache is that try_emplace_value should only be -// used when the max_size is positive. -class FFTConfigCache { - public: - using kv_t = typename std::pair; - using map_t = - typename std::unordered_map, - typename std::list::iterator, - KeyHash, - KeyEqual>; - using map_kkv_iter_t = typename map_t::iterator; - - FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} - - explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } - - FFTConfigCache(const FFTConfigCache& other) = delete; - FFTConfigCache& operator=(const FFTConfigCache& other) = delete; - - FFTConfigCache(FFTConfigCache&& other) noexcept - : _usage_list(std::move(other._usage_list)), - _cache_map(std::move(other._cache_map)), - _max_size(other._max_size) {} - - FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { - _usage_list = std::move(other._usage_list); - _cache_map = std::move(other._cache_map); - _max_size = other._max_size; - return *this; - } - - // If key is in this cache, return the cached config. Otherwise, emplace the - // config in this cache and return it. 
- FFTConfig& lookup(FFTConfigKey params) { - PADDLE_ENFORCE_GT(_max_size, - 0, - platform::errors::InvalidArgument( - "The max size of FFTConfigCache must be great than 0," - "But received is [%d]", - _max_size)); - - map_kkv_iter_t map_it = _cache_map.find(params); - // Hit, put to list front - if (map_it != _cache_map.end()) { - _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); - return map_it->second->second; - } - - // Miss - // remove if needed - if (_usage_list.size() >= _max_size) { - auto last = _usage_list.end(); - last--; - _cache_map.erase(last->first); - _usage_list.pop_back(); - } - - // construct new plan at list front, then insert into _cache_map - _usage_list.emplace_front(std::piecewise_construct, - std::forward_as_tuple(params), - std::forward_as_tuple(params)); - auto kv_it = _usage_list.begin(); - _cache_map.emplace(std::piecewise_construct, - std::forward_as_tuple(kv_it->first), - std::forward_as_tuple(kv_it)); - return kv_it->second; - } - - void clear() { - _cache_map.clear(); - _usage_list.clear(); - } - - void resize(int64_t new_size) { - _set_max_size(new_size); - auto cur_size = _usage_list.size(); - if (cur_size > _max_size) { - auto delete_it = _usage_list.end(); - for (size_t i = 0; i < cur_size - _max_size; i++) { - delete_it--; - _cache_map.erase(delete_it->first); - } - _usage_list.erase(delete_it, _usage_list.end()); - } - } - - size_t size() const { return _cache_map.size(); } - - size_t max_size() const noexcept { return _max_size; } - - std::mutex mutex; - - private: - // Only sets size and does value check. Does not resize the data structures. - void _set_max_size(int64_t new_size) { - // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since - // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check - // first. - PADDLE_ENFORCE_GE( - new_size, - 0, - platform::errors::InvalidArgument( - "cuFFT plan cache size must be non-negative, But received is [%d]", - new_size)); - PADDLE_ENFORCE_LE(new_size, - CUFFT_MAX_PLAN_NUM, - platform::errors::InvalidArgument( - "cuFFT plan cache size can not be larger than [%d], " - "But received is [%d]", - CUFFT_MAX_PLAN_NUM, - new_size)); - _max_size = static_cast(new_size); - } - - std::list _usage_list; - map_t _cache_map; - size_t _max_size; -}; - -static std::vector> plan_caches; -static std::mutex plan_caches_mutex; - -static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { - std::lock_guard guard(plan_caches_mutex); - - if (device_index >= plan_caches.size()) { - plan_caches.resize(device_index + 1); - } - - if (!plan_caches[device_index]) { - plan_caches[device_index] = std::make_unique(); - } - - return *plan_caches[device_index]; -} - -// Calculates the normalization constant -static double fft_normalization_scale(FFTNormMode normalization, - const std::vector& sizes, - const std::vector& dims) { - // auto norm = static_cast(normalization); - if (normalization == FFTNormMode::none) { - return static_cast(1.0); - } - - int64_t signal_numel = 1; - for (auto dim : dims) { - signal_numel *= sizes[dim]; - } - const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) - ? 
std::sqrt(signal_numel) - : static_cast(signal_numel); - return static_cast(1.0 / scale_denom); -} - -template -void exec_normalization(const DeviceContext& ctx, - const Tensor* in, - Tensor* out, - FFTNormMode normalization, - const std::vector& sizes, - const std::vector& axes) { - double scale = fft_normalization_scale(normalization, sizes, axes); - if (scale != 1.0) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto dev = ctx.eigen_device(); - EigenScale::Eval(*dev, - eigen_out, - eigen_in, - static_cast(scale), - static_cast(0), - false); - } else { - framework::TensorCopy(*in, ctx.GetPlace(), out); - } -} - -#if defined(PADDLE_WITH_CUDA) -static FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.dtype())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), - phi::vectorize(output.dims()), - signal_size, - fft_type, - value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_cufft_plan_raw(const FFTConfig& config, - void* in_data, - void* out_data, - bool forward) { - auto& plan = config.plan(); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( - plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -} - -template -void exec_cufft_plan(const DeviceContext& ctx, - const FFTConfig& config, - framework::Tensor* input, - framework::Tensor* output, - bool forward) { - // execute transform plan - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor( - input->data(), input->numel(), input_conj.data()); - for_range(functor); - exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor( - out_conj.data(), output->numel(), output->data()); - for_range(functor); - } else { - exec_cufft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#elif defined(PADDLE_WITH_HIP) - -static FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.type())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), - phi::vectorize(output.dims()), - signal_size, - fft_type, - value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_hipfft_plan_raw(const FFTConfig& config, - void* in_data, - void* out_data, - bool forward) { - auto& plan = config.plan(); - - auto value_type = config.data_type(); - if (value_type == framework::proto::VarType::FP32) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( - plan, - static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - } - } else if (value_type == framework::proto::VarType::FP64) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( - plan, - static_cast(in_data), - static_cast(out_data), - forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); -} - -template -void exec_hipfft_plan(const DeviceContext& ctx, - const FFTConfig& config, - framework::Tensor* input, - framework::Tensor* output, - bool forward) { - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor( - input->data(), input->numel(), input_conj.data()); - for_range(functor); - exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor( - out_conj.data(), output->numel(), output->data()); - for_range(functor); - } else { - exec_hipfft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#endif - -// Execute a general unnormalized fft operation (can be c2c, onesided r2c or -// onesided c2r) -template -void exec_fft(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& dim, - bool forward) { - const auto x_dims = phi::vectorize(X->dims()); - const int64_t ndim = static_cast(X->dims().size()); - auto tensor_place = ctx.GetPlace(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), int{0}); - std::vector is_transformed_dim(ndim); - for (const auto& d : dim) { - is_transformed_dim[d] = true; - } - auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), [&](int64_t d) { - return !is_transformed_dim[d]; - }); - std::sort(dim_permute.begin(), batch_end); - std::copy(dim.cbegin(), dim.cend(), batch_end); - - // transpose input according to dim permutation - auto transposed_input_shape = X->dims().transpose(dim_permute); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - transposed_input.mutable_data(tensor_place); - TransCompute( - ndim, ctx, *X, &transposed_input, dim_permute); - - // Reshape batch dimensions into a single dimension - const int64_t signal_ndim = static_cast(dim.size()); - std::vector collapsed_input_shape(signal_ndim + 1); - - auto transposed_input_shape_ = phi::vectorize(transposed_input_shape); - const int64_t batch_dims = ndim - signal_ndim; - auto batch_size = - std::accumulate(transposed_input_shape_.begin(), - transposed_input_shape_.begin() + batch_dims, - static_cast(1), - std::multiplies()); - collapsed_input_shape[0] = batch_size; - - std::copy(transposed_input_shape_.begin() + batch_dims, - transposed_input_shape_.end(), - collapsed_input_shape.begin() + 1); - - framework::Tensor& collapsed_input = transposed_input; - collapsed_input.Resize(phi::make_ddim(collapsed_input_shape)); - - // 
make a collpased output - const auto out_dims = phi::vectorize(out->dims()); - std::vector collapsed_output_shape(1 + signal_ndim); - collapsed_output_shape[0] = batch_size; - for (size_t i = 0; i < dim.size(); ++i) { - collapsed_output_shape[i + 1] = out_dims[dim[i]]; - } - framework::Tensor collapsed_output; - collapsed_output.Resize(phi::make_ddim(collapsed_output_shape)); - collapsed_output.mutable_data(tensor_place); - - FFTConfig* config = nullptr; - -#if defined(PADDLE_WITH_CUDA) - std::unique_ptr config_ = nullptr; - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - bool using_cache = false; -#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200) - using_cache = true; -#endif - - if (using_cache) { - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - } else { - config_ = std::make_unique(key); - config = config_.get(); - } - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cufftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_cufft_plan( - ctx, *config, &collapsed_input, &collapsed_output, forward); - -#elif defined(PADDLE_WITH_HIP) - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_hipfft_plan( - ctx, *config, &collapsed_input, &collapsed_output, forward); -#endif - - // Inverting output by reshape and transpose to original batch and dimension - auto transposed_out_shape = out->dims().transpose(dim_permute); - - collapsed_output.Resize(transposed_out_shape); - auto& transposed_output = collapsed_output; - - std::vector reverse_dim_permute(ndim); - for (size_t i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - - TransCompute( - ndim, ctx, transposed_output, out, reverse_dim_permute); -} - -// Use the optimized path to perform single R2C or C2R if transformation dim is -// supported by cuFFT -static bool use_optimized_fft_path(const std::vector& axes) { - // For performance reason, when axes starts with (0, 1), do not use the - // optimized path. 
- if (axes.size() > kMaxFFTNdim || - (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { - return false; - } else { - return true; - } -} - -template -struct FFTC2CFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - if (axes.empty()) { - framework::TensorCopy(*X, ctx.GetPlace(), out); - return; - } - - framework::Tensor* p_out = out; - std::vector out_dims = phi::vectorize(X->dims()); - std::vector working_axes(axes.begin(), axes.end()); - std::vector first_dims; - size_t max_dims; - framework::Tensor working_tensor; - working_tensor.mutable_data(X->dims(), ctx.GetPlace()); - framework::Tensor* p_working_tensor = &working_tensor; - framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor); - - while (true) { - max_dims = - std::min(static_cast(kMaxFFTNdim), working_axes.size()); - first_dims.assign(working_axes.end() - max_dims, working_axes.end()); - - exec_fft( - ctx, p_working_tensor, p_out, first_dims, forward); - working_axes.resize(working_axes.size() - max_dims); - first_dims.clear(); - - if (working_axes.empty()) { - break; - } - - std::swap(p_out, p_working_tensor); - } - exec_normalization( - ctx, p_out, out, normalization, out_dims, axes); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - std::vector in_dims = phi::vectorize(X->dims()); - std::vector out_dims = phi::vectorize(out->dims()); - - if (use_optimized_fft_path(axes)) { - framework::Tensor x_copy(X->type()); - x_copy.mutable_data(X->dims(), ctx.GetPlace()); - framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); - exec_fft(ctx, &x_copy, out, axes, forward); - } else { - framework::Tensor temp_tensor; - temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); - const std::vector dims(axes.begin(), axes.end() - 1); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); - - exec_fft( - ctx, &temp_tensor, out, {axes.back()}, forward); - } - exec_normalization( - ctx, out, out, normalization, out_dims, axes); - } -}; - -// n dimension real to complex FFT use cufft lib -template -struct FFTR2CFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - // Step1: R2C transform on the last dimension - framework::Tensor* r2c_out = out; - const std::vector last_dim{axes.back()}; - std::vector out_dims = phi::vectorize(out->dims()); - exec_fft(ctx, X, r2c_out, last_dim, forward); - - // Step2: C2C transform on the remaining dimension - framework::Tensor c2c_out; - if (axes.size() > 1) { - c2c_out.mutable_data(out->dims(), ctx.GetPlace()); - std::vector remain_dim(axes.begin(), axes.end() - 1); - FFTC2CFunctor fft_c2c_func; - fft_c2c_func( - ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, forward); - } - - const auto in_sizes = phi::vectorize(X->dims()); - framework::Tensor* norm_tensor = axes.size() > 1 ? 
&c2c_out : r2c_out; - exec_normalization( - ctx, norm_tensor, out, normalization, in_sizes, axes); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h deleted file mode 100644 index 9296f997584dd7895d8f706421f3a829a53d02bb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/spectral_op.h +++ /dev/null @@ -1,507 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#define NOMINMAX // to use std::min std::max correctly on windows -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/padding.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -enum class FFTNormMode : int64_t { - none, // No normalization - by_sqrt_n, // Divide by sqrt(signal_size) - by_n, // Divide by signal_size -}; - -FFTNormMode get_norm_from_string(const std::string& norm, bool forward); - -// Enum representing the FFT type -enum class FFTTransformType : int64_t { - C2C = 0, // Complex-to-complex - R2C, // Real-to-complex - C2R, // Complex-to-real -}; - -// Create transform type enum from bools representing if input and output are -// complex -inline FFTTransformType GetFFTTransformType( - framework::proto::VarType::Type input_dtype, - framework::proto::VarType::Type output_dtype) { - auto complex_input = framework::IsComplexType(input_dtype); - auto complex_output = framework::IsComplexType(output_dtype); - if (complex_input && complex_output) { - return FFTTransformType::C2C; - } else if (complex_input && !complex_output) { - return FFTTransformType::C2R; - } else if (!complex_input && complex_output) { - return FFTTransformType::R2C; - } - PADDLE_THROW( - platform::errors::InvalidArgument("Real to real FFTs are not supported")); -} - -// Returns true if the transform type has complex input -inline bool has_complex_input(FFTTransformType type) { - switch (type) { - case FFTTransformType::C2C: - case FFTTransformType::C2R: - return true; - - case FFTTransformType::R2C: - return false; - } - PADDLE_THROW(platform::errors::InvalidArgument("Unknown FFTTransformType")); -} - -// Returns true if the transform type has complex output -inline bool has_complex_output(FFTTransformType type) { - switch (type) { - case FFTTransformType::C2C: - 
case FFTTransformType::R2C: - return true; - - case FFTTransformType::C2R: - return false; - } - PADDLE_THROW(platform::errors::InvalidArgument("Unknown FFTTransformType")); -} - -template -struct FFTFillConjGradFunctor { - T* input_; - const size_t axis_; - const int64_t* strides_; - const size_t double_length_; - - FFTFillConjGradFunctor(T* input, - size_t axis, - const int64_t* strides, - size_t double_length) - : input_(input), - axis_(axis), - strides_(strides), - double_length_(double_length) {} - - HOSTDEVICE void operator()(size_t index) { - size_t offtset = index; // back - size_t index_i; - for (size_t i = 0; i <= axis_; i++) { - index_i = offtset / strides_[i]; - offtset %= strides_[i]; - } - - if ((0 < index_i) && (index_i < double_length_ + 1)) { - input_[index] *= static_cast(2); - } - } -}; - -template -struct FFTC2CFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -template -struct FFTR2CFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -template -struct FFTC2RFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -// Giving a linear destination index and strides of tensor, get_idx return the -// corresponding linear position of source tensor. -// The linear index is the position of flatten tensor. -// Giving a linear destination index and strides of tensor, get_idx return the -// corresponding linear position of source tensor. -// The linear index is the position of flatten tensor. -HOSTDEVICE inline int64_t get_src_idx(const int64_t dst_idx, - const int64_t* dst_strides, - const int64_t* dst_shape, - const int64_t* src_strides, - const bool* is_fft_axis, - const bool conj, - const int64_t rank) { - int64_t src_idx = 0; - int64_t quotient = dst_idx; - int64_t remainder = 0; - - for (int64_t i = 0; i < rank; i++) { - remainder = quotient % dst_strides[i]; - quotient = quotient / dst_strides[i]; - if (conj && is_fft_axis[i]) { - src_idx += ((dst_shape[i] - quotient) % dst_shape[i]) * src_strides[i]; - } else { - src_idx += src_strides[i] * quotient; - } - quotient = remainder; - } - - return src_idx; -} - -HOSTDEVICE inline bool is_conj_part(const int64_t dst_idx, - const int64_t* dst_strides, - const int64_t last_axis, - const int64_t last_axis_size) { - int64_t quotient = dst_idx; - int64_t remainder = 0; - - for (int64_t i = 0; i < last_axis + 1; i++) { - remainder = quotient % dst_strides[i]; - quotient = quotient / dst_strides[i]; - - if ((i == last_axis) && (quotient > last_axis_size - 1)) { - return true; - } - - quotient = remainder; - } - - return false; -} - -// FFTFillConjFunctor fill the destination tensor with source tensor and -// conjugate symmetry element of source tensor . 
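For context, the conjugate fill described in the comment above relies on the Hermitian symmetry of a real signal's spectrum: X[k] == conj(X[n - k]) for k = 1..n-1, so a two-sided output can be rebuilt from the one-sided half of n/2 + 1 bins. Below is a minimal 1-D sketch of that reconstruction; it is a standalone illustration, not part of the deleted functor, which does the same thing element-wise on n-d tensors via strides.

#include <complex>
#include <cstdint>
#include <vector>

// Rebuild a full length-n spectrum from its one-sided half (n/2 + 1 bins),
// mirroring what FFTFillConjFunctor does per destination element.
std::vector<std::complex<double>> fill_conj_1d(
    const std::vector<std::complex<double>>& half, int64_t n) {
  std::vector<std::complex<double>> full(n);
  for (int64_t k = 0; k < n; ++k) {
    if (k < static_cast<int64_t>(half.size())) {
      full[k] = half[k];                 // copied (non-conjugate) part
    } else {
      full[k] = std::conj(half[n - k]);  // conjugate-symmetric part
    }
  }
  return full;
}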
-// Use framework::ForRange to iterate destination element with -// supporting different device -template -struct FFTFillConjFunctor { - FFTFillConjFunctor(const C* src_data, - C* dst_data, - const int64_t* src_strides, - const int64_t* dst_strides, - const int64_t* dst_shape, - const bool* is_fft_axis, - const int64_t last_axis, - const int64_t last_axis_size, - const int64_t rank) - : src_data_(src_data), - dst_data_(dst_data), - src_strides_(src_strides), - dst_strides_(dst_strides), - dst_shape_(dst_shape), - is_fft_axis_(is_fft_axis), - last_axis_(last_axis), - last_axis_size_(last_axis_size), - rank_(rank) {} - HOSTDEVICE void operator()(int64_t dst_idx) { - if (is_conj_part(dst_idx, dst_strides_, last_axis_, last_axis_size_)) { - const auto conj_idx = get_src_idx(dst_idx, - dst_strides_, - dst_shape_, - src_strides_, - is_fft_axis_, - true, - rank_); - auto src_value = src_data_[conj_idx]; - auto conj_value = C(src_value.real, -src_value.imag); - dst_data_[dst_idx] = conj_value; - } else { - const auto copy_idx = get_src_idx(dst_idx, - dst_strides_, - dst_shape_, - src_strides_, - is_fft_axis_, - false, - rank_); - dst_data_[dst_idx] = src_data_[copy_idx]; - } - } - - const C* src_data_; - C* dst_data_; - const int64_t* src_strides_; - const int64_t* dst_strides_; - const int64_t* dst_shape_; - const bool* is_fft_axis_; - const int64_t last_axis_; - const int64_t last_axis_size_; - const int64_t rank_; -}; - -template -void fill_conj(const DeviceContext& ctx, - const Tensor* src, - Tensor* dst, - const std::vector& axes) { - std::vector src_strides_v = - phi::vectorize(phi::stride(src->dims())); - std::vector dst_strides_v = - phi::vectorize(phi::stride(dst->dims())); - std::vector dst_shape_v = phi::vectorize(dst->dims()); - const auto src_data = src->data(); - auto dst_data = dst->data(); - const auto last_axis = axes.back(); - const auto last_axis_size = dst->dims().at(last_axis) / 2 + 1; - const int64_t rank = dst->dims().size(); - auto _is_fft_axis = std::make_unique(rank); - for (const auto i : axes) { - _is_fft_axis[i] = true; - } - -#if defined(__NVCC__) || defined(__HIPCC__) - const thrust::device_vector src_strides_g(src_strides_v); - const auto src_strides = thrust::raw_pointer_cast(src_strides_g.data()); - const thrust::device_vector dst_strides_g(dst_strides_v); - const auto dst_strides = thrust::raw_pointer_cast(dst_strides_g.data()); - const thrust::device_vector dst_shape_g(dst_shape_v); - const auto dst_shape = thrust::raw_pointer_cast(dst_shape_g.data()); - const thrust::device_vector is_fft_axis_g(_is_fft_axis.get(), - _is_fft_axis.get() + rank); - const auto p_is_fft_axis = thrust::raw_pointer_cast(is_fft_axis_g.data()); -#else - const auto src_strides = src_strides_v.data(); - const auto dst_strides = dst_strides_v.data(); - const auto dst_shape = dst_shape_v.data(); - const auto p_is_fft_axis = _is_fft_axis.get(); -#endif - platform::ForRange for_range(ctx, dst->numel()); - FFTFillConjFunctor fill_conj_functor(src_data, - dst_data, - src_strides, - dst_strides, - dst_shape, - p_is_fft_axis, - last_axis, - last_axis_size, - rank); - for_range(fill_conj_functor); -} - -template -class FFTC2CKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* x = ctx.Input("X"); - auto* 
y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2CFunctor fft_c2c_func; - fft_c2c_func(dev_ctx, x, y, axes, normalization, forward); - } -}; - -template -class FFTC2CGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - dx->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2CFunctor fft_c2c_func; - fft_c2c_func(dev_ctx, dy, dx, axes, normalization, !forward); - } -}; - -template -class FFTR2CKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const bool onesided = ctx.Attr("onesided"); - const auto* x = ctx.Input("X"); - auto* y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTR2CFunctor fft_r2c_func; - - if (onesided) { - fft_r2c_func(dev_ctx, x, y, axes, normalization, forward); - } else { - framework::DDim onesided_dims(y->dims()); - const int64_t onesided_last_axis_size = y->dims().at(axes.back()) / 2 + 1; - onesided_dims.at(axes.back()) = onesided_last_axis_size; - framework::Tensor onesided_out; - onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); - fft_r2c_func(dev_ctx, x, &onesided_out, axes, normalization, forward); - fill_conj(dev_ctx, &onesided_out, y, axes); - } - } -}; - -template -class FFTR2CGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - const auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const bool onesided = ctx.Attr("onesided"); - - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - framework::Tensor complex_dx; - complex_dx.mutable_data(dx->dims(), ctx.GetPlace()); - - auto normalization = get_norm_from_string(norm_str, forward); - FFTC2CFunctor fft_c2c_func; - - if (!onesided) { - fft_c2c_func(dev_ctx, dy, &complex_dx, axes, normalization, !forward); - } else { - framework::Tensor full_dy; - full_dy.mutable_data(dx->dims(), ctx.GetPlace()); - auto zero_length = static_cast(full_dy.dims().at(axes.back()) - - dy->dims().at(axes.back())); - auto rank = dy->dims().size(); - - std::vector pads(rank * 2, 0); - pads[axes.back() * 2 + 1] = zero_length; - - phi::funcs::PaddingFunctor( - rank, - ctx.template device_context(), - pads, - static_cast(0), - *dy, - &full_dy); - fft_c2c_func( - dev_ctx, &full_dy, &complex_dx, axes, normalization, !forward); - } - framework::TransComplexToReal( - framework::TransToProtoVarType(dx->dtype()), - framework::TransToProtoVarType(complex_dx.dtype()), - complex_dx, - dx); - } -}; - -template -class 
FFTC2RKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* x = ctx.Input("X"); - auto* y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2RFunctor fft_c2r_func; - fft_c2r_func(dev_ctx, x, y, axes, normalization, forward); - } -}; - -template -class FFTC2RGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - C* pdx = dx->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTR2CFunctor fft_r2c_func; - fft_r2c_func(dev_ctx, dy, dx, axes, normalization, !forward); - - const int64_t double_length = - dy->dims()[axes.back()] - dx->dims()[axes.back()]; - const framework::DDim strides = phi::stride(dx->dims()); - -#if defined(__NVCC__) || defined(__HIPCC__) - const thrust::device_vector strides_g(phi::vectorize(strides)); - const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data()); -#else - const int64_t* pstrides = strides.Get(); -#endif - - FFTFillConjGradFunctor func(pdx, axes.back(), pstrides, double_length); - size_t limit = dx->numel(); - platform::ForRange for_range(dev_ctx, limit); - for_range(func); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index d708abe3d0f8c16414572933e0f3b3b052e159c2..986911a1391456b270ba0ab1ce5b1b93010a14d9 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -14,8 +14,6 @@ #include "paddle/fluid/operators/stft_op.h" -#include "paddle/fluid/operators/spectral_helper.h" - namespace paddle { namespace operators { class StftOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/stft_op.cu b/paddle/fluid/operators/stft_op.cu index 9edee0f66c51428ab2481e132338a24fbed916f8..bd48112f0737eca993c482c4f413dcb9ba7058ea 100644 --- a/paddle/fluid/operators/stft_op.cu +++ b/paddle/fluid/operators/stft_op.cu @@ -13,7 +13,6 @@ // limitations under the License. 
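As a side note on the FFTC2RGradKernel shown above: the gradient of a one-sided C2R transform is computed as an R2C transform of the output gradient, after which the interior frequency bins are scaled by 2 (FFTFillConjGradFunctor), because each of them stands in for two conjugate bins of the full two-sided spectrum. A minimal 1-D sketch of that doubling step, assuming the R2C result is already available; `double_length` follows the kernel above (full length minus one-sided length):

#include <complex>
#include <cstdint>
#include <vector>

// Scale the interior bins of a one-sided gradient spectrum by 2; bin 0 (DC)
// and, for even-length signals, the Nyquist bin are left untouched.
void double_interior_bins(std::vector<std::complex<double>>* grad,
                          int64_t double_length) {
  const int64_t size = static_cast<int64_t>(grad->size());
  for (int64_t k = 1; k <= double_length && k < size; ++k) {
    (*grad)[k] *= 2.0;
  }
}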
#include "paddle/fluid/operators/stft_op.h" -#include "paddle/fluid/operators/spectral_op.cu.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h index bbd9b137699dc7612d45d478cd48e8a81f6b99f2..fb2ca31608cd70f6a2036de2db6e7a67ae60751a 100644 --- a/paddle/fluid/operators/stft_op.h +++ b/paddle/fluid/operators/stft_op.h @@ -18,8 +18,11 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/spectral_op.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" #include "paddle/phi/kernels/funcs/frame_functor.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -76,25 +79,25 @@ class StftKernel : public framework::OpKernel { ctx, &frames, window, axes.back(), MulFunctor(), &frames_w); // FFTR2C - FFTNormMode normalization; + phi::funcs::FFTNormMode normalization; if (normalized) { - normalization = get_norm_from_string("ortho", true); + normalization = phi::funcs::get_norm_from_string("ortho", true); } else { - normalization = get_norm_from_string("backward", true); + normalization = phi::funcs::get_norm_from_string("backward", true); } - FFTR2CFunctor fft_r2c_func; + phi::funcs::FFTR2CFunctor fft_r2c_func; if (onesided) { - fft_r2c_func(dev_ctx, &frames_w, out, axes, normalization, true); + fft_r2c_func(dev_ctx, frames_w, out, axes, normalization, true); } else { framework::DDim onesided_dims(out->dims()); const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1; onesided_dims.at(axes.back()) = onesided_axis_size; Tensor onesided_out; onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); - fft_r2c_func( - dev_ctx, &frames_w, &onesided_out, axes, normalization, true); - fill_conj(dev_ctx, &onesided_out, out, axes); + fft_r2c_func(dev_ctx, frames_w, &onesided_out, axes, normalization, true); + phi::funcs::FFTFillConj( + dev_ctx, &onesided_out, out, axes); } } }; @@ -131,17 +134,17 @@ class StftGradKernel : public framework::OpKernel { complex_d_frames_w.mutable_data(d_frames_dims, ctx.GetPlace()); // dy -> d_frames_w - FFTNormMode normalization; + phi::funcs::FFTNormMode normalization; if (normalized) { - normalization = get_norm_from_string("ortho", true); + normalization = phi::funcs::get_norm_from_string("ortho", true); } else { - normalization = get_norm_from_string("backward", true); + normalization = phi::funcs::get_norm_from_string("backward", true); } - FFTC2CFunctor fft_c2c_func; + phi::funcs::FFTC2CFunctor fft_c2c_func; if (!onesided) { fft_c2c_func( - dev_ctx, dy, &complex_d_frames_w, axes, normalization, false); + dev_ctx, *dy, &complex_d_frames_w, axes, normalization, false); } else { Tensor full_dy; full_dy.mutable_data(d_frames_dims, ctx.GetPlace()); @@ -153,20 +156,11 @@ class StftGradKernel : public framework::OpKernel { pads[axes.back() * 2 + 1] = zero_length; phi::funcs::PaddingFunctor( - rank, - ctx.template device_context(), - pads, - static_cast(0), - *dy, - &full_dy); + rank, dev_ctx, pads, static_cast(0), *dy, &full_dy); fft_c2c_func( - dev_ctx, &full_dy, &complex_d_frames_w, axes, normalization, false); + dev_ctx, full_dy, &complex_d_frames_w, axes, normalization, false); } - framework::TransComplexToReal( - framework::TransToProtoVarType(d_frames_w.dtype()), - 
framework::TransToProtoVarType(complex_d_frames_w.dtype()), - complex_d_frames_w, - &d_frames_w); + phi::RealKernel(dev_ctx, complex_d_frames_w, &d_frames_w); // d_frames_w -> d_frames Tensor d_frames; diff --git a/paddle/phi/api/yaml/api.yaml b/paddle/phi/api/yaml/api.yaml index 1156206ee4b51a4d9dd103f20e406a9ad69d72c4..1674ecd2c6da7dedc00ee6576980e4d334272562 100644 --- a/paddle/phi/api/yaml/api.yaml +++ b/paddle/phi/api/yaml/api.yaml @@ -98,6 +98,33 @@ func : erf backward : erf_grad +- api : fft_c2c + args : (Tensor x, int64_t[] axes, str normalization, bool forward) + output : Tensor + infer_meta : + func : FFTC2CInferMeta + kernel : + func : fft_c2c + backward : fft_c2c_grad + +- api : fft_c2r + args : (Tensor x, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size=0L) + output : Tensor + infer_meta : + func : FFTC2RInferMeta + kernel : + func : fft_c2r + backward : fft_c2r_grad + +- api : fft_r2c + args : (Tensor x, int64_t[] axes, str normalization, bool forward, bool onesided) + output : Tensor + infer_meta : + func : FFTR2CInferMeta + kernel : + func : fft_r2c + backward : fft_r2c_grad + - api : lgamma args : (Tensor x) output : Tensor(out) @@ -105,7 +132,7 @@ func : UnchangedInferMeta kernel : func : lgamma - backward : lgamma_grad + backward : lgamma_grad - api : mv args : (Tensor x, Tensor vec) diff --git a/paddle/phi/api/yaml/api_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml index 5300d551f8ef8987307f3477b2cdd9d5a9fa2ea7..afa55479b3570805170fec1cd7bb381d8558725b 100644 --- a/paddle/phi/api/yaml/api_compat.yaml +++ b/paddle/phi/api/yaml/api_compat.yaml @@ -31,6 +31,15 @@ float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, int workspace_size_MB = 512, bool exhaustive_search = false] +- api : conv2d + extra : + attrs : [bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", bool fuse_alpha = false, bool fuse_beta = false, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = 512, bool exhaustive_search = false] + - api : cross inputs : {x : X, y : Y} @@ -112,3 +121,15 @@ x : X outputs : out : Out + +- api: fft_c2c + inputs: {x: X} + outputs: {out: Out} + +- api: fft_c2r + inputs: {x: X} + outputs: {out: Out} + +- api: fft_r2c + inputs: {x: X} + outputs: {out: Out} diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 53cdc97a716d7ac928254add06162ae3e7622130..30c2c9f9e12d39daf33d76924783c36bf77a6faa 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -105,6 +105,38 @@ func : erf_grad data_type : out_grad +- backward_api : fft_c2c_grad + forward: fft_c2c(Tensor x, int64_t[] axes, str normalization, bool forward) -> Tensor(out) + args : (Tensor out_grad, int64_t[] axes, str normalization, bool forward) + output: Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : fft_c2c_grad + +- backward_api : fft_c2r_grad + forward: fft_c2r(Tensor x, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size) -> Tensor(out) + args : (Tensor out_grad, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size) + output: Tensor(x_grad) + infer_meta : + func : 
FFTC2RGradInferMeta + kernel : + func : fft_c2r_grad + data_type: out_grad + +- backward_api : fft_r2c_grad + forward: fft_r2c(Tensor x, int64_t[] axes, str normalization, bool forward, bool onesided) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int64_t[] axes, str normalization, bool forward, bool onesided) + output: Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : fft_r2c_grad + data_type: out_grad + no_need_buffer: x + - backward_api : lgamma_grad forward : lgamma(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 975d55889c7173ddb1a00fa278ba465291c7275e..9877149dc52bd8308c459f8f8f53a8f090ab57e7 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -97,4 +97,18 @@ inline DataType ToComplexType(const DataType& type) { type)); } } + +inline DataType ToRealType(const DataType& type) { + switch (type) { + case DataType::COMPLEX64: + return DataType::FLOAT32; + case DataType::COMPLEX128: + return DataType::FLOAT64; + default: + PADDLE_THROW(errors::Unimplemented( + "Can not transform data type (%s) to real type, now only support " + "complex64 and complex128 value.", + type)); + } +} } // namespace phi diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index e3898adf56c55803ab41d1e7b4853e5e644ac50c..d5c88a2f9e393990551a2f48cb5fba82cc9d79c8 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/axis_utils.h" namespace phi { @@ -285,6 +286,47 @@ void EigvalshGradInferMeta(const MetaTensor& out_v, } } +void FFTC2RGradInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL(out, + phi::errors::InvalidArgument( + "Output of fft_c2r _grad should not be null.")); + const phi::DDim x_dim = x.dims(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToComplexType(x.dtype())); + + phi::DDim out_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + if (last_dim_size > 0) { + out_dim.at(last_fft_axis) = last_dim_size / 2 + 1; + } else if (config.is_runtime) { + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; + } else { + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = + last_fft_dim_size == -1 ? 
-1 : last_fft_dim_size / 2 + 1; + } + out->set_dims(out_dim); +} + void FillDiagonalGradInferMeta(const MetaTensor& dout, float value, int offset, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 15ab16eff1c19d55619a05ddafa23865804784ba..f21b942bf952c5d14e58ea679d954c9dc3742d0a 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -137,6 +137,14 @@ void EigvalshGradInferMeta(const MetaTensor& out_v, bool is_test, MetaTensor* x_grad); +void FFTC2RGradInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig = MetaConfig()); + void FillDiagonalGradInferMeta( const MetaTensor& dout, float value, int offset, bool wrap, MetaTensor* dx); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index a7ed6e71d0503a4b5b40743dc9c82ccaee32be81..bdcef965be2585adc523835b9b18fdc645af3178 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -866,6 +866,112 @@ void FillDiagonalInferMeta( out->set_dtype(x.dtype()); } +void FFTC2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_c2c should not be null.")); + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + const phi::DDim x_dim = x.dims(); + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + out->share_meta(x); +} + +void FFTC2RInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_c2r should not be null.")); + const phi::DDim x_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + size_t signal_dims = axes.size(); + for (size_t i = 0; i < signal_dims - 1; i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToRealType(x.dtype())); + phi::DDim out_dim = x_dim; + + if (last_dim_size > 0) { + out_dim.at(last_fft_axis) = last_dim_size; + } else if (config.is_runtime) { + const int64_t input_last_dim_size = x_dim[last_fft_axis]; + const int64_t fft_n_point = (input_last_dim_size - 1) * 2; + PADDLE_ENFORCE_GT( + fft_n_point, + 0, + phi::errors::InvalidArgument("Invalid fft n-point (%d).", fft_n_point)); + out_dim.at(last_fft_axis) = fft_n_point; + } else { + const int64_t input_last_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = + input_last_dim_size == -1 ? 
-1 : (input_last_dim_size - 1) * 2; + } + out->set_dims(out_dim); +} + +void FFTR2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_r2c should not be null.")); + const phi::DDim x_dim = x.dims(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToComplexType(x.dtype())); + if (!onesided) { + out->share_dims(x); + } else { + phi::DDim out_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; + out->set_dims(out_dim); + } +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 5a8bcfb2b702fbd6dd267aac727aefe3d6e75bb3..f2bb43e952d5f8e41157b1bb48a197b42666bbd2 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -135,6 +135,29 @@ void ExpandInferMeta(const MetaTensor& x, void FillDiagonalInferMeta( const MetaTensor& x, float value, int offset, bool wrap, MetaTensor* out); +void FFTC2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + MetaTensor* out, + MetaConfig = MetaConfig()); + +void FFTC2RInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig = MetaConfig()); + +void FFTR2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + MetaTensor* out, + MetaConfig = MetaConfig()); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 7b64658d571cf8260c280f2883fb956043e5e70e..64ffdbe885356d6861407e80a3b25ff5cba93297 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -65,7 +65,8 @@ set(COMMON_KERNEL_DEPS matrix_solve phi_dynload_warpctc sequence_padding - sequence_scale) + sequence_scale + fft) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h index 0294dc950deb15bb2e453e16cdfdf38c34d7ac9d..41be3e43a303d1233f2f18bbf2cd4af6f34d31be 100644 --- a/paddle/phi/kernels/assign_kernel.h +++ b/paddle/phi/kernels/assign_kernel.h @@ -18,6 +18,7 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { @@ -26,6 +27,16 @@ void AssignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +DenseTensor Assign(const Context& dev_ctx, const DenseTensor& x) { + DenseTensor out; + MetaTensor meta_out(&out); + MetaTensor meta_x(x); + UnchangedInferMeta(meta_x, &meta_out); + AssignKernel(dev_ctx, x, &out); + return out; +} + // In order to be compatible with the `AsDispensable` input in the original // assign op maker, the input parameter here needs to be dispensable, but // 
this looks weird diff --git a/paddle/phi/kernels/cpu/fft_grad_kernel.cc b/paddle/phi/kernels/cpu/fft_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..aecaf6c5c13f8f78e99bbde895be6e219a201bf0 --- /dev/null +++ b/paddle/phi/kernels/cpu/fft_grad_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c_grad, + CPU, + ALL_LAYOUT, + phi::FFTC2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL( + fft_c2r_grad, CPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} +PD_REGISTER_KERNEL(fft_r2c_grad, + CPU, + ALL_LAYOUT, + phi::FFTR2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/fft_kernel.cc b/paddle/phi/kernels/cpu/fft_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5e3e350fc085176a8becca856d635344bde129f --- /dev/null +++ b/paddle/phi/kernels/cpu/fft_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
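To summarize the shape/dtype contract encoded by the new infer meta functions and the grad-kernel registrations above: fft_r2c maps a real tensor to a complex one and, when onesided, shrinks the last transformed axis to n/2 + 1; fft_c2r does the reverse and produces (m - 1) * 2 points unless last_dim_size overrides it; the grad kernels are keyed on the dtype of out_grad, which is why fft_c2r_grad is registered for float/double while the other two take complex types. A small self-contained check of these rules (illustration only; the helper names onesided_size/two_sided_size are made up here, while ToComplexType/ToRealType are the helpers from paddle/phi/core/utils/data_type.h in this patch):

#include <cassert>
#include <cstdint>
#include "paddle/phi/core/utils/data_type.h"

// Last-axis size rules used by FFTR2CInferMeta / FFTC2RInferMeta.
inline int64_t onesided_size(int64_t n) { return n / 2 + 1; }
inline int64_t two_sided_size(int64_t m) { return (m - 1) * 2; }

void check_fft_meta_rules() {
  assert(onesided_size(8) == 5);   // one-sided rfft of 8 real points
  assert(two_sided_size(5) == 8);  // irfft back to 8 real points
  // dtype mapping used when inferring complex/real output types
  assert(phi::ToComplexType(phi::DataType::FLOAT32) == phi::DataType::COMPLEX64);
  assert(phi::ToRealType(phi::DataType::COMPLEX128) == phi::DataType::FLOAT64);
}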
+ +#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c, + CPU, + ALL_LAYOUT, + phi::FFTC2CKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_c2r, + CPU, + ALL_LAYOUT, + phi::FFTC2RKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_r2c, CPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/scale_kernel.cc b/paddle/phi/kernels/cpu/scale_kernel.cc index e929b5bd7219b60acb226374f67a0bc511c41723..421aae270ee591c13dd205dd6909a8eb7bc3cef5 100644 --- a/paddle/phi/kernels/cpu/scale_kernel.cc +++ b/paddle/phi/kernels/cpu/scale_kernel.cc @@ -62,4 +62,6 @@ PD_REGISTER_KERNEL(scale, int8_t, int16_t, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/fft_grad_kernel.h b/paddle/phi/kernels/fft_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8f5237f1fd08c7aa8c492dbbbc32dab6ebae2fc6 --- /dev/null +++ b/paddle/phi/kernels/fft_grad_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void FFTC2CGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* x_grad); + +template +void FFTC2RGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* x_grad); + +template +void FFTR2CGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/fft_kernel.h b/paddle/phi/kernels/fft_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6105ec4d0b3ec9e72bf4842ac192c312badc48a1 --- /dev/null +++ b/paddle/phi/kernels/fft_kernel.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
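The complex64/complex128 registrations added to the scale kernel above exist so the FFT normalization step can rescale a complex spectrum. A minimal sketch of that use, assuming a CPU context and a transform over signal_numel points; this is an illustration of the idea, not the patch's own normalization helper:

#include <cstdint>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/scale_kernel.h"

template <typename C>  // C = phi::dtype::complex<float> or complex<double>
void normalize_by_n(const phi::CPUContext& ctx,
                    const phi::DenseTensor& spectrum,
                    int64_t signal_numel,
                    phi::DenseTensor* out) {
  // divide every complex bin by the number of transformed points
  phi::ScaleKernel<C, phi::CPUContext>(
      ctx, spectrum, phi::Scalar(1.0 / signal_numel), /*bias=*/0.0f,
      /*bias_after_scale=*/true, out);
}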
+ +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void FFTC2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* out); + +template +void FFTC2RKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* out); + +template +void FFTR2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 646d65cf8a63eac052a841eed1c952c1f0ceb44e..afa46f1dacaefeb28946f840d778a5969b554bed 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -16,3 +16,20 @@ math_library(pooling DEPS dense_tensor) math_library(segment_pooling) math_library(sequence2batch) math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) + +if(WITH_GPU OR WITH_ROCM) + if(MKL_FOUND AND WITH_ONEMKL) + math_library(fft spectral_op.cu DEPS dynload_cuda dynload_mklrt + dense_tensor) + target_include_directories(fft PRIVATE ${MKL_INCLUDE}) + else() + math_library(fft spectral_op.cu DEPS dynload_cuda dense_tensor pocketfft) + endif() +else() + if(MKL_FOUND AND WITH_ONEMKL) + mathp_library(fft DEPS dynload_mklrt dense_tensor) + target_include_directories(fft PRIVATE ${MKL_INCLUDE}) + else() + math_library(fft DEPS dense_tensor pocketfft) + endif() +endif() diff --git a/paddle/phi/kernels/funcs/cufft_util.h b/paddle/phi/kernels/funcs/cufft_util.h new file mode 100644 index 0000000000000000000000000000000000000000..584425c6112a5dfe3c7f0ad7b7294a0be1812402 --- /dev/null +++ b/paddle/phi/kernels/funcs/cufft_util.h @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
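For reference, the normalization strings accepted by the kernels declared above follow the usual FFT convention: "backward" applies no scaling on the forward transform, "forward" divides by n, and "ortho" divides by sqrt(n), corresponding to FFTNormMode::none / by_n / by_sqrt_n. A minimal sketch of that mapping for a forward transform, inferred from get_norm_from_string and the compute_factor helper elsewhere in this patch rather than copied from it:

#include <cmath>
#include <string>

// Scaling factor applied after a *forward* transform of signal_numel points.
double forward_norm_factor(const std::string& normalization,
                           size_t signal_numel) {
  if (normalization == "forward") {
    return 1.0 / static_cast<double>(signal_numel);             // by_n
  }
  if (normalization == "ortho") {
    return 1.0 / std::sqrt(static_cast<double>(signal_numel));  // by_sqrt_n
  }
  return 1.0;  // "backward" (the default): no scaling on the forward pass
}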
+ +#pragma once +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/dynload/cufft.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_key.h" + +namespace phi { +namespace funcs { +namespace detail { + +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + public: + CuFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cufftCreate(&handle_)); + } + + CuFFTHandle(const CuFFTHandle& other) = delete; + CuFFTHandle& operator=(const CuFFTHandle& other) = delete; + + CuFFTHandle(CuFFTHandle&& other) = delete; + CuFFTHandle& operator=(CuFFTHandle&& other) = delete; + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { phi::dynload::cufftDestroy(handle_); } + + private: + ::cufftHandle handle_; +}; + +// Returns true if the transform type has complex input +inline bool has_complex_input(FFTTransformType type) { + switch (type) { + case FFTTransformType::C2C: + case FFTTransformType::C2R: + return true; + + case FFTTransformType::R2C: + return false; + } + PADDLE_THROW(phi::errors::InvalidArgument("Unknown FFTTransformType")); +} + +// Returns true if the transform type has complex output +inline bool has_complex_output(FFTTransformType type) { + switch (type) { + case FFTTransformType::C2C: + case FFTTransformType::R2C: + return true; + + case FFTTransformType::C2R: + return false; + } + PADDLE_THROW(phi::errors::InvalidArgument("Unknown FFTTransformType")); +} + +class FFTConfig { + public: + using plan_size_type = long long int; // NOLINT (be consistent with cufft) + explicit FFTConfig(const FFTConfigKey& key) + : FFTConfig( + std::vector(key.sizes_, key.sizes_ + key.signal_ndim_ + 1), + key.fft_type_, + key.value_type_) {} + // sizes are full signal, including batch size and always two-sided + FFTConfig(const std::vector& sizes, + FFTTransformType fft_type, + DataType precison) + : fft_type_(fft_type), precision_(precison) { + const auto batch_size = static_cast(sizes[0]); + std::vector signal_sizes(sizes.cbegin() + 1, sizes.cend()); + const int signal_ndim = sizes.size() - 1; + + cudaDataType itype, otype, exec_type; + const bool complex_input = has_complex_input(fft_type); + const bool complex_output = has_complex_output(fft_type); + if (precison == DataType::FLOAT32) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (precison == DataType::FLOAT64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? 
CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Only transforms of type float32 and float64 are supported.")); + } + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftXtMakePlanMany(plan(), + signal_ndim, + signal_sizes.data(), + /* inembed */ nullptr, + /* base_istride */ 1L, + /* idist */ 1L, + itype, + /* onembed */ nullptr, + /* base_ostride */ 1L, + /* odist */ 1L, + otype, + batch_size, + &ws_size_, + exec_type)); + } + + FFTConfig(const FFTConfig& other) = delete; + FFTConfig& operator=(const FFTConfig& other) = delete; + + FFTConfig(FFTConfig&& other) = delete; + FFTConfig& operator=(FFTConfig&& other) = delete; + + const cufftHandle& plan() const { return plan_.get(); } + FFTTransformType transform_type() const { return fft_type_; } + DataType data_type() const { return precision_; } + size_t workspace_size() const { return ws_size_; } + + private: + CuFFTHandle plan_; + size_t ws_size_; // workspace size in bytes + FFTTransformType fft_type_; + DataType precision_; +}; + +// NOTE: R2C is forward-only, C2R is backward only +static void exec_plan(const FFTConfig& config, + void* in_data, + void* out_data, + bool forward) { + auto& plan = config.plan(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cufftXtExec( + plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft.cc b/paddle/phi/kernels/funcs/fft.cc new file mode 100644 index 0000000000000000000000000000000000000000..9895ff406cb89bbf6dc39b1cc7c5780e8a9ff10d --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
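A minimal usage sketch of the FFTConfig / exec_plan helpers above for a batched 1-D float32 C2C transform. This is illustration only: the stream and work-area calls are assumed from how the GPU exec_fft path uses them (FFTConfig disables cuFFT's own workspace allocation, so a framework-allocated work area of config.workspace_size() bytes must be attached before execution), and the dynload wrappers are assumed to expose cufftSetStream/cufftSetWorkArea.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/cufft_util.h"

void run_c2c_1d(const phi::GPUContext& ctx, void* in, void* out,
                long long batch, long long n) {
  // sizes include the batch dimension and describe the full two-sided signal
  phi::funcs::detail::FFTConfig config(
      {batch, n}, phi::funcs::FFTTransformType::C2C, phi::DataType::FLOAT32);
  // bind the plan to the current stream
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::cufftSetStream(config.plan(), ctx.stream()));
  // provide a work area allocated by the framework allocator
  phi::DenseTensor workspace;
  workspace.Resize({static_cast<int64_t>(config.workspace_size())});
  ctx.Alloc<uint8_t>(&workspace);
  PADDLE_ENFORCE_GPU_SUCCESS(
      phi::dynload::cufftSetWorkArea(config.plan(), workspace.data()));
  // run the transform; forward selects CUFFT_FORWARD vs CUFFT_INVERSE
  phi::funcs::detail::exec_plan(config, in, out, /*forward=*/true);
}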
+ +#include "paddle/phi/kernels/funcs/fft.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" +#if defined(PADDLE_WITH_ONEMKL) +#include "paddle/phi/kernels/funcs/mkl_fft_utils.h" +#elif defined(PADDLE_WITH_POCKETFFT) +#define POCKETFFT_CACHE_SIZE 16 +#include "extern_pocketfft/pocketfft_hdronly.h" +#endif + +namespace phi { +namespace funcs { +#if defined(PADDLE_WITH_ONEMKL) + +namespace detail { +// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) +template +void exec_fft(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + const phi::DDim& in_sizes = x.dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = axes.size(); + const int batch_ndim = ndim - signal_ndim; + const phi::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { + return !is_transformed_dim[axis]; + }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to the permutation + DenseTensor transposed_input = + Transpose(ctx, x, dim_permute); + const phi::DDim& transposed_input_shape = transposed_input.dims(); + + // batch size + int64_t batch_size = 1L; + for (int i = 0; i < batch_ndim; i++) { + batch_size *= transposed_input_shape[i]; + } + + // make an collapsed input: collapse batch axes for input + std::vector collapsed_input_shape_; + collapsed_input_shape_.reserve(1 + signal_ndim); + collapsed_input_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_input_shape_.push_back(in_sizes[axes[i]]); + } + phi::DDim collapsed_input_shape = phi::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + DenseTensor& collapsed_input = transposed_input; + + // make a collapsed output + phi::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + std::vector collapsed_output_shape_; + collapsed_output_shape_.reserve(1 + signal_ndim); + collapsed_output_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_.push_back(out_sizes[axes[i]]); + } + phi::DDim collapsed_output_shape = phi::make_ddim(collapsed_output_shape_); + DenseTensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + ctx.Alloc(&collapsed_output); + + // make a DFTI_DESCRIPTOR + std::vector signal_sizes(1 + signal_ndim); + signal_sizes[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + signal_sizes[1 + i] = + std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); + } + const phi::DDim input_stride = phi::stride(collapsed_input_shape); + const phi::DDim output_stride = phi::stride(collapsed_output_shape); + + DftiDescriptor desc = plan_mkl_fft(x.dtype(), + out->dtype(), + input_stride, + output_stride, + signal_sizes, + normalization, + forward); + // execute the transform + const FFTTransformType fft_type = GetFFTTransformType(x.dtype(), out->type()); + if (fft_type == FFTTransformType::C2R && forward) { + ConjKernel(ctx, collapsed_input, &collapsed_input); + 
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } else if (fft_type == FFTTransformType::R2C && !forward) { + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + ConjKernel(ctx, collapsed_output, &collapsed_output); + } else { + if (forward) { + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } else { + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } + } + + // resize for the collapsed output + collapsed_output.Resize(transposed_output_shape); + phi::DenseTensor& transposed_output = collapsed_output; + + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + } + TransposeKernel( + ctx, transposed_output, reverse_dim_permute, out); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (axes.size() > 1) { + DenseTensor c2c_result = EmptyLike(ctx, x); + + const std::vector c2c_dims(axes.begin(), axes.end() - 1); + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, x, &c2c_result, c2c_dims, normalization, forward); + + const std::vector new_axes{axes.back()}; + detail::exec_fft( + ctx, c2c_result, out, new_axes, normalization, forward); + } else { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } + } +}; + +#elif defined(PADDLE_WITH_POCKETFFT) +namespace detail { +template +static T compute_factor(size_t size, FFTNormMode normalization) { + constexpr auto one = static_cast(1); + switch (normalization) { + case FFTNormMode::none: + return one; + case FFTNormMode::by_n: + return one / static_cast(size); + case FFTNormMode::by_sqrt_n: + return one / std::sqrt(static_cast(size)); + } + PADDLE_THROW(phi::errors::InvalidArgument("Unsupported normalization type")); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = typename Ti::value_type; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + + const auto* in_data = reinterpret_cast(x.data()); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet factor + 
size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= in_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::c2c(in_sizes, + in_strides, + in_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = Ti; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(out_strides.begin(), + out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto* in_data = x.data(); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= in_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::r2c(in_sizes, + in_strides, + out_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = To; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(out_strides.begin(), + out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto* in_data = reinterpret_cast(x.data()); + auto* out_data = out->data(); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= out_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::c2r(out_sizes, + in_strides, + out_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; +#endif + +using complex64_t = phi::dtype::complex; +using complex128_t = phi::dtype::complex; +template struct FFTC2CFunctor; +template struct FFTC2CFunctor; +template struct FFTC2RFunctor; +template struct FFTC2RFunctor; +template struct FFTR2CFunctor; +template struct FFTR2CFunctor; +} // namespace funcs +} // namespace phi diff --git 
a/paddle/phi/kernels/funcs/fft.cu b/paddle/phi/kernels/funcs/fft.cu new file mode 100644 index 0000000000000000000000000000000000000000..edac497bc8e8b7c164108f0d856fd0aa0dcc981a --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.cu @@ -0,0 +1,346 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_cache.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/assign_kernel.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/scale_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { +namespace funcs { +namespace detail { + +// Use the optimized path to perform single R2C or C2R if transformation dim is +// supported by cuFFT +static bool use_optimized_fft_path(const std::vector& axes) { + // For performance reason, when axes starts with (0, 1), do not use the + // optimized path. + if (axes.size() > kMaxFFTNdim || + (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { + return false; + } else { + return true; + } +} + +static double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); + } + + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? 
std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); +} + +template +void exec_normalization(const phi::GPUContext& ctx, + const DenseTensor& in, + DenseTensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + const double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + ScaleKernel(ctx, in, scale, 0, true, out); + } else { + AssignKernel(ctx, in, out); + } +} + +bool has_large_prime_factor(int64_t n) { + constexpr int64_t first_large_prime = 11; + const std::array prime_radices{{2, 3, 5, 7}}; + for (auto prime : prime_radices) { + if (n < first_large_prime) { + return false; + } + while (n % prime == 0) { + n /= prime; + } + } + return n != 1; +} + +#if defined(PADDLE_WITH_CUDA) +inline bool use_cache(const int64_t* signal_size) { + bool using_cache = true; + int cufft_version; + phi::dynload::cufftGetVersion(&cufft_version); + if (10300 <= cufft_version && cufft_version <= 10400) { + using_cache = std::none_of( + signal_size + 1, signal_size + kMaxDataNdim, [](int64_t dim_size) { + return has_large_prime_factor(dim_size); + }); + } + return using_cache; +} +#elif defined(PADDLE_WITH_HIP) +inline bool use_cache(const int64_t* signal_size) { return true; } +#endif + +// up to 3d unnormalized fft transform (c2r, r2c, c2c) +template +void exec_fft(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + bool forward) { + const phi::DDim& in_sizes = x.dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = axes.size(); + const int batch_ndim = ndim - signal_ndim; + const phi::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { + return !is_transformed_dim[axis]; + }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to the permutation + DenseTensor transposed_input = + Transpose(ctx, x, dim_permute); + const phi::DDim transposed_input_shape = transposed_input.dims(); + + // batch size + int64_t batch_size = 1L; + for (int i = 0; i < batch_ndim; i++) { + batch_size *= transposed_input_shape[i]; + } + + // make an collapsed input: collapse batch axes for input + std::vector collapsed_input_shape_; + collapsed_input_shape_.reserve(1 + signal_ndim); + collapsed_input_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_input_shape_.push_back(in_sizes[axes[i]]); + } + phi::DDim collapsed_input_shape = phi::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + DenseTensor& collapsed_input = transposed_input; + + // make a collapsed output + phi::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + std::vector collapsed_output_shape_; + collapsed_output_shape_.reserve(1 + signal_ndim); + collapsed_output_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_.push_back(out_sizes[axes[i]]); + } + phi::DDim collapsed_output_shape = phi::make_ddim(collapsed_output_shape_); + DenseTensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + ctx.Alloc(&collapsed_output); + + FFTConfigKey key = + create_fft_configkey(collapsed_input, 
collapsed_output, signal_ndim); + int64_t device_id = ctx.GetPlace().GetDeviceId(); + FFTConfig* config = nullptr; + std::unique_ptr config_ = nullptr; + bool using_cache = use_cache(key.sizes_); + + if (using_cache) { + FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); + std::unique_lock guard(plan_cache.mutex, std::defer_lock); + guard.lock(); + config = &(plan_cache.lookup(key)); + } else { + config_ = std::make_unique(key); + config = config_.get(); + } + + const int64_t workspace_size = static_cast(config->workspace_size()); + DenseTensor workspace_tensor = Empty(ctx, {workspace_size}); + + // prepare cufft for execution +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetWorkArea(config->plan(), workspace_tensor.data())); +#elif defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetWorkArea(config->plan(), workspace_tensor.data())); +#endif + + // execution of fft plan + const FFTTransformType fft_type = config->transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + ConjKernel(ctx, collapsed_input, &collapsed_input); + exec_plan(*config, collapsed_input.data(), collapsed_output.data(), false); + } else if (fft_type == FFTTransformType::R2C && !forward) { + exec_plan(*config, collapsed_input.data(), collapsed_output.data(), true); + ConjKernel(ctx, collapsed_output, &collapsed_output); + } else { + exec_plan( + *config, collapsed_input.data(), collapsed_output.data(), forward); + } + + // resize for the collapsed output + collapsed_output.Resize(transposed_output_shape); + phi::DenseTensor& transposed_output = collapsed_output; + + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + } + TransposeKernel( + ctx, transposed_output, reverse_dim_permute, out); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (axes.empty()) { + AssignKernel(ctx, x, out); + return; + } + + std::vector working_axes = axes; + std::sort(working_axes.begin(), working_axes.end()); + std::vector first_dims; + size_t max_dims; + + DenseTensor working_tensor = x; // shallow copy + while (true) { + max_dims = std::min(static_cast(detail::kMaxFFTNdim), + working_axes.size()); + first_dims.assign(working_axes.end() - max_dims, working_axes.end()); + + detail::exec_fft(ctx, working_tensor, out, first_dims, forward); + working_axes.resize(working_axes.size() - max_dims); + first_dims.clear(); + + if (working_axes.empty()) { + break; + } + + if (working_tensor.IsSharedWith(x)) { + working_tensor = std::move(*out); + *out = EmptyLike(ctx, x); + } else { + std::swap(*out, working_tensor); + } + } + + std::vector out_dims = phi::vectorize(x.dims()); + detail::exec_normalization( + ctx, *out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + std::vector out_dims = phi::vectorize(out->dims()); + + if (detail::use_optimized_fft_path(axes)) { + DenseTensor x_copy = Assign(ctx, x); + 
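// A defensive copy is taken here because cuFFT/hipFFT complex-to-real
// transforms may overwrite their input buffer; running the single-shot C2R
// plan on a copy keeps the caller-visible tensor x unmodified.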
detail::exec_fft(ctx, x_copy, out, axes, forward); + } else { + DenseTensor c2c_result = EmptyLike(ctx, x); + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, + x, + &c2c_result, + {axes.begin(), axes.end() - 1}, + FFTNormMode::none, + forward); + detail::exec_fft(ctx, c2c_result, out, {axes.back()}, forward); + } + detail::exec_normalization( + ctx, *out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (detail::use_optimized_fft_path(axes)) { + detail::exec_fft(ctx, x, out, axes, forward); + } else { + DenseTensor r2c_result = EmptyLike(ctx, *out); + detail::exec_fft(ctx, x, &r2c_result, {axes.back()}, forward); + + FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, + r2c_result, + out, + {axes.begin(), axes.end() - 1}, + FFTNormMode::none, + forward); + } + + const auto in_dims = phi::vectorize(x.dims()); + detail::exec_normalization( + ctx, *out, out, normalization, in_dims, axes); + } +}; + +using complex64_t = phi::dtype::complex; +using complex128_t = phi::dtype::complex; +template struct FFTC2CFunctor; +template struct FFTC2CFunctor; +template struct FFTC2RFunctor; +template struct FFTC2RFunctor; +template struct FFTR2CFunctor; +template struct FFTR2CFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft.h b/paddle/phi/kernels/funcs/fft.h new file mode 100644 index 0000000000000000000000000000000000000000..3f9e1191ebb3e69cb9aa9176bd64abc822569d80 --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.h @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { +namespace funcs { + +enum class FFTNormMode : int8_t { + none, // No normalization + by_sqrt_n, // Divide by sqrt(signal_size) + by_n, // Divide by signal_size +}; + +inline FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { + if (norm.empty() || norm == "backward") { + return forward ? FFTNormMode::none : FFTNormMode::by_n; + } + + if (norm == "forward") { + return forward ? 
FFTNormMode::by_n : FFTNormMode::none; + } + + if (norm == "ortho") { + return FFTNormMode::by_sqrt_n; + } + + PADDLE_THROW(phi::errors::InvalidArgument( + "FFT norm string must be 'forward' or 'backward' or 'ortho', " + "received %s", + norm)); +} + +enum class FFTTransformType : int8_t { + C2C = 0, // Complex-to-complex + R2C, // Real-to-complex + C2R, // Complex-to-real +}; + +// Create transform type enum from bools representing if input and output are +// complex +inline FFTTransformType GetFFTTransformType(DataType input_dtype, + DataType output_dtype) { + auto complex_input = IsComplexType(input_dtype); + auto complex_output = IsComplexType(output_dtype); + if (complex_input && complex_output) { + return FFTTransformType::C2C; + } else if (complex_input && !complex_output) { + return FFTTransformType::C2R; + } else if (!complex_input && complex_output) { + return FFTTransformType::R2C; + } + PADDLE_THROW( + phi::errors::InvalidArgument("Real to real FFTs are not supported")); +} + +template +struct FFTC2CFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; + +template +struct FFTR2CFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; + +template +struct FFTC2RFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_cache.h b/paddle/phi/kernels/funcs/fft_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..51e90a6c0d95b5e802fa76cfc92c8b7bbf44943e --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_cache.h @@ -0,0 +1,189 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#if defined(PADDLE_WITH_CUDA) +#include "paddle/phi/kernels/funcs/cufft_util.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/kernels/funcs/hipfft_util.h" +#endif + +namespace phi { +namespace funcs { +namespace detail { + +#if CUDA_VERSION < 10000 +// Note that the max plan number for CUDA version < 10 has to be 1023 +// due to a bug that fails on the 1024th plan +constexpr size_t CUFFT_MAX_PLAN_NUM = 1023; +constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else +constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); +// The default max cache size chosen for CUDA version > 10 is arbitrary. +// This number puts a limit on how big of a plan cache should we maintain by +// default. Users can always configure it via cufft_set_plan_cache_max_size. 
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif + +static_assert(CUFFT_MAX_PLAN_NUM >= 0 && + CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && + CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +class FFTConfigCache { + public: + using kv_t = typename std::pair; + using map_t = + typename std::unordered_map, + typename std::list::iterator, + KeyHash, + KeyEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } + + FFTConfigCache(const FFTConfigCache& other) = delete; + FFTConfigCache& operator=(const FFTConfigCache& other) = delete; + + FFTConfigCache(FFTConfigCache&& other) noexcept + : _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. + FFTConfig& lookup(FFTConfigKey params) { + PADDLE_ENFORCE_GT(_max_size, + 0, + phi::errors::InvalidArgument( + "The max size of FFTConfigCache must be great than 0," + "But received is [%d]", + _max_size)); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + + private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. 
+ PADDLE_ENFORCE_GE( + new_size, + 0, + phi::errors::InvalidArgument( + "cuFFT plan cache size must be non-negative, But received is [%d]", + new_size)); + PADDLE_ENFORCE_LE(new_size, + CUFFT_MAX_PLAN_NUM, + phi::errors::InvalidArgument( + "cuFFT plan cache size can not be larger than [%d], " + "But received is [%d]", + CUFFT_MAX_PLAN_NUM, + new_size)); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +static std::vector> plan_caches; +static std::mutex plan_caches_mutex; + +static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { + std::lock_guard guard(plan_caches_mutex); + + if (device_index >= plan_caches.size()) { + plan_caches.resize(device_index + 1); + } + + if (!plan_caches[device_index]) { + plan_caches[device_index] = std::make_unique(); + } + + return *plan_caches[device_index]; +} +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_fill_conj.h b/paddle/phi/kernels/funcs/fft_fill_conj.h new file mode 100644 index 0000000000000000000000000000000000000000..91d859020f88b93d9a656accea4525c25e62d485 --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_fill_conj.h @@ -0,0 +1,219 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "thrust/device_vector.h" +#endif + +namespace phi { +namespace funcs { + +// Giving a linear destination index and strides of tensor, get_idx return the +// corresponding linear position of source tensor. +// The linear index is the position of flatten tensor. +// Giving a linear destination index and strides of tensor, get_idx return the +// corresponding linear position of source tensor. +// The linear index is the position of flatten tensor. 
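// Illustrative example (hypothetical shapes): for a row-major destination of
// shape [4, 3] with strides [3, 1] and the FFT axis being axis 0, destination
// index 10 decodes to coordinates (3, 1). Reflecting the FFT axis gives
// (4 - 3) % 4 = 1, so in the conjugate branch the value is read from source
// coordinates (1, 1) and conjugated, which is the Hermitian-symmetry relation
// X[n] = conj(X[N - n]) used to rebuild the full spectrum from the onesided
// half.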
+HOSTDEVICE inline int64_t get_src_idx(const int64_t dst_idx, + const int64_t* dst_strides, + const int64_t* dst_shape, + const int64_t* src_strides, + const bool* is_fft_axis, + const bool conj, + const int64_t rank) { + int64_t src_idx = 0; + int64_t quotient = dst_idx; + int64_t remainder = 0; + + for (int64_t i = 0; i < rank; i++) { + remainder = quotient % dst_strides[i]; + quotient = quotient / dst_strides[i]; + if (conj && is_fft_axis[i]) { + src_idx += ((dst_shape[i] - quotient) % dst_shape[i]) * src_strides[i]; + } else { + src_idx += src_strides[i] * quotient; + } + quotient = remainder; + } + + return src_idx; +} + +HOSTDEVICE inline bool is_conj_part(const int64_t dst_idx, + const int64_t* dst_strides, + const int64_t last_axis, + const int64_t last_axis_size) { + int64_t quotient = dst_idx; + int64_t remainder = 0; + + for (int64_t i = 0; i < last_axis + 1; i++) { + remainder = quotient % dst_strides[i]; + quotient = quotient / dst_strides[i]; + + if ((i == last_axis) && (quotient > last_axis_size - 1)) { + return true; + } + + quotient = remainder; + } + + return false; +} + +// FFTFillConjFunctor fill the destination tensor with source tensor and +// conjugate symmetry element of source tensor . +// Use framework::ForRange to iterate destination element with +// supporting different device +template +struct FFTFillConjFunctor { + FFTFillConjFunctor(const C* src_data, + C* dst_data, + const int64_t* src_strides, + const int64_t* dst_strides, + const int64_t* dst_shape, + const bool* is_fft_axis, + const int64_t last_axis, + const int64_t last_axis_size, + const int64_t rank) + : src_data_(src_data), + dst_data_(dst_data), + src_strides_(src_strides), + dst_strides_(dst_strides), + dst_shape_(dst_shape), + is_fft_axis_(is_fft_axis), + last_axis_(last_axis), + last_axis_size_(last_axis_size), + rank_(rank) {} + HOSTDEVICE void operator()(int64_t dst_idx) { + if (is_conj_part(dst_idx, dst_strides_, last_axis_, last_axis_size_)) { + const auto conj_idx = get_src_idx(dst_idx, + dst_strides_, + dst_shape_, + src_strides_, + is_fft_axis_, + true, + rank_); + auto src_value = src_data_[conj_idx]; + auto conj_value = C(src_value.real, -src_value.imag); + dst_data_[dst_idx] = conj_value; + } else { + const auto copy_idx = get_src_idx(dst_idx, + dst_strides_, + dst_shape_, + src_strides_, + is_fft_axis_, + false, + rank_); + dst_data_[dst_idx] = src_data_[copy_idx]; + } + } + + const C* src_data_; + C* dst_data_; + const int64_t* src_strides_; + const int64_t* dst_strides_; + const int64_t* dst_shape_; + const bool* is_fft_axis_; + const int64_t last_axis_; + const int64_t last_axis_size_; + const int64_t rank_; +}; + +template +void FFTFillConj(const DeviceContext& ctx, + const DenseTensor* src, + DenseTensor* dst, + const std::vector& axes) { + std::vector src_strides_v = + phi::vectorize(phi::stride(src->dims())); + std::vector dst_strides_v = + phi::vectorize(phi::stride(dst->dims())); + std::vector dst_shape_v = phi::vectorize(dst->dims()); + const auto src_data = src->data(); + auto dst_data = dst->data(); + const auto last_axis = axes.back(); + const auto last_axis_size = dst->dims().at(last_axis) / 2 + 1; + const int64_t rank = dst->dims().size(); + auto _is_fft_axis = std::make_unique(rank); + for (const auto i : axes) { + _is_fft_axis[i] = true; + } + +#if defined(__NVCC__) || defined(__HIPCC__) + const thrust::device_vector src_strides_g(src_strides_v); + const auto src_strides = thrust::raw_pointer_cast(src_strides_g.data()); + const thrust::device_vector 
dst_strides_g(dst_strides_v); + const auto dst_strides = thrust::raw_pointer_cast(dst_strides_g.data()); + const thrust::device_vector dst_shape_g(dst_shape_v); + const auto dst_shape = thrust::raw_pointer_cast(dst_shape_g.data()); + const thrust::device_vector is_fft_axis_g(_is_fft_axis.get(), + _is_fft_axis.get() + rank); + const auto p_is_fft_axis = thrust::raw_pointer_cast(is_fft_axis_g.data()); +#else + const auto src_strides = src_strides_v.data(); + const auto dst_strides = dst_strides_v.data(); + const auto dst_shape = dst_shape_v.data(); + const auto p_is_fft_axis = _is_fft_axis.get(); +#endif + ForRange for_range(ctx, dst->numel()); + FFTFillConjFunctor fill_conj_functor(src_data, + dst_data, + src_strides, + dst_strides, + dst_shape, + p_is_fft_axis, + last_axis, + last_axis_size, + rank); + for_range(fill_conj_functor); +} + +template +struct FFTFillConjGradFunctor { + T* input_; + const size_t axis_; + const int64_t* strides_; + const size_t double_length_; + + FFTFillConjGradFunctor(T* input, + size_t axis, + const int64_t* strides, + size_t double_length) + : input_(input), + axis_(axis), + strides_(strides), + double_length_(double_length) {} + + HOSTDEVICE void operator()(size_t index) { + size_t offtset = index; // back + size_t index_i; + for (size_t i = 0; i <= axis_; i++) { + index_i = offtset / strides_[i]; + offtset %= strides_[i]; + } + + if ((0 < index_i) && (index_i < double_length_ + 1)) { + input_[index] *= static_cast(2); + } + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_key.h b/paddle/phi/kernels/funcs/fft_key.h new file mode 100644 index 0000000000000000000000000000000000000000..5893cfc6ba019f3b90c5e84f2d8c3482c530fc17 --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_key.h @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/fft.h" + +namespace phi { +namespace funcs { +namespace detail { + +const int64_t kMaxFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxFFTNdim + 1; + +struct FFTConfigKey { + int signal_ndim_; // 1 <= signal_ndim <= kMaxFFTNdim + // These include additional batch dimension as well. 
+ int64_t sizes_[kMaxDataNdim]; + int64_t input_shape_[kMaxDataNdim]; + int64_t output_shape_[kMaxDataNdim]; + FFTTransformType fft_type_; + DataType value_type_; + + using shape_t = std::vector; + FFTConfigKey() = default; + + FFTConfigKey(const shape_t& in_shape, + const shape_t& out_shape, + const shape_t& signal_size, + FFTTransformType fft_type, + DataType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_size.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); + std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); + std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); + } +}; + +// Hashing machinery for Key +// Fowler–Noll–Vo hash function +// see +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct KeyHash { + // Key must be a POD because we read out its memory + // contenst as char* when hashing + static_assert(std::is_pod::value, "Key must be plain old data type"); + + size_t operator()(const Key& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (int i = 0; i < static_cast(sizeof(Key)); ++i) { + value ^= ptr[i]; + value *= 0x01000193; + } + return static_cast(value); + } +}; + +template +struct KeyEqual { + // Key must be a POD because we read out its memory + // contenst as char* when comparing + static_assert(std::is_pod::value, "Key must be plain old data type"); + + bool operator()(const Key& a, const Key& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Key)) == 0; + } +}; + +static FFTConfigKey create_fft_configkey(const DenseTensor& input, + const DenseTensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + DataType input_dtype = input.dtype(); + const auto value_type = + IsComplexType(input_dtype) ? ToRealType(input_dtype) : input_dtype; + const auto fft_type = GetFFTTransformType(input.dtype(), output.dtype()); + // signal sizes + std::vector signal_size(signal_ndim + 1); + + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + FFTConfigKey key(phi::vectorize(input.dims()), + phi::vectorize(output.dims()), + signal_size, + fft_type, + value_type); + return key; +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/hipfft_util.h b/paddle/phi/kernels/funcs/hipfft_util.h new file mode 100644 index 0000000000000000000000000000000000000000..6583a97f17a1d8b358e190ca81f52eb60d5f14d6 --- /dev/null +++ b/paddle/phi/kernels/funcs/hipfft_util.h @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
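// The KeyHash in fft_key.h above hashes the raw bytes of the zero-initialized
// POD FFTConfigKey with 32-bit FNV-1a. A minimal standalone sketch of that
// byte-wise hash, applied to a hypothetical POD key (DemoKey and hash_key are
// illustrative names, not Paddle APIs):

#include <cstddef>
#include <cstdint>
#include <cstring>

// 32-bit FNV-1a over an arbitrary byte range.
inline uint32_t fnv1a(const void* data, std::size_t n) {
  const unsigned char* p = static_cast<const unsigned char*>(data);
  uint32_t h = 0x811C9DC5u;  // FNV offset basis
  for (std::size_t i = 0; i < n; ++i) {
    h ^= p[i];
    h *= 0x01000193u;  // FNV prime
  }
  return h;
}

struct DemoKey {  // hypothetical key, stands in for FFTConfigKey
  int ndim;
  int64_t sizes[3];
};

inline uint32_t hash_key(const DemoKey& k) {
  // As with FFTConfigKey, padding bytes must have been zeroed (memset) when
  // the key was built, so that equal keys hash to equal values.
  return fnv1a(&k, sizeof(k));
}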
+ +#pragma once +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/dynload/hipfft.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_key.h" + +namespace phi { +namespace funcs { +namespace detail { + +// An RAII encapsulation of hipFFTHandle +class HIPFFTHandle { + public: + HIPFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftCreate(&handle_)); + } + + HIPFFTHandle(const HIPFFTHandle& other) = delete; + HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; + + HIPFFTHandle(HIPFFTHandle&& other) = delete; + HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; + + ::hipfftHandle& get() { return handle_; } + const ::hipfftHandle& get() const { return handle_; } + + ~HIPFFTHandle() { phi::dynload::hipfftDestroy(handle_); } + + private: + ::hipfftHandle handle_; +}; + +class FFTConfig { + public: + using plan_size_type = int; + explicit FFTConfig(const FFTConfigKey& key) + : FFTConfig( + std::vector(key.sizes_, key.sizes_ + key.signal_ndim_ + 1), + key.fft_type_, + key.value_type_) {} + FFTConfig(const std::vector& sizes, + FFTTransformType fft_type, + DataType precision) + : fft_type_(fft_type), precision_(precision) { + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + const auto batch_size = static_cast(sizes[0]); + const int signal_ndim = sizes.size() - 1; + + hipfftType exec_type = [&]() { + if (precision == DataType::FLOAT32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (precision == DataType::FLOAT64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(phi::errors::InvalidArgument( + "Only transforms of type float32 and float64 are supported.")); + }(); + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftMakePlanMany(plan(), + signal_ndim, + signal_sizes.data(), + /* inembed */ nullptr, + /* base_istride */ 1, + /* idist */ 1, + /* onembed */ nullptr, + /* base_ostride */ 1, + /* odist */ 1, + exec_type, + batch_size, + &ws_size_)); + } + + const hipfftHandle& plan() const { return plan_.get(); } + FFTTransformType transform_type() const { return fft_type_; } + DataType data_type() const { return precision_; } + size_t workspace_size() const { return ws_size_; } + + private: + HIPFFTHandle plan_; + size_t ws_size_; // workspace size in bytes + FFTTransformType fft_type_; + DataType precision_; +}; + +// NOTE: R2C is forward-only, C2R is backward only +static void exec_plan(const FFTConfig& config, + void* in_data, + void* out_data, + bool forward) { + const hipfftHandle& plan = config.plan(); + + DataType value_type = config.data_type(); + if (value_type == DataType::FLOAT32) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecC2C( + plan, + static_cast(in_data), + static_cast(out_data), + forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftExecR2C(plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftExecC2R(plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + } + } else if (value_type == DataType::FLOAT64) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecZ2Z( + plan, + static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecD2Z( + plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecZ2D( + plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + } + } + PADDLE_THROW(phi::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/mkl_fft_utils.h b/paddle/phi/kernels/funcs/mkl_fft_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..dbc0678ab7ae5a0e23725a5145d86dba04485c1b --- /dev/null +++ b/paddle/phi/kernels/funcs/mkl_fft_utils.h @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
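// A simplified, self-contained sketch of the plan-cache scheme used by
// FFTConfigCache in fft_cache.h above: a lookup hit moves the entry to the
// front of a usage list, and a miss evicts the least recently used plan once
// the cache is full. Names are illustrative and V is default-constructed for
// brevity; the real cache instead constructs the FFTConfig from the key.

#include <cstddef>
#include <list>
#include <unordered_map>
#include <utility>

template <typename K, typename V>
class LruMap {
 public:
  explicit LruMap(std::size_t max_size) : max_size_(max_size) {}

  V& lookup_or_create(const K& key) {
    auto it = index_.find(key);
    if (it != index_.end()) {
      // Hit: splice the node to the front; list iterators stay valid.
      items_.splice(items_.begin(), items_, it->second);
      return it->second->second;
    }
    // Miss: evict the least recently used entry when the cache is full.
    if (!items_.empty() && items_.size() >= max_size_) {
      index_.erase(items_.back().first);
      items_.pop_back();
    }
    items_.emplace_front(key, V{});
    index_[key] = items_.begin();
    return items_.front().second;
  }

 private:
  std::list<std::pair<K, V>> items_;
  std::unordered_map<K, typename std::list<std::pair<K, V>>::iterator> index_;
  std::size_t max_size_;
};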
+ +#pragma once +#include +#include +#include "paddle/phi/backends/dynload/mklrt.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/kernels/funcs/fft.h" + +namespace phi { +namespace funcs { +namespace detail { + +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + phi::errors::External(phi::dynload::DftiErrorMessage(status))); \ + } while (0); + +struct DftiDescriptorDeleter { + void operator()(DFTI_DESCRIPTOR_HANDLE handle) { + if (handle != nullptr) { + MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); + } + } +}; + +// A RAII wrapper for MKL_DESCRIPTOR* +class DftiDescriptor { + public: + void init(DFTI_CONFIG_VALUE precision, + DFTI_CONFIG_VALUE signal_type, + MKL_LONG signal_ndim, + MKL_LONG* sizes) { + PADDLE_ENFORCE_EQ(desc_.get(), + nullptr, + phi::errors::AlreadyExists( + "DftiDescriptor has already been initialized.")); + + DFTI_DESCRIPTOR* raw_desc; + MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( + &raw_desc, precision, signal_type, signal_ndim, sizes)); + desc_.reset(raw_desc); + } + + DFTI_DESCRIPTOR* get() const { + DFTI_DESCRIPTOR* raw_desc = desc_.get(); + PADDLE_ENFORCE_NOT_NULL(raw_desc, + phi::errors::PreconditionNotMet( + "DFTI DESCRIPTOR has not been initialized.")); + return raw_desc; + } + + private: + std::unique_ptr desc_; +}; + +static DftiDescriptor plan_mkl_fft(const DataType in_dtype, + const DataType out_dtype, + const phi::DDim& in_strides, + const phi::DDim& out_strides, + const std::vector& signal_sizes, + FFTNormMode normalization, + bool forward) { + const DFTI_CONFIG_VALUE precision = [&] { + switch (in_dtype) { + case DataType::FLOAT32: + return DFTI_SINGLE; + case DataType::COMPLEX64: + return DFTI_SINGLE; + case DataType::FLOAT64: + return DFTI_DOUBLE; + case DataType::COMPLEX128: + return DFTI_DOUBLE; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Invalid input datatype (%s), input data type should be FP32, " + "FP64, COMPLEX64 or COMPLEX128.", + in_dtype)); + } + }(); + + // C2C, R2C, C2R + const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); + const DFTI_CONFIG_VALUE domain = + (fft_type == FFTTransformType::C2C) ? 
DFTI_COMPLEX : DFTI_REAL; + + DftiDescriptor descriptor; + std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); + const MKL_LONG signal_ndim = fft_sizes.size() - 1; + descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); + + // placement inplace or not inplace + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + + // number of transformations + const MKL_LONG batch_size = fft_sizes[0]; + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); + + // input & output distance + const MKL_LONG idist = in_strides[0]; + const MKL_LONG odist = out_strides[0]; + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); + + // input & output stride + std::vector mkl_in_stride(1 + signal_ndim, 0); + std::vector mkl_out_stride(1 + signal_ndim, 0); + for (MKL_LONG i = 1; i <= signal_ndim; i++) { + mkl_in_stride[i] = in_strides[i]; + mkl_out_stride[i] = out_strides[i]; + } + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); + + // conjugate even storage + if (!(fft_type == FFTTransformType::C2C)) { + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); + } + + MKL_LONG signal_numel = std::accumulate(fft_sizes.cbegin() + 1, + fft_sizes.cend(), + 1UL, + std::multiplies()); + if (normalization != FFTNormMode::none) { + const double scale = + ((normalization == FFTNormMode::by_sqrt_n) + ? 1.0 / std::sqrt(static_cast(signal_numel)) + : 1.0 / static_cast(signal_numel)); + const auto scale_direction = [&]() { + if (fft_type == FFTTransformType::R2C || + (fft_type == FFTTransformType::C2C && forward)) { + return DFTI_FORWARD_SCALE; + } else { + // (fft_type == FFTTransformType::C2R || + // (fft_type == FFTTransformType::C2C && !forward)) + return DFTI_BACKWARD_SCALE; + } + }(); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); + } + + // commit the descriptor + MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); + return descriptor; +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/fft_grad_kernel.cu b/paddle/phi/kernels/gpu/fft_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..69a95cffc3ee0e5ba71c4d96f35ad1b91ae6ccf6 --- /dev/null +++ b/paddle/phi/kernels/gpu/fft_grad_kernel.cu @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
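// get_norm_from_string in fft.h above, together with the DFTI scale chosen in
// plan_mkl_fft, implements the usual FFT normalization conventions:
// "backward" scales only the inverse transform by 1/n, "forward" scales only
// the forward transform by 1/n, and "ortho" scales both directions by
// 1/sqrt(n). A compact standalone restatement of that mapping (names here are
// illustrative, not Paddle APIs):

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <string>

enum class NormMode { none, by_sqrt_n, by_n };

NormMode norm_from_string(const std::string& norm, bool forward) {
  if (norm.empty() || norm == "backward")
    return forward ? NormMode::none : NormMode::by_n;
  if (norm == "forward")
    return forward ? NormMode::by_n : NormMode::none;
  if (norm == "ortho") return NormMode::by_sqrt_n;
  throw std::invalid_argument("norm must be 'forward', 'backward' or 'ortho'");
}

// Scale applied to the transformed result; n is the product of the
// transformed (signal) dimension sizes.
double norm_factor(NormMode mode, int64_t n) {
  switch (mode) {
    case NormMode::none:
      return 1.0;
    case NormMode::by_n:
      return 1.0 / static_cast<double>(n);
    case NormMode::by_sqrt_n:
      return 1.0 / std::sqrt(static_cast<double>(n));
  }
  return 1.0;
}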
+ +#include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c_grad, + GPU, + ALL_LAYOUT, + phi::FFTC2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL( + fft_c2r_grad, GPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} +PD_REGISTER_KERNEL(fft_r2c_grad, + GPU, + ALL_LAYOUT, + phi::FFTR2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/fft_kernel.cu b/paddle/phi/kernels/gpu/fft_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..aaa1ed0c225c600e94bacd352cc1bd17432d7c6c --- /dev/null +++ b/paddle/phi/kernels/gpu/fft_kernel.cu @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c, + GPU, + ALL_LAYOUT, + phi::FFTC2CKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_c2r, + GPU, + ALL_LAYOUT, + phi::FFTC2RKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_r2c, GPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index 6f96a697b2f2db6c2097640f34c30142939f80e0..1a574c05494fdaa14f3d8ca0f148d2f60c3964f5 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -74,4 +74,6 @@ PD_REGISTER_KERNEL(scale, int8_t, int16_t, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..02a8fbd14bca94efefddcf7ec593583d0c095971 --- /dev/null +++ b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/phi/kernels/fft_grad_kernel.h" + +#include +#include + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/pad_kernel.h" + +namespace phi { +template +void FFTC2CGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, out_grad, x_grad, axes, norm_type, !forward); +} + +template +void FFTR2CGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* x_grad) { + using R = typename T::value_type; + DenseTensor complex_x_grad = EmptyLike(ctx, x); + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + + if (!onesided) { + fft_c2c_func(ctx, out_grad, &complex_x_grad, axes, norm_type, !forward); + } else { + DenseTensor full_dy; + DenseTensorMeta full_dy_meta(out_grad.type(), x_grad->dims()); + full_dy.set_meta(full_dy_meta); + auto zero_length = static_cast(full_dy.dims().at(axes.back()) - + out_grad.dims().at(axes.back())); + auto rank = out_grad.dims().size(); + std::vector pads(rank * 2, 0); + pads[axes.back() * 2 + 1] = zero_length; + PadKernel(ctx, out_grad, pads, static_cast(0.0), &full_dy); + fft_c2c_func(ctx, full_dy, &complex_x_grad, axes, norm_type, !forward); + } + RealKernel(ctx, complex_x_grad, x_grad); +} + +template +void FFTC2RGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* x_grad) { + using C = phi::dtype::complex; + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + + funcs::FFTR2CFunctor fft_r2c_func; + fft_r2c_func(ctx, out_grad, x_grad, axes, norm_type, !forward); + + const int64_t double_length = + out_grad.dims()[axes.back()] - x_grad->dims()[axes.back()]; + const phi::DDim strides = phi::stride(x_grad->dims()); + +#if defined(__NVCC__) || defined(__HIPCC__) + const thrust::device_vector strides_g(phi::vectorize(strides)); + const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data()); +#else + const int64_t* pstrides = strides.Get(); +#endif + + funcs::FFTFillConjGradFunctor func( + x_grad->data(), axes.back(), pstrides, double_length); + size_t limit = x_grad->numel(); + funcs::ForRange for_range(ctx, limit); + for_range(func); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/fft_kernel_impl.h b/paddle/phi/kernels/impl/fft_kernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..d441093db07c9802e139eba4048da3d01d07ff09 --- /dev/null +++ b/paddle/phi/kernels/impl/fft_kernel_impl.h @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/kernels/fft_kernel.h" + +#include +#include + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" + +namespace phi { +template +void FFTC2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* out) { + ctx.template Alloc(out); + const auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, x, out, axes, norm_type, forward); +} + +template +void FFTC2RKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* out) { + using R = typename T::value_type; // get real type + ctx.template Alloc(out); + const auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2RFunctor fft_c2r_func; + fft_c2r_func(ctx, x, out, axes, norm_type, forward); +} + +template +void FFTR2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* out) { + using C = phi::dtype::complex; + ctx.template Alloc(out); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTR2CFunctor fft_r2c_func; + + if (onesided) { + fft_r2c_func(ctx, x, out, axes, norm_type, forward); + } else { + phi::DDim onesided_out_shape = x.dims(); + const int64_t last_fft_axis = axes.back(); + const int64_t onesided_last_axis_size = + out->dims().at(last_fft_axis) / 2 + 1; + onesided_out_shape[last_fft_axis] = onesided_last_axis_size; + DenseTensor onesided_out = + Empty(ctx, phi::vectorize(onesided_out_shape)); + fft_r2c_func(ctx, x, &onesided_out, axes, norm_type, forward); + funcs::FFTFillConj(ctx, &onesided_out, out, axes); + } +} +} // namespace phi diff --git a/python/paddle/fft.py b/python/paddle/fft.py index f44111cb766187a726dd0c94e4b1890049d6aebb..4b6a93edc447b9a4cdc758135dec53eb2bbdba44 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -17,7 +17,7 @@ import numpy as np import paddle from .tensor.attribute import is_complex, is_floating_point, is_integer from .tensor.creation import _real_to_complex_dtype, _complex_to_real_dtype -from .fluid.framework import _non_static_mode +from .fluid.framework import _in_legacy_dygraph, in_dygraph_mode from . 
import _C_ops from .fluid.data_feeder import check_variable_and_dtype from .fluid.layer_helper import LayerHelper @@ -1392,7 +1392,9 @@ def fft_c2c(x, n, axis, norm, forward, name): op_type = 'fft_c2c' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_c2c(x, axes, norm, forward) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_C_ops, op_type)(x, *attrs) else: @@ -1426,7 +1428,9 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): op_type = 'fft_r2c' check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'onesided', onesided) out = getattr(_C_ops, op_type)(x, *attrs) @@ -1469,7 +1473,12 @@ def fft_c2r(x, n, axis, norm, forward, name): op_type = 'fft_c2r' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + if n is not None: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, n) + else: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, 0) + elif _in_legacy_dygraph(): if n is not None: attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'last_dim_size', n) @@ -1528,7 +1537,9 @@ def fftn_c2c(x, s, axes, norm, forward, name): op_type = 'fft_c2c' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_c2c(x, axes, norm, forward) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_C_ops, op_type)(x, *attrs) else: @@ -1579,7 +1590,9 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): op_type = 'fft_r2c' check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'onesided', onesided) out = getattr(_C_ops, op_type)(x, *attrs) @@ -1642,7 +1655,12 @@ def fftn_c2r(x, s, axes, norm, forward, name): op_type = 'fft_c2r' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + if s is not None: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, s[-1]) + else: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, 0) + elif _in_legacy_dygraph(): if s: attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'last_dim_size', s[-1]) diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py index 3c48c99af34b585d481e3783c3364986873df8a7..756dd7889977b4f6a243d541ae8b76486ea41cf8 100644 --- a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py +++ b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py @@ -12,20 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
index 3c48c99af34b585d481e3783c3364986873df8a7..756dd7889977b4f6a243d541ae8b76486ea41cf8 100644
--- a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
+++ b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py
@@ -12,20 +12,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import enum
 import numpy as np
 from functools import partial
 from numpy import asarray
 from numpy.fft._pocketfft import _raw_fft, _raw_fftnd, _get_forward_norm, _get_backward_norm, _cook_nd_args
 
 
+class NormMode(enum.Enum):
+    none = 1
+    by_sqrt_n = 2
+    by_n = 3
+
+
+def _get_norm_mode(norm, forward):
+    if norm == "ortho":
+        return NormMode.by_sqrt_n
+    if norm is None or norm == "backward":
+        return NormMode.none if forward else NormMode.by_n
+    return NormMode.by_n if forward else NormMode.none
+
+
+def _get_inv_norm(n, norm_mode):
+    assert isinstance(norm_mode,
+                      NormMode), "invalid norm_type {}".format(norm_mode)
+    if norm_mode == NormMode.none:
+        return 1.0
+    if norm_mode == NormMode.by_sqrt_n:
+        return np.sqrt(n)
+    return n
+
+
+# 1d transforms
 def _fftc2c(a, n=None, axis=-1, norm=None, forward=None):
     a = asarray(a)
     if n is None:
         n = a.shape[axis]
-    if forward:
-        inv_norm = _get_forward_norm(n, norm)
-    else:
-        inv_norm = _get_backward_norm(n, norm)
+    inv_norm = _get_inv_norm(n, norm)
     output = _raw_fft(a, n, axis, False, forward, inv_norm)
     return output
 
@@ -34,10 +57,7 @@ def _fftr2c(a, n=None, axis=-1, norm=None, forward=None):
     a = asarray(a)
     if n is None:
         n = a.shape[axis]
-    if forward:
-        inv_norm = _get_forward_norm(n, norm)
-    else:
-        inv_norm = _get_backward_norm(n, norm)
+    inv_norm = _get_inv_norm(n, norm)
     output = _raw_fft(a, n, axis, True, True, inv_norm)
     if not forward:
         output = output.conj()
@@ -48,43 +68,67 @@ def _fftc2r(a, n=None, axis=-1, norm=None, forward=None):
     a = asarray(a)
     if n is None:
         n = (a.shape[axis] - 1) * 2
-    if forward:
-        inv_norm = _get_forward_norm(n, norm)
-    else:
-        inv_norm = _get_backward_norm(n, norm)
+    inv_norm = _get_inv_norm(n, norm)
     output = _raw_fft(a.conj() if forward else a, n, axis, True, False,
                       inv_norm)
     return output
 
 
-def fft_c2c(x, axes, normalization, forward):
+# general fft functors
+def _fft_c2c_nd(x, axes, norm_mode, forward):
     f = partial(_fftc2c, forward=forward)
-    y = _raw_fftnd(x, s=None, axes=axes, function=f, norm=normalization)
+    y = _raw_fftnd(x, s=None, axes=axes, function=f, norm=norm_mode)
     return y
 
 
-def fft_c2c_backward(dy, axes, normalization, forward):
-    f = partial(_fftc2c, forward=forward)
-    dx = _raw_fftnd(dy, s=None, axes=axes, function=f, norm=normalization)
-    return dx
-
-
-def fft_r2c(x, axes, normalization, forward, onesided):
+def _fft_r2c_nd(x, axes, norm_mode, forward, onesided):
     a = asarray(x)
     s, axes = _cook_nd_args(a, axes=axes)
     if onesided:
-        a = _fftr2c(a, s[-1], axes[-1], normalization, forward)
-        for ii in range(len(axes) - 1):
-            a = _fftc2c(a, s[ii], axes[ii], normalization, forward)
+        a = _fftr2c(a, s[-1], axes[-1], norm_mode, forward)
+        a = _fft_c2c_nd(a, axes[:-1], norm_mode, forward)
     else:
-        a = fft_c2c(x, axes, normalization, forward)
+        a = _fft_c2c_nd(x, axes, norm_mode, forward)
+    return a
+
+
+def _fft_c2r_nd(x, axes, norm_mode, forward, last_dim_size):
+    a = asarray(x)
+    s, axes = _cook_nd_args(a, axes=axes, invreal=1)
+    if last_dim_size is not None:
+        s[-1] = last_dim_size
+    a = _fft_c2c_nd(a, axes[:-1], norm_mode, forward)
+    a = _fftc2r(a, s[-1], axes[-1], norm_mode, forward)
     return a
 
 
-def fft_r2c_backward(dy, x, axes, normalization, forward, onesided):
+# kernels
+def fft_c2c(x, axes, normalization, forward):
+    norm_mode = _get_norm_mode(normalization, forward)
+    return _fft_c2c_nd(x, axes, norm_mode, forward)
+
+
+def fft_c2r(x, axes, normalization, forward, last_dim_size):
+    norm_mode = _get_norm_mode(normalization, forward)
+    return _fft_c2r_nd(x, axes, norm_mode, forward, last_dim_size)
+
+
+def fft_r2c(x, axes, normalization, forward, onesided):
+    norm_mode = _get_norm_mode(normalization, forward)
+    return _fft_r2c_nd(x, axes, norm_mode, forward, onesided)
+
+
+# backward kernel
+def fft_c2c_backward(dy, axes, normalization, forward):
+    norm_mode = _get_norm_mode(normalization, forward)
+    dx = _fft_c2c_nd(dy, axes, norm_mode, not forward)
+    return dx
+
+
+def fft_r2c_backward(x, dy, axes, normalization, forward, onesided):
     a = dy
     if not onesided:
-        a = fft_c2c_backward(a, axes, normalization, forward).real
+        a = fft_c2c_backward(a, axes, normalization, forward)
     else:
         pad_widths = [(0, 0)] * a.ndim
         last_axis = axes[-1]
@@ -93,16 +137,25 @@ def fft_r2c_backward(dy, x, axes, normalization, forward, onesided):
         last_dim_size = a.shape[last_axis]
         pad_widths[last_axis] = (0, x.shape[last_axis] - last_dim_size)
         a = np.pad(a, pad_width=pad_widths)
-        a = fft_c2c_backward(a, axes, normalization, forward).real
-    return a
+        a = fft_c2c_backward(a, axes, normalization, forward)
+    return a.real
 
 
-def fft_c2r(x, axes, normalization, forward, last_dim_size):
-    a = asarray(x)
-    s, axes = _cook_nd_args(a, axes=axes, invreal=1)
-    if last_dim_size is not None:
-        s[-1] = last_dim_size
-    for ii in range(len(axes) - 1):
-        a = _fftc2c(a, s[ii], axes[ii], normalization, forward)
-    a = _fftc2r(a, s[-1], axes[-1], normalization, forward)
+def _fft_fill_conj_grad(x, axes, length_to_double):
+    last_fft_axis = axes[-1]
+    shape = x.shape
+    for multi_index in np.ndindex(*shape):
+        if 0 < multi_index[last_fft_axis] and multi_index[
+                last_fft_axis] <= length_to_double:
+            x[multi_index] *= 2
+    return x
+
+
+def fft_c2r_backward(x, dy, axes, normalization, forward, last_dim_size):
+    norm_mode = _get_norm_mode(normalization, forward)
+    a = dy
+    a = _fft_r2c_nd(dy, axes, norm_mode, not forward, True)
+    last_fft_axis = axes[-1]
+    length_to_double = dy.shape[last_fft_axis] - x.shape[last_fft_axis]
+    a = _fft_fill_conj_grad(a, axes, length_to_double)
     return a
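NormMode above records which direction of the transform absorbs the scaling (none, 1/sqrt(n), or 1/n), and _get_norm_mode maps the user-facing strings onto it per direction. The same convention can be checked against numpy's named norm modes (assumes numpy >= 1.20, which accepts 'forward' and 'backward'):

import numpy as np

x = np.random.randn(8) + 1j * np.random.randn(8)
n = x.shape[0]
raw = np.fft.fft(x, norm="backward")                                # forward pass, no scaling
assert np.allclose(np.fft.fft(x, norm="forward"), raw / n)          # by_n on the forward pass
assert np.allclose(np.fft.fft(x, norm="ortho"), raw / np.sqrt(n))   # by_sqrt_n on both passes
assert np.allclose(np.fft.ifft(raw, norm="backward"), x)            # by_n moves to the inverse pass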
diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py
index f386fdc9c34608c0909119f6cd1bddabb5f37e56..f7cc9fbf4a130f5f377568efcf0e43c385bf68af 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_fft.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py
@@ -473,7 +473,7 @@ class TestIrfft2(unittest.TestCase):
 @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [
     ('test_bool_input',
      (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
-         np.bool_), None, -1, 'backward', NotImplementedError),
+         np.bool_), None, -1, 'backward', RuntimeError),
     ('test_n_nagative',
      np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1,
      'backward', ValueError),
     ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1,
@@ -543,7 +543,7 @@ class TestIrfftException(unittest.TestCase):
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [('test_bool_input',
       (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
-          np.bool_), None, (-2, -1), 'backward', NotImplementedError),
+          np.bool_), None, (-2, -1), 'backward', RuntimeError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
@@ -625,7 +625,7 @@ class TestIrfft2Exception(unittest.TestCase):
     (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'),
     [('test_bool_input',
       (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype(
-          np.bool_), None, (-2, -1), 'backward', NotImplementedError),
+          np.bool_), None, (-2, -1), 'backward', RuntimeError),
      ('test_n_nagative',
       np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2),
       (-2, -1), 'backward', ValueError),
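The operator tests below replace the old no_need_check_grad shortcut with user_defined_grads computed by the numpy references above; for the C2C op the reference gradient is simply the transform run in the opposite direction under the same NormMode. A self-contained numpy sketch of the simplest case (norm='backward', forward=True), where that amounts to an unscaled inverse DFT; the names are illustrative only:

import numpy as np

dy = np.random.randn(8) + 1j * np.random.randn(8)    # incoming gradient w.r.t. Out
# NormMode.none plus the reversed direction is an inverse DFT without the usual 1/n factor
x_grad_ref = np.fft.ifft(dy) * dy.shape[0]
# equivalently, the conjugate of the forward DFT of the conjugated gradient
assert np.allclose(x_grad_ref, np.fft.fft(dy.conj()).conj())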
diff --git a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
index ba4092965920bc3f1863132a1adce85f4c75f3f0..066869750ed0742af7c529ce6a4b29d2d6aca012 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py
@@ -20,12 +20,13 @@ import paddle
 import re
 import sys
 
-from spectral_op_np import fft_c2c, fft_r2c, fft_c2r
+from spectral_op_np import fft_c2c, fft_r2c, fft_c2r, fft_c2c_backward, fft_r2c_backward, fft_c2r_backward
 import paddle.fluid.core as core
 import paddle.fluid.dygraph as dg
 import paddle.static as static
 from numpy.random import random as rand
 from paddle.fluid import Program, program_guard
+from paddle import _C_ops
 
 sys.path.append("../")
 from op_test import OpTest
@@ -73,14 +74,26 @@ def class_name(cls, num, params_dict):
     return "{}_{}{}".format(cls.__name__, num, suffix and "_" + suffix)
 
 
+def fft_c2c_python_api(x, axes, norm, forward):
+    return _C_ops.final_state_fft_c2c(x, axes, norm, forward)
+
+
+def fft_r2c_python_api(x, axes, norm, forward, onesided):
+    return _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided)
+
+
+def fft_c2r_python_api(x, axes, norm, forward, last_dim_size=0):
+    return _C_ops.final_state_fft_c2r(x, axes, norm, forward, last_dim_size)
+
+
 @parameterize(
     (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward'),
     [('test_axes_is_sqe_type', (np.random.random(
         (12, 14)) + 1j * np.random.random(
             (12, 14))).astype(np.complex128), [0, 1], 'forward', True),
      ('test_axis_not_last', (np.random.random(
-        (4, 4, 4)) + 1j * np.random.random(
-            (4, 4, 4))).astype(np.complex128), (0, 1), "backward", False),
+        (4, 8, 4)) + 1j * np.random.random(
+            (4, 8, 4))).astype(np.complex128), (0, 1), "backward", False),
      ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (0, ), "forward", False),
      ('test_norm_backward', (np.random.random((12, 14)) + 1j * np.random.random(
@@ -88,11 +101,11 @@
      ('test_norm_ortho', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (1, ), "ortho", True)])
 class TestFFTC2COp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check.
-    no_need_check_grad = True
 
     def setUp(self):
         self.op_type = "fft_c2c"
+        self.dtype = self.x.dtype
+        self.python_api = fft_c2c_python_api
 
         out = fft_c2c(self.x, self.axes, self.norm, self.forward)
 
@@ -104,8 +117,21 @@
         }
         self.outputs = {'Out': out}
 
+        self.out_grad = (np.random.random(self.x.shape) +
+                         1j * np.random.random(self.x.shape)).astype(
+                             self.x.dtype)
+        self.x_grad = fft_c2c_backward(self.out_grad, self.axes, self.norm,
+                                       self.forward)
+
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
+
+    def test_check_grad(self):
+        self.check_grad("X",
+                        "Out",
+                        user_defined_grads=[self.x_grad],
+                        user_defined_grad_outputs=[self.out_grad],
+                        check_eager=True)
 
 
 @parameterize(
@@ -114,7 +140,7 @@
     (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'last_dim_size'),
     [('test_axes_is_sqe_type', (np.random.random(
         (12, 14)) + 1j * np.random.random(
             (12, 14))).astype(np.complex128), [0, 1], 'forward', True, 26),
      ('test_axis_not_last', (np.random.random(
-        (4, 4, 4)) + 1j * np.random.random((4, 4, 4))).astype(np.complex128),
+        (4, 7, 4)) + 1j * np.random.random((4, 7, 4))).astype(np.complex128),
      (0, 1), "backward", False, None),
      ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (0, ), "forward", False, 22),
@@ -123,11 +149,11 @@
      ('test_norm_ortho', (np.random.random((12, 14)) + 1j * np.random.random(
          (12, 14))).astype(np.complex128), (1, ), "ortho", True, 26)])
 class TestFFTC2ROp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check.
-    no_need_check_grad = True
 
     def setUp(self):
         self.op_type = "fft_c2r"
+        self.dtype = self.x.dtype
+        self.python_api = fft_c2r_python_api
 
         out = fft_c2r(self.x, self.axes, self.norm, self.forward,
                       self.last_dim_size)
 
@@ -141,28 +167,40 @@
         }
         self.outputs = {'Out': out}
 
+        self.out_grad = np.random.random(out.shape).astype(out.dtype)
+        self.x_grad = fft_c2r_backward(self.x, self.out_grad, self.axes,
+                                       self.norm, self.forward,
+                                       self.last_dim_size)
+
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
+
+    def test_check_grad(self):
+        self.check_grad(["X"],
+                        "Out",
+                        user_defined_grads=[self.x_grad],
+                        user_defined_grad_outputs=[self.out_grad],
+                        check_eager=True)
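The C2R reference gradient used here (fft_c2r_backward above) runs the opposite-direction one-sided transform and then doubles the interior bins via _fft_fill_conj_grad, since each interior bin of a one-sided spectrum stands for a conjugate pair in the full spectrum. A standalone numpy check of that pairing for an even-length signal (illustration only, not the OpTest machinery):

import numpy as np

x = np.random.randn(8)
full = np.fft.fft(x)
half = np.fft.rfft(x)                         # bins 0 .. n//2
weights = np.ones(half.shape)
weights[1:-1] = 2.0                           # interior bins appear twice in the full spectrum
# DC and (for even n) the Nyquist bin appear once, so they keep weight 1
assert np.allclose(np.sum(np.abs(full) ** 2),
                   np.sum(weights * np.abs(half) ** 2))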
 
 
 @parameterize(
     (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'onesided'),
-    [('test_axes_is_sqe_type', np.random.randn(12, 14).astype(np.float64),
+    [('test_axes_is_sqe_type', np.random.randn(12, 18).astype(np.float64),
      (0, 1), 'forward', True, True),
-     ('test_axis_not_last', np.random.randn(4, 4, 4).astype(np.float64),
+     ('test_axis_not_last', np.random.randn(4, 8, 4).astype(np.float64),
      (0, 1), "backward", False, True),
-     ('test_norm_forward', np.random.randn(12, 14).astype(np.float64),
+     ('test_norm_forward', np.random.randn(12, 18).astype(np.float64),
      (0, 1), "forward", False, False),
-     ('test_norm_backward', np.random.randn(12, 14).astype(np.float64),
+     ('test_norm_backward', np.random.randn(12, 18).astype(np.float64),
      (0, ), "backward", True, False),
-     ('test_norm_ortho', np.random.randn(12, 14).astype(np.float64),
+     ('test_norm_ortho', np.random.randn(12, 18).astype(np.float64),
      (1, ), "ortho", True, False)])
 class TestFFTR2COp(OpTest):
-    # Because framwork not support complex numerial gradient, we skip gradient check.
-    no_need_check_grad = True
 
     def setUp(self):
         self.op_type = "fft_r2c"
+        self.dtype = self.x.dtype
+        self.python_api = fft_r2c_python_api
 
         out = fft_r2c(self.x, self.axes, self.norm, self.forward,
                       self.onesided)
 
@@ -175,5 +213,16 @@ class TestFFTR2COp(OpTest):
         }
         self.outputs = {'Out': out}
 
+        self.out_grad = np.random.random(out.shape).astype(out.dtype)
+        self.x_grad = fft_r2c_backward(self.x, self.out_grad, self.axes,
+                                       self.norm, self.forward, self.onesided)
+
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
+
+    def test_check_grad(self):
+        self.check_grad("X",
+                        "Out",
+                        user_defined_grads=[self.x_grad],
+                        user_defined_grad_outputs=[self.out_grad],
+                        check_eager=True)
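For the onesided cases exercised by TestFFTR2COp, fft_r2c_backward zero-pads the output gradient back to the full last-axis length, applies the C2C backward transform, and keeps the real part. A numpy sketch of that recipe for the simplest normalization (norm='backward', forward=True); the variable names are illustrative and not part of the test:

import numpy as np

x = np.random.randn(8)                              # forward input (real)
dy = np.random.randn(5) + 1j * np.random.randn(5)   # grad w.r.t. the one-sided output
padded = np.pad(dy, (0, x.shape[0] - dy.shape[0]))  # restore the dropped bins with zeros
# opposite-direction transform with no scaling, then keep the real part
x_grad_ref = (np.fft.ifft(padded) * padded.shape[0]).real
assert x_grad_ref.shape == x.shape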