From 8f45d14263f7ec3fe6aa3ebba58c9f416da08c6e Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 1 Dec 2020 09:58:30 +0800 Subject: [PATCH] =?UTF-8?q?add=20complex64=20and=20complex128=20type;=20ad?= =?UTF-8?q?d=20+-*/@=20and=20slice=20opreator=20for=20c=E2=80=A6=20(#29199?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add complex64 and complex128 type; add +-*/@ and slice opreator for complex types * add test cases for complex elementwise, matmul and getitem unittest * add test cases for complex types * add test cases for complex matmul unittest --- paddle/fluid/framework/data_type.h | 42 +- .../framework/details/nan_inf_utils_detail.cc | 35 ++ paddle/fluid/framework/framework.proto | 2 + paddle/fluid/framework/tensor_util.cc | 36 ++ .../elementwise/elementwise_add_op.cc | 26 +- .../elementwise/elementwise_add_op.cu | 19 +- .../elementwise/elementwise_div_op.cc | 20 +- .../elementwise/elementwise_div_op.cu | 19 +- .../elementwise/elementwise_mul_op.cc | 20 +- .../elementwise/elementwise_mul_op.cu | 15 +- .../elementwise/elementwise_sub_op.cc | 20 +- .../elementwise/elementwise_sub_op.cu | 19 +- paddle/fluid/operators/math/blas_impl.cu.h | 272 ++++++++ paddle/fluid/operators/math/blas_impl.h | 337 +++++++++- .../fluid/operators/math/concat_and_split.h | 24 +- paddle/fluid/operators/math/math_function.cc | 34 +- paddle/fluid/operators/math/math_function.cu | 26 +- paddle/fluid/operators/matmul_v2_op.cc | 12 +- paddle/fluid/operators/matmul_v2_op.cu | 8 +- paddle/fluid/operators/slice_op.cc | 12 +- paddle/fluid/operators/slice_op.cu | 9 +- paddle/fluid/operators/strided_slice_op.cc | 12 +- paddle/fluid/operators/strided_slice_op.cu | 14 +- paddle/fluid/platform/complex128.h | 579 +++++++++++++++++ paddle/fluid/platform/complex64.h | 582 ++++++++++++++++++ paddle/fluid/platform/cuda_device_function.h | 45 ++ paddle/fluid/platform/dynload/cublas.h | 4 + paddle/fluid/platform/dynload/mklml.h | 10 + paddle/fluid/pybind/protobuf.cc | 2 + paddle/fluid/pybind/tensor_py.h | 56 ++ python/paddle/fluid/data_feeder.py | 4 + python/paddle/fluid/framework.py | 4 + .../test_complex_elementwise_layers.py | 64 ++ .../tests/unittests/test_complex_getitem.py | 71 +++ .../tests/unittests/test_complex_matmul.py | 86 +++ .../tests/unittests/test_complex_variable.py | 17 + 36 files changed, 2468 insertions(+), 89 deletions(-) create mode 100644 paddle/fluid/platform/complex128.h create mode 100644 paddle/fluid/platform/complex64.h diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index be263c9fc5..d3cc0ac4e7 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -25,6 +27,8 @@ namespace paddle { namespace platform { struct bfloat16; struct float16; +struct complex64; +struct complex128; } // namespace platform } // namespace paddle @@ -45,23 +49,27 @@ struct DataTypeTrait { #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); -#define _ForEachDataType_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, bool, BOOL); \ - _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ - _ForEachDataTypeHelper_(callback, int16_t, INT16); \ - _ForEachDataTypeHelper_(callback, int8_t, INT8) - -#define _ForEachDataTypeSmall_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); + +#define _ForEachDataTypeSmall_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); // For the use of thrust, as index-type elements can be only integers. 
#define _ForEachDataTypeTiny_(callback) \ diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 0ad84f5890..ceb358b47a 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -169,6 +169,10 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) #pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ omp_in) +#pragma omp declare reduction(+ : paddle::platform::complex64 : omp_out += \ + omp_in) +#pragma omp declare reduction(+ : paddle::platform::complex128 : omp_out += \ + omp_in) #endif template @@ -222,6 +226,37 @@ void CheckNanInf( PrintNanInf(value, numel, print_num, op_type, var_name); } } + +template <> +void CheckNanInf( + const paddle::platform::complex64* value, const size_t numel, int print_num, + const std::string& op_type, const std::string& var_name) { + paddle::platform::complex64 sum(0.0, 0.0); +#pragma omp parallel for reduction(+ : sum) + for (size_t i = 0; i < numel; ++i) { + sum += (value[i] - value[i]); + } + + if (std::isnan(sum) || std::isinf(sum)) { + PrintNanInf(value, numel, print_num, op_type, var_name); + } +} + +template <> +void CheckNanInf( + const paddle::platform::complex128* value, const size_t numel, + int print_num, const std::string& op_type, const std::string& var_name) { + paddle::platform::complex128 sum(0.0, 0.0); +#pragma omp parallel for reduction(+ : sum) + for (size_t i = 0; i < numel; ++i) { + sum += (value[i] - value[i]); + } + + if (std::isnan(sum) || std::isinf(sum)) { + PrintNanInf(value, numel, print_num, op_type, var_name); + } +} + #endif template <> diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index c33d71b3b0..baaecb55d0 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -116,6 +116,8 @@ message VarType { UINT8 = 20; INT8 = 21; BF16 = 22; + COMPLEX64 = 23; + COMPLEX128 = 24; // Other types that may need additional descriptions LOD_TENSOR = 7; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 4730f6a4ec..5e38309dfe 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -22,6 +22,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -990,6 +992,40 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { return os; } +template <> +std::ostream& print_tensor( + std::ostream& os, const framework::Tensor& tensor) { + auto inspect = tensor.data(); + auto element_num = tensor.numel(); + + os << " - data: ["; + if (element_num > 0) { + os << signed(inspect[0].real) << signed(inspect[0].imag) << "j"; + for (int j = 1; j < element_num; ++j) { + os << signed(inspect[j].real) << signed(inspect[j].imag) << "j"; + } + } + os << "]"; + return os; +} + +template <> +std::ostream& print_tensor( + std::ostream& os, const framework::Tensor& tensor) { + auto inspect = tensor.data(); + auto element_num = tensor.numel(); + + os << " - data: ["; + if (element_num > 0) { + os << signed(inspect[0].real) << signed(inspect[0].imag) << "j"; + for (int j = 1; j < element_num; ++j) { + os << signed(inspect[j].real) << signed(inspect[j].imag) << "j"; + } + } + os << "]"; + return os; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t) { os << " - place: " << t.place() << "\n"; os << " - shape: [" << t.dims() << "]\n"; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 68a98e7c6b..9885e9c095 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -19,6 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" namespace paddle { namespace framework { @@ -128,13 +130,21 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); REGISTER_OP_CPU_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel); + int64_t>, + ops::ElementwiseAddDoubleGradKernel, + ops::ElementwiseAddDoubleGradKernel); // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. @@ -159,4 +173,8 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index a4cbd14388..8de6416065 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -95,26 +97,35 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel); + plat::complex64>, + ops::ElementwiseAddDoubleGradKernel); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 49d7158f4c..f14aee8e49 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -16,6 +16,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" namespace paddle { namespace operators { @@ -130,13 +132,21 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); REGISTER_OP_CPU_KERNEL( elementwise_div_grad_grad, @@ -147,4 +157,8 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel); + int64_t>, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index e31722a288..df5a2115c3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -102,7 +104,11 @@ REGISTER_OP_CUDA_KERNEL( paddle::platform::float16>, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -110,8 +116,11 @@ REGISTER_OP_CUDA_KERNEL( paddle::platform::float16>, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel); + paddle::platform::complex128>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad_grad, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel); + int64_t>, + ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 289ea6c226..28b131e729 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -16,6 +16,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" namespace paddle { namespace operators { @@ -130,13 +132,21 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel); + int64_t>, + ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 8533189f81..b3b4b05449 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -100,19 +102,26 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel); + plat::complex64>, + ops::ElementwiseMulDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 90f4ebb99e..d72eacbfd4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -17,6 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" namespace paddle { namespace framework { @@ -125,13 +127,21 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel); + int64_t>, + ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 9913927ee3..1996cc471a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -99,7 +101,11 @@ REGISTER_OP_CUDA_KERNEL( paddle::platform::float16>, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -107,8 +113,11 @@ REGISTER_OP_CUDA_KERNEL( paddle::platform::float16>, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel); + paddle::platform::complex128>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel); + int64_t>, + ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel); diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index aeafe22235..53e07d2ba4 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -16,6 +16,7 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" + #include "paddle/fluid/platform/gpu_info.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -258,6 +259,180 @@ struct CUBlas { } }; +template <> +struct CUBlas { + using complex64 = platform::complex64; + + static void GEMV(cublasHandle_t handle, cublasOperation_t transa, int m, + int n, const complex64 *alpha, const complex64 *A, int lda, + const complex64 *B, int ldb, const complex64 *beta, + complex64 *C, int ldc) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemv( + handle, transa, m, n, reinterpret_cast(alpha), + reinterpret_cast(A), lda, + reinterpret_cast(B), ldb, + reinterpret_cast(beta), + reinterpret_cast(C), ldc)); + } + + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const complex64 *alpha, const complex64 *A, + int lda, long long int strideA, // NOLINT + const complex64 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const complex64 *beta, complex64 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { +#if CUDA_VERSION >= 8000 + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemmStridedBatched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, batchCount)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CgemmStridedBatched is not supported on cuda <= 7.5")); +#endif + } + + static void GEMM(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const complex64 *alpha, const complex64 *A, int lda, + const complex64 *B, int ldb, const complex64 *beta, + complex64 *C, int ldc) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemm( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, + reinterpret_cast(B), ldb, + reinterpret_cast(beta), + reinterpret_cast(C), ldc)); + } + + // 
NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const void *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const void *beta, void *C, + cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } +}; + +template <> +struct CUBlas { + using complex128 = platform::complex128; + + static void GEMV(cublasHandle_t handle, cublasOperation_t transa, int m, + int n, const complex128 *alpha, const complex128 *A, int lda, + const complex128 *B, int ldb, const complex128 *beta, + complex128 *C, int ldc) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemv( + handle, transa, m, n, reinterpret_cast(alpha), + reinterpret_cast(A), lda, + reinterpret_cast(B), ldb, + reinterpret_cast(beta), + reinterpret_cast(C), ldc)); + } + + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const complex128 *alpha, const complex128 *A, + int lda, long long int strideA, // NOLINT + const complex128 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const complex128 *beta, complex128 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { +#if CUDA_VERSION >= 8000 + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemmStridedBatched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, batchCount)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CgemmStridedBatched is not supported on cuda <= 7.5")); +#endif + } + + static void GEMM(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const complex128 *alpha, const complex128 *A, int lda, + const complex128 *B, int ldb, const complex128 *beta, + complex128 *C, int ldc) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemm( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, + reinterpret_cast(B), ldb, + reinterpret_cast(beta), + reinterpret_cast(C), ldc)); + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
+ // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const void *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const void *beta, void *C, + cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, + beta, C, Ctype, ldc, computeType, algo)); + }); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); +#endif + } +}; + template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -338,6 +513,103 @@ inline void Blas::GEMM( #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::complex64 alpha, const platform::complex64 *A, + const platform::complex64 *B, platform::complex64 beta, + platform::complex64 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex64 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = thrust::complex(beta.real, beta.imag); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A, + CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, &c_beta, + h_C, N); + }); +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::complex128 alpha, const platform::complex128 *A, + const platform::complex128 *B, platform::complex128 beta, + platform::complex128 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. 
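+  // In row-major (cblas) terms this computes C(MxN) = op(A)(MxK) * op(B)(KxN).
+  // cuBLAS reads column-major buffers, and a row-major matrix is the
+  // transpose of the same buffer viewed column-major, so the cuBLAS call
+  // below passes B before A with swapped extents (N, M, K) to compute
+  // C^T = B^T * A^T; lda/ldb are the row-major leading dimensions (number
+  // of columns) of A and B, and the output leading dimension is N.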
+ int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas complex128 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + + thrust::complex c_alpha = + thrust::complex(alpha.real, alpha.imag); + thrust::complex c_beta = + thrust::complex(beta.real, beta.imag); + +#if CUDA_VERSION >= 8000 + // cublasHgemm does true FP16 computation which is slow for non-Volta + // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: + // input/output in fp16, computation in fp32, which can also be accelerated + // using tensor cores in volta GPUs. + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A, + CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F); +#else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, &c_beta, + h_C, N); + }); +#endif // CUDA_VERSION >= 8000 +} + template <> template void Blas::GEMM(bool transA, bool transB, int M, diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index c53c453897..32aced7619 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,11 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once +#ifdef PADDLE_WITH_MKLML +#include +#endif #include #include #include #include + #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" namespace paddle { namespace operators { @@ -287,6 +293,246 @@ struct CBlas { } }; +template <> +struct CBlas { + template + static void VCOPY(ARGS... args) { + platform::dynload::cblas_ccopy(args...); + } + + // the libmklml_intel.so paddle used has no vcAdd, vcSub, + // vcMul, vcDiv apis before rebuild from source + // so replace with the raw operator methods + /* + template + static void VADD(ARGS... args) { + platform::dynload::vcAdd(args...); + } + + template + static void VSUB(ARGS... args) { + platform::dynload::vcSub(args...); + } + + template + static void VMUL(ARGS... args) { + platform::dynload::vcMul(args...); + } + + template + static void VDIV(ARGS... 
args) { + platform::dynload::vcDiv(args...); + } + */ + + template + static void VADD(int n, const paddle::platform::complex64 *a, + const paddle::platform::complex64 *b, + paddle::platform::complex64 *y) { + for (int i = 0; i < n; ++i) { + y[i] = a[i] + b[i]; + } + } + + template + static void VSUB(int n, const paddle::platform::complex64 *a, + const paddle::platform::complex64 *b, + paddle::platform::complex64 *y) { + for (int i = 0; i < n; ++i) { + y[i] = a[i] - b[i]; + } + } + + template + static void VMUL(int n, const paddle::platform::complex64 *a, + const paddle::platform::complex64 *b, + paddle::platform::complex64 *y) { + for (int i = 0; i < n; ++i) { + y[i] = a[i] * b[i]; + } + } + template + static void VDIV(int n, const paddle::platform::complex64 *a, + const paddle::platform::complex64 *b, + paddle::platform::complex64 *y) { + for (int i = 0; i < n; ++i) { + y[i] = a[i] / b[i]; + } + } + + template + static void GEMV(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int M, int N, + paddle::platform::complex64 alpha, + const paddle::platform::complex64 *A, int lda, + const paddle::platform::complex64 *X, int incx, + paddle::platform::complex64 beta, + paddle::platform::complex64 *Y, int incy) { + const void *a_ = (const void *)(A); + const void *x_ = (const void *)(X); + void *y_ = static_cast(Y); + platform::dynload::cblas_cgemv(layout, trans, M, N, &alpha, a_, lda, x_, + incx, &beta, y_, incy); + } + + template + static void GEMM(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans_a, + CBLAS_TRANSPOSE trans_b, int M, int N, int K, + paddle::platform::complex64 alpha, + const paddle::platform::complex64 *A, int lda, + const paddle::platform::complex64 *B, int ldb, + paddle::platform::complex64 beta, + paddle::platform::complex64 *C, int ldc) { + const void *a_ = (const void *)(A); + const void *b_ = (const void *)(B); + void *c_ = static_cast(C); + platform::dynload::cblas_cgemm(layout, trans_a, trans_b, M, N, K, &alpha, + a_, lda, b_, ldb, &beta, c_, ldc); + } + + template + static void GEMM_BATCH(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE *trans_a, + CBLAS_TRANSPOSE *trans_b, int *M, int *N, int *K, + paddle::platform::complex64 *alpha, + const paddle::platform::complex64 **A, const int *lda, + const paddle::platform::complex64 **B, const int *ldb, + paddle::platform::complex64 *beta, + paddle::platform::complex64 **C, const int *ldc, + int group_count, int *group_size) { + const void **A_void = (const void **)(&(*A)); + const void **B_void = (const void **)(&(*B)); + void **C_void = reinterpret_cast(C); + + platform::dynload::cblas_cgemm_batch(layout, trans_a, trans_b, M, N, K, + alpha, A_void, lda, B_void, ldb, beta, + C_void, ldc, group_count, group_size); + } + + template + static void GEMM_EX(ARGS... args) { + platform::dynload::cblas_cgemm_batch(args...); + } +}; + +template <> +struct CBlas { + template + static void VCOPY(ARGS... args) { + platform::dynload::cblas_zcopy(args...); + } + + // the libmklml_intel.so paddle used has no vzAdd, vzSub, + // vzMul, vzDiv apis before rebuild from source + // so replace with the raw operator methods + /* + template + static void VADD(ARGS... args) { + platform::dynload::vzAdd(args...); + } + + template + static void VSUB(ARGS... args) { + platform::dynload::vzSub(args...); + } + + template + static void VMUL(ARGS... args) { + platform::dynload::vzMul(args...); + } + + template + static void VDIV(ARGS... 
args) { + platform::dynload::vzDiv(args...); + } + */ + + template + static void VADD(int n, const paddle::platform::complex128 *a, + const paddle::platform::complex128 *b, + paddle::platform::complex128 *y) { + for (int i = 0; i < n; ++i) { + y[i] = a[i] + b[i]; + } + } + + template + static void VSUB(int n, const paddle::platform::complex128 *a, + const paddle::platform::complex128 *b, + paddle::platform::complex128 *y) { + for (int i = 0; i < n; ++i) { + y[i] = a[i] - b[i]; + } + } + + template + static void VMUL(int n, const paddle::platform::complex128 *a, + const paddle::platform::complex128 *b, + paddle::platform::complex128 *y) { + for (int i = 0; i < n; ++i) { + y[i] = a[i] * b[i]; + } + } + template + static void VDIV(int n, const paddle::platform::complex128 *a, + const paddle::platform::complex128 *b, + paddle::platform::complex128 *y) { + for (int i = 0; i < n; ++i) { + y[i] = a[i] / b[i]; + } + } + + template + static void GEMV(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int M, int N, + paddle::platform::complex128 alpha, + const paddle::platform::complex128 *A, int lda, + const paddle::platform::complex128 *X, int incx, + paddle::platform::complex128 beta, + paddle::platform::complex128 *Y, int incy) { + const void *a_ = (const void *)(A); + const void *x_ = (const void *)(X); + void *y_ = static_cast(Y); + platform::dynload::cblas_zgemv(layout, trans, M, N, &alpha, a_, lda, x_, + incx, &beta, y_, incy); + } + + template + static void GEMM(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans_a, + CBLAS_TRANSPOSE trans_b, int M, int N, int K, + paddle::platform::complex128 alpha, + const paddle::platform::complex128 *A, int lda, + const paddle::platform::complex128 *B, int ldb, + paddle::platform::complex128 beta, + paddle::platform::complex128 *C, int ldc) { + const void *a_ = (const void *)(A); + const void *b_ = (const void *)(B); + void *c_ = static_cast(C); + platform::dynload::cblas_zgemm(layout, trans_a, trans_b, M, N, K, &alpha, + a_, lda, b_, ldb, &beta, c_, ldc); + } + + template + static void GEMM_BATCH(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE *trans_a, + CBLAS_TRANSPOSE *trans_b, int *M, int *N, int *K, + paddle::platform::complex128 *alpha, + const paddle::platform::complex128 **A, const int *lda, + const paddle::platform::complex128 **B, const int *ldb, + paddle::platform::complex128 *beta, + paddle::platform::complex128 **C, const int *ldc, + int group_count, int *group_size) { + const void **A_void = (const void **)(&(*A)); + const void **B_void = (const void **)(&(*B)); + void **C_void = reinterpret_cast(C); + + platform::dynload::cblas_zgemm_batch(layout, trans_a, trans_b, M, N, K, + alpha, A_void, lda, B_void, ldb, beta, + C_void, ldc, group_count, group_size); + } + + template + static void GEMM_EX(ARGS... args) { + platform::dynload::cblas_zgemm_batch(args...); + } +}; + #else template <> @@ -344,6 +590,93 @@ struct CBlas { cblas_dtrsm(args...); } }; + +template <> +struct CBlas { + template + static void VCOPY(ARGS... args) { + cblas_ccopy(args...); + } + + template + static void VADD(ARGS... 
args) { + vcAdd(args...); + } + + template + static void AXPY(int n, const paddle::platform::complex64 alpha, + const paddle::platform::complex64 *X, const int incX, + paddle::platform::complex64 *Y, const int incY) { + cblas_caxpy(n, &alpha, X, incX, Y, incY); + } + + template + static void GEMV(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, + const int M, const int N, + const paddle::platform::complex64 alpha, + const paddle::platform::complex64 *A, const int lda, + const paddle::platform::complex64 *X, const int incX, + const paddle::platform::complex64 beta, + paddle::platform::complex64 *Y, const int incY) { + cblas_cgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY); + } + + template + static void GEMM(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const paddle::platform::complex64 alpha, + const paddle::platform::complex64 *A, const int lda, + const paddle::platform::complex64 *B, const int ldb, + const paddle::platform::complex64 beta, + paddle::platform::complex64 *C, const int ldc) { + cblas_cgemm(layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, + C, ldc); + } +}; + +template <> +struct CBlas { + template + static void VCOPY(ARGS... args) { + cblas_zcopy(args...); + } + + template + static void VADD(ARGS... args) { + vzAdd(args...); + } + + template + static void AXPY(int n, const paddle::platform::complex128 alpha, + const paddle::platform::complex128 *X, const int incX, + paddle::platform::complex128 *Y, const int incY) { + cblas_zaxpy(n, &alpha, X, incX, Y, incY); + } + + template + static void GEMV(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, + const int M, const int N, + const paddle::platform::complex128 alpha, + const paddle::platform::complex128 *A, const int lda, + const paddle::platform::complex128 *X, const int incX, + const paddle::platform::complex128 beta, + paddle::platform::complex128 *Y, const int incY) { + cblas_zgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY); + } + + template + static void GEMM(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const paddle::platform::complex128 alpha, + const paddle::platform::complex128 *A, const int lda, + const paddle::platform::complex128 *B, const int ldb, + const paddle::platform::complex128 beta, + paddle::platform::complex128 *C, const int ldc) { + cblas_zgemm(layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, + C, ldc); + } +}; + #endif template <> @@ -517,10 +850,10 @@ void Blas::VADD(int n, const T *x, const T *y, CBlas::VADD(n, x, y, z); #else if (x == z) { - this->template AXPY(n, 1., y, z); + this->template AXPY(n, (T)(1.), y, z); } else { this->template VCOPY(n, y, z); - this->template AXPY(n, 1., x, z); + this->template AXPY(n, (T)(1.), x, z); } #endif } diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index 18d9a6310d..d6ad3aec22 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -65,14 +65,16 @@ class SplitFunctor { } // namespace operators } // namespace paddle -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16); \ - macro(::paddle::platform::bfloat16) +#define 
FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16); \ + macro(::paddle::platform::bfloat16); \ + macro(::paddle::platform::complex64); \ + macro(::paddle::platform::complex128) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 1da8c89a6d..71ef5a962f 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -44,6 +44,8 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; +template struct SetConstant; #ifdef PADDLE_WITH_XPU template struct SetConstant; @@ -54,19 +56,23 @@ template struct SetConstant; template struct SetConstant; #endif -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -117,6 +123,8 @@ DEFINE_CPU_TRANS_NORMAL(bool); DEFINE_CPU_TRANS_NORMAL(int16_t); DEFINE_CPU_TRANS_NORMAL(uint8_t); DEFINE_CPU_TRANS_NORMAL(int8_t); +DEFINE_CPU_TRANS_NORMAL(platform::complex64); +DEFINE_CPU_TRANS_NORMAL(platform::complex128); struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 4d7c1a4928..cc8925fcf8 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -19,6 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -27,6 +29,8 @@ namespace math { using float16 = paddle::platform::float16; using bfloat16 = paddle::platform::bfloat16; +using complex64 = paddle::platform::complex64; +using complex128 = paddle::platform::complex128; template struct SetConstant; template struct SetConstant; @@ -34,15 +38,19 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -132,6 +140,8 @@ DEFINE_GPU_TRANS_NORMAL(bool); DEFINE_GPU_TRANS_NORMAL(int16_t); DEFINE_GPU_TRANS_NORMAL(uint8_t); DEFINE_GPU_TRANS_NORMAL(int8_t); +DEFINE_GPU_TRANS_NORMAL(complex64); +DEFINE_GPU_TRANS_NORMAL(complex128); struct TensorSetConstantGPU { TensorSetConstantGPU(const platform::DeviceContext& context, diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 0254ad0a56..27023ecd29 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -168,9 +168,17 @@ REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad); REGISTER_OP_CPU_KERNEL( matmul_v2, ops::MatMulV2Kernel, - ops::MatMulV2Kernel); + ops::MatMulV2Kernel, + ops::MatMulV2Kernel, + ops::MatMulV2Kernel); REGISTER_OP_CPU_KERNEL( matmul_v2_grad, ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel); + ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel); diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu index 91958513dd..e819398ec9 100644 --- a/paddle/fluid/operators/matmul_v2_op.cu +++ b/paddle/fluid/operators/matmul_v2_op.cu @@ -20,9 +20,13 @@ namespace plf = paddle::platform; REGISTER_OP_CUDA_KERNEL( matmul_v2, ops::MatMulV2Kernel, ops::MatMulV2Kernel, - ops::MatMulV2Kernel); + ops::MatMulV2Kernel, + ops::MatMulV2Kernel, + ops::MatMulV2Kernel); REGISTER_OP_CUDA_KERNEL( matmul_v2_grad, ops::MatMulV2GradKernel, ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel); + ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index eff56046b9..8560f1f714 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -424,10 +424,18 @@ REGISTER_OP_CPU_KERNEL( slice, ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, - ops::SliceKernel); + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel); REGISTER_OP_CPU_KERNEL( slice_grad, ops::SliceGradKernel, ops::SliceGradKernel, ops::SliceGradKernel, - ops::SliceGradKernel); + ops::SliceGradKernel, + ops::SliceGradKernel, + 
ops::SliceGradKernel); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index 7493b18936..5f80d3cc97 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -23,7 +23,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, - ops::SliceKernel); + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel); REGISTER_OP_CUDA_KERNEL( slice_grad, @@ -31,4 +33,7 @@ REGISTER_OP_CUDA_KERNEL( ops::SliceGradKernel, ops::SliceGradKernel, ops::SliceGradKernel, - ops::SliceGradKernel); + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel); diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index 94a0576b77..e49476e4dc 100644 --- a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -327,11 +327,19 @@ REGISTER_OP_CPU_KERNEL( ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, - ops::StridedSliceKernel); + ops::StridedSliceKernel, + ops::StridedSliceKernel, + ops::StridedSliceKernel); REGISTER_OP_CPU_KERNEL( strided_slice_grad, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, - ops::StridedSliceGradKernel); + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel); diff --git a/paddle/fluid/operators/strided_slice_op.cu b/paddle/fluid/operators/strided_slice_op.cu index f0c9d557b9..b85403b1c5 100644 --- a/paddle/fluid/operators/strided_slice_op.cu +++ b/paddle/fluid/operators/strided_slice_op.cu @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/strided_slice_op.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( @@ -20,11 +22,19 @@ REGISTER_OP_CUDA_KERNEL( ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, - ops::StridedSliceKernel); + ops::StridedSliceKernel, + ops::StridedSliceKernel, + ops::StridedSliceKernel); REGISTER_OP_CUDA_KERNEL( strided_slice_grad, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, - ops::StridedSliceGradKernel); + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel); diff --git a/paddle/fluid/platform/complex128.h b/paddle/fluid/platform/complex128.h new file mode 100644 index 0000000000..bc3f6cc031 --- /dev/null +++ b/paddle/fluid/platform/complex128.h @@ -0,0 +1,579 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
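+
+// A minimal usage sketch of the complex128 value type defined in this
+// header; the variable names and values are illustrative only and assume
+// the plain host (#else) branches of the operators below:
+//
+//   paddle::platform::complex128 a(1.0, 2.0), b(3.0, -4.0);
+//   paddle::platform::complex128 s = a + b;   // (4.0, -2.0)
+//   paddle::platform::complex128 p = a * b;   // (11.0, 2.0)
+//   paddle::platform::complex128 q = a / b;   // (-0.2, 0.4)
+//   bool finite = paddle::platform::isfinite(p);  // true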
+ +#pragma once + +#include + +#include +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif // PADDLE_WITH_CUDA + +#include + +#include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace Eigen { +template +struct NumTraits; +} // namespace Eigen + +namespace paddle { +namespace platform { + +struct PADDLE_ALIGN(16) complex128 { + public: + double real; + double imag; + + complex128() = default; + complex128(const complex128& o) = default; + complex128& operator=(const complex128& o) = default; + complex128(complex128&& o) = default; + complex128& operator=(complex128&& o) = default; + ~complex128() = default; + + HOSTDEVICE complex128(double real, double imag) : real(real), imag(imag) {} +#if defined(PADDLE_WITH_CUDA) + + HOSTDEVICE inline explicit complex128(const thrust::complex& c) { + real = c.real(); + imag = c.imag(); + } + + HOSTDEVICE inline explicit operator thrust::complex() const { + return thrust::complex(real, imag); + } + + HOSTDEVICE inline explicit operator cuDoubleComplex() const { + return make_cuDoubleComplex(real, imag); + } +#endif + + HOSTDEVICE complex128(const float& val) { real = static_cast(val); } + HOSTDEVICE complex128(const double& val) { real = val; } + HOSTDEVICE complex128(const int& val) { real = static_cast(val); } + HOSTDEVICE complex128(const int64_t& val) { real = static_cast(val); } + + HOSTDEVICE inline explicit operator std::complex() { + return static_cast>(std::complex(real, imag)); + } + + template + HOSTDEVICE inline explicit complex128(const T& val) + : real(complex128(static_cast(val)).real) {} + + HOSTDEVICE complex128(const std::complex val) + : real(val.real()), imag(val.imag()) {} + + HOSTDEVICE inline complex128& operator=(bool b) { + real = b ? 
1 : 0; + imag = 0; + return *this; + } + + HOSTDEVICE inline complex128& operator=(int8_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex128& operator=(uint8_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex128& operator=(int16_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex128& operator=(uint16_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex128& operator=(int32_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex128& operator=(uint32_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex128& operator=(int64_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex128& operator=(uint64_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex128& operator=(float val) { + real = val; + return *this; + } + + HOSTDEVICE inline complex128& operator=(double val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline operator float() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator bool() const { + return static_cast(this->real) || static_cast(this->imag); + } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(this->real); + } +}; + +HOSTDEVICE inline complex128 operator+(const complex128& a, + const complex128& b) { +#if defined(__CUDA_ARCH__) + return complex128(thrust::complex(a.real, a.imag) + + thrust::complex(b.real, b.imag)); +#else + return complex128(a.real + b.real, a.imag + b.imag); +#endif +} + +HOSTDEVICE inline complex128 operator-(const complex128& a, + const complex128& b) { +#if defined(__CUDA_ARCH__) + return complex128(thrust::complex(a.real, a.imag) - + thrust::complex(b.real, b.imag)); +#else + return complex128(a.real - b.real, a.imag - b.imag); +#endif +} + +HOSTDEVICE inline complex128 operator*(const complex128& a, + const complex128& b) { +#if defined(__CUDA_ARCH__) + return complex128(thrust::complex(a.real, a.imag) * + thrust::complex(b.real, b.imag)); +#else + return complex128(a.real * b.real - a.imag * b.imag, + a.imag * b.real + b.imag * a.real); +#endif +} + +HOSTDEVICE inline complex128 operator/(const complex128& a, + const complex128& b) { +#if defined(__CUDA_ARCH__) + return complex128(thrust::complex(a.real, a.imag) / + thrust::complex(b.real, b.imag)); +#else + double denominator = b.real * b.real + b.imag * b.imag; + return complex128((a.real * b.real + a.imag * b.imag) / denominator, + (a.imag * b.real - a.real * b.imag) / denominator); +#endif +} + +HOSTDEVICE inline complex128 operator-(const complex128& a) { +#if defined(__CUDA_ARCH__) + return 
complex128(-thrust::complex(a.real, a.imag)); +#else + complex128 res; + res.real = -a.real; + res.imag = -a.imag; + return res; +#endif +} + +HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT + const complex128& b) { +#if defined(__CUDA_ARCH__) + a = complex128(thrust::complex(a.real, a.imag) += + thrust::complex(b.real, b.imag)); + return a; +#else + a.real += b.real; + a.imag += b.imag; + return a; +#endif +} + +HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT + const complex128& b) { +#if defined(__CUDA_ARCH__) + a = complex128(thrust::complex(a.real, a.imag) -= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real -= b.real; + a.imag -= b.imag; + return a; +#endif +} + +HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT + const complex128& b) { +#if defined(__CUDA_ARCH__) + a = complex128(thrust::complex(a.real, a.imag) *= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real = a.real * b.real - a.imag * b.imag; + a.imag = a.imag * b.real + b.imag * a.real; + return a; +#endif +} + +HOSTDEVICE inline complex128& operator/=(complex128& a, // NOLINT + const complex128& b) { +#if defined(__CUDA_ARCH__) + a = complex128(thrust::complex(a.real, a.imag) /= + thrust::complex(b.real, b.imag)); + return a; +#else + double denominator = b.real * b.real + b.imag * b.imag; + a.real = (a.real * b.real + a.imag * b.imag) / denominator; + a.imag = (a.imag * b.real - a.real * b.imag) / denominator; + return a; +#endif +} + +HOSTDEVICE inline complex128 raw_uint16_to_complex128(uint16_t a) { + complex128 res; + res.real = a; + return res; +} + +HOSTDEVICE inline bool operator==(const complex128& a, const complex128& b) { + return a.real == b.real && a.imag == b.imag; +} + +HOSTDEVICE inline bool operator!=(const complex128& a, const complex128& b) { + return a.real != b.real || a.imag != b.imag; +} + +HOSTDEVICE inline bool operator<(const complex128& a, const complex128& b) { + return static_cast(a.real) < static_cast(b.real); +} + +HOSTDEVICE inline bool operator<=(const complex128& a, const complex128& b) { + return static_cast(a.real) <= static_cast(b.real); +} + +HOSTDEVICE inline bool operator>(const complex128& a, const complex128& b) { + return static_cast(a.real) > static_cast(b.real); +} + +HOSTDEVICE inline bool operator>=(const complex128& a, const complex128& b) { + return static_cast(a.real) >= static_cast(b.real); +} + +HOSTDEVICE inline bool(isnan)(const complex128& a) { +#if defined(__CUDA_ARCH__) + return __isnan(a.real) || __isnan(a.imag); +#else + return std::isnan(a.real) || std::isnan(a.imag); +#endif +} + +HOSTDEVICE inline bool(isinf)(const complex128& a) { +#if defined(__CUDA_ARCH__) + return __isinf(a.real) || __isinf(a.imag); +#else + return std::isinf(a.real) || std::isinf(a.imag); +#endif +} + +HOSTDEVICE inline bool(isfinite)(const complex128& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +HOSTDEVICE inline double(abs)(const complex128& a) { +#if defined(__CUDA_ARCH__) + return thrust::abs(thrust::complex(a.real, a.imag)); +#else + return std::abs(std::complex(a)); +#endif +} + +HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) { +#if defined(__CUDA_ARCH__) + return complex128(thrust::pow(thrust::complex(a.real, a.imag), + thrust::complex(b.real, b.imag))); +#else + return std::pow(std::complex(a), std::complex(b)); +#endif +} + +HOSTDEVICE inline complex128(sqrt)(const complex128& a) { +#if defined(__CUDA_ARCH__) + return complex128(thrust::sqrt(thrust::complex(a.real, 
a.imag))); +#else + return std::sqrt(std::complex(a)); +#endif +} + +HOSTDEVICE inline complex128(tanh)(const complex128& a) { +#if defined(__CUDA_ARCH__) + return complex128(thrust::tanh(thrust::complex(a.real, a.imag))); +#else + return std::tanh(std::complex(a)); +#endif +} + +HOSTDEVICE inline complex128(log)(const complex128& a) { +#if defined(__CUDA_ARCH__) + return complex128(thrust::log(thrust::complex(a.real, a.imag))); +#else + return complex128(std::log(std::complex(a))); +#endif +} + +inline std::ostream& operator<<(std::ostream& os, const complex128& a) { + os << "real:" << a.real << " imag:" << a.imag; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template <> +struct is_pod { + static const bool value = + is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, std::is_same::type>::value> { +}; +template <> +struct is_signed { + static const bool value = false; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::complex128& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::complex128& a) { + return paddle::platform::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = false; + static const bool is_signed = false; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_toward_zero; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 0; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const int radix = 0; + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + static const bool traps = false; + static const bool tinyness_before = false; + + static paddle::platform::complex128(min)() { + return paddle::platform::complex128(0.0, 0.0); + } + static paddle::platform::complex128 lowest() { + return paddle::platform::complex128(0.0, 0.0); + } + static paddle::platform::complex128(max)() { + return paddle::platform::complex128(0.0, 0.0); + } + static paddle::platform::complex128 epsilon() { + return paddle::platform::complex128(0.0, 0.0); + } + static paddle::platform::complex128 round_error() { + return paddle::platform::complex128(0.0, 0.0); + } + static paddle::platform::complex128 infinity() { + return paddle::platform::complex128(0.0, 0.0); + } + static paddle::platform::complex128 quiet_NaN() { + return paddle::platform::complex128(0.0, 0.0); + } + static paddle::platform::complex128 signaling_NaN() { + return paddle::platform::complex128(0.0, 0.0); + } + static paddle::platform::complex128 denorm_min() { + return paddle::platform::complex128(0.0, 0.0); + } +}; + +} // namespace std +namespace Eigen { + +using complex128 = paddle::platform::complex128; + +template <> +struct NumTraits : GenericNumTraits> { + typedef double Real; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * 
NumTraits::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { + return NumTraits::dummy_precision(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; +namespace numext { + +template <> +HOSTDEVICE inline bool(isnan)(const complex128& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const complex128& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const complex128& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline complex128 exp(const complex128& a) { + double com = ::expf(a.real); + double res_real = com * ::cosf(a.imag); + double res_imag = com * ::sinf(a.imag); + return complex128(res_real, res_imag); +} + +template <> +HOSTDEVICE inline complex128 log(const complex128& a) { + return paddle::platform::log(a); +} + +template <> +HOSTDEVICE inline complex128 tanh(const complex128& a) { + return paddle::platform::tanh(a); +} + +template <> +HOSTDEVICE inline complex128 sqrt(const complex128& a) { + return paddle::platform::sqrt(a); +} + +template <> +HOSTDEVICE inline complex128 ceil(const complex128& a) { + return complex128(::ceilf(a.real), ::ceilf(a.imag)); +} + +template <> +HOSTDEVICE inline complex128 floor(const complex128& a) { + return complex128(::floorf(a.real), ::floor(a.imag)); +} + +template <> +HOSTDEVICE inline complex128 round(const complex128& a) { + return complex128(::roundf(a.real), ::roundf(a.imag)); +} + +template <> +HOSTDEVICE inline complex128 pow(const complex128& a, const complex128& b) { + return paddle::platform::pow(a, b); +} + +template <> +HOSTDEVICE inline double abs(const complex128& a) { + return paddle::platform::abs(a); +} + +} // namespace numext +} // namespace Eigen + +#define MKL_Complex16 paddle::platform::complex128 diff --git a/paddle/fluid/platform/complex64.h b/paddle/fluid/platform/complex64.h new file mode 100644 index 0000000000..d378f14e6f --- /dev/null +++ b/paddle/fluid/platform/complex64.h @@ -0,0 +1,582 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif // PADDLE_WITH_CUDA + +#include + +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace Eigen { +template +struct NumTraits; +} // namespace Eigen + +namespace paddle { +namespace platform { + +struct PADDLE_ALIGN(8) complex64 { + public: + float real; + float imag; + + complex64() = default; + complex64(const complex64& o) = default; + complex64& operator=(const complex64& o) = default; + complex64(complex64&& o) = default; + complex64& operator=(complex64&& o) = default; + ~complex64() = default; + + HOSTDEVICE complex64(float real, float imag) : real(real), imag(imag) {} +#if defined(PADDLE_WITH_CUDA) + + HOSTDEVICE inline explicit complex64(const thrust::complex& c) { + real = c.real(); + imag = c.imag(); + } + + HOSTDEVICE inline explicit operator thrust::complex() const { + return thrust::complex(real, imag); + } + + HOSTDEVICE inline explicit operator cuFloatComplex() const { + return make_cuFloatComplex(real, imag); + } +#endif + + HOSTDEVICE complex64(const float& val) { real = val; } + HOSTDEVICE complex64(const double& val) { real = static_cast(val); } + HOSTDEVICE complex64(const int& val) { real = static_cast(val); } + HOSTDEVICE complex64(const int64_t& val) { real = static_cast(val); } + HOSTDEVICE complex64(const complex128& val) { + real = static_cast(val.real); + imag = static_cast(val.imag); + } + + HOSTDEVICE inline explicit operator std::complex() { + return static_cast>(std::complex(real, imag)); + } + + template + HOSTDEVICE inline explicit complex64(const T& val) + : real(complex64(static_cast(val)).real) {} + + HOSTDEVICE complex64(const std::complex val) + : real(val.real()), imag(val.imag()) {} + + HOSTDEVICE inline complex64& operator=(bool b) { + real = b ? 
1 : 0; + imag = 0; + return *this; + } + + HOSTDEVICE inline complex64& operator=(int8_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex64& operator=(uint8_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex64& operator=(int16_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex64& operator=(uint16_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex64& operator=(int32_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex64& operator=(uint32_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex64& operator=(int64_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex64& operator=(uint64_t val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline complex64& operator=(float val) { + real = val; + return *this; + } + + HOSTDEVICE inline complex64& operator=(double val) { + real = static_cast(val); + return *this; + } + + HOSTDEVICE inline operator float() const { return this->real; } + + HOSTDEVICE inline explicit operator bool() const { + return static_cast(this->real) || static_cast(this->imag); + } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(this->real); + } + + HOSTDEVICE inline operator complex128() const { + return complex128(static_cast(this->real), + static_cast(this->imag)); + } +}; + +HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) { +#if defined(__CUDA_ARCH__) + return complex64(thrust::complex(a.real, a.imag) + + thrust::complex(b.real, b.imag)); +#else + return complex64(a.real + b.real, a.imag + b.imag); +#endif +} + +HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) { +#if defined(__CUDA_ARCH__) + return complex64(thrust::complex(a.real, a.imag) - + thrust::complex(b.real, b.imag)); +#else + return complex64(a.real - b.real, a.imag - b.imag); +#endif +} + +HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) { +#if defined(__CUDA_ARCH__) + return complex64(thrust::complex(a.real, a.imag) * + thrust::complex(b.real, b.imag)); +#else + return complex64(a.real * b.real - a.imag * b.imag, + a.imag * b.real + b.imag * a.real); +#endif +} + +HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) { +#if defined(__CUDA_ARCH__) + return complex64(thrust::complex(a.real, a.imag) / + thrust::complex(b.real, b.imag)); +#else + float denominator = b.real * b.real + b.imag * b.imag; + return complex64((a.real * b.real + a.imag * b.imag) / denominator, + (a.imag * b.real - a.real * b.imag) / denominator); +#endif +} + +HOSTDEVICE inline complex64 
operator-(const complex64& a) { +#if defined(__CUDA_ARCH__) + return complex64(-thrust::complex(a.real, a.imag)); +#else + complex64 res; + res.real = -a.real; + res.imag = -a.imag; + return res; +#endif +} + +HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT + const complex64& b) { +#if defined(__CUDA_ARCH__) + a = complex64(thrust::complex(a.real, a.imag) += + thrust::complex(b.real, b.imag)); + return a; +#else + a.real += b.real; + a.imag += b.imag; + return a; +#endif +} + +HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT + const complex64& b) { +#if defined(__CUDA_ARCH__) + a = complex64(thrust::complex(a.real, a.imag) -= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real -= b.real; + a.imag -= b.imag; + return a; +#endif +} + +HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT + const complex64& b) { +#if defined(__CUDA_ARCH__) + a = complex64(thrust::complex(a.real, a.imag) *= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real = a.real * b.real - a.imag * b.imag; + a.imag = a.imag * b.real + b.imag * a.real; + return a; +#endif +} + +HOSTDEVICE inline complex64& operator/=(complex64& a, // NOLINT + const complex64& b) { +#if defined(__CUDA_ARCH__) + a = complex64(thrust::complex(a.real, a.imag) /= + thrust::complex(b.real, b.imag)); + return a; +#else + float denominator = b.real * b.real + b.imag * b.imag; + a.real = (a.real * b.real + a.imag * b.imag) / denominator; + a.imag = (a.imag * b.real - a.real * b.imag) / denominator; + return a; +#endif +} + +HOSTDEVICE inline complex64 raw_uint16_to_complex64(uint16_t a) { + complex64 res; + res.real = a; + return res; +} + +HOSTDEVICE inline bool operator==(const complex64& a, const complex64& b) { + return a.real == b.real && a.imag == b.imag; +} + +HOSTDEVICE inline bool operator!=(const complex64& a, const complex64& b) { + return a.real != b.real || a.imag != b.imag; +} + +HOSTDEVICE inline bool operator<(const complex64& a, const complex64& b) { + return static_cast(a.real) < static_cast(b.real); +} + +HOSTDEVICE inline bool operator<=(const complex64& a, const complex64& b) { + return static_cast(a.real) <= static_cast(b.real); +} + +HOSTDEVICE inline bool operator>(const complex64& a, const complex64& b) { + return static_cast(a.real) > static_cast(b.real); +} + +HOSTDEVICE inline bool operator>=(const complex64& a, const complex64& b) { + return static_cast(a.real) >= static_cast(b.real); +} + +HOSTDEVICE inline bool(isnan)(const complex64& a) { +#if defined(__CUDA_ARCH__) + return __isnanf(a.real) || __isnanf(a.imag); +#else + return std::isnan(a.real) || std::isnan(a.imag); +#endif +} + +HOSTDEVICE inline bool(isinf)(const complex64& a) { +#if defined(__CUDA_ARCH__) + return __isinff(a.real) || __isinff(a.imag); +#else + return std::isinf(a.real) || std::isinf(a.imag); +#endif +} + +HOSTDEVICE inline bool(isfinite)(const complex64& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +HOSTDEVICE inline float(abs)(const complex64& a) { +#if defined(__CUDA_ARCH__) + return complex64(thrust::abs(thrust::complex(a.real, a.imag))); +#else + return std::abs(std::complex(a)); +#endif +} + +HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) { +#if defined(__CUDA_ARCH__) + return complex64(thrust::pow(thrust::complex(a.real, a.imag), + thrust::complex(b.real, b.imag))); +#else + return std::pow(std::complex(a), std::complex(b)); +#endif +} + +HOSTDEVICE inline complex64(sqrt)(const complex64& a) { +#if defined(__CUDA_ARCH__) + return 
complex64(thrust::sqrt(thrust::complex(a.real, a.imag))); +#else + return std::sqrt(std::complex(a)); +#endif +} + +HOSTDEVICE inline complex64(tanh)(const complex64& a) { +#if defined(__CUDA_ARCH__) + return complex64(thrust::tanh(thrust::complex(a.real, a.imag))); +#else + return std::tanh(std::complex(a)); +#endif +} + +HOSTDEVICE inline complex64(log)(const complex64& a) { +#if defined(__CUDA_ARCH__) + return complex64(thrust::log(thrust::complex(a.real, a.imag))); +#else + return std::log(std::complex(a)); +#endif +} + +inline std::ostream& operator<<(std::ostream& os, const complex64& a) { + os << "real:" << a.real << " imag:" << a.imag; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template <> +struct is_pod { + static const bool value = + is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, std::is_same::type>::value> {}; +template <> +struct is_signed { + static const bool value = false; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::complex64& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::complex64& a) { + return paddle::platform::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = false; + static const bool is_signed = false; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_toward_zero; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 0; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const int radix = 0; + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + static const bool traps = false; + static const bool tinyness_before = false; + + static paddle::platform::complex64(min)() { + return paddle::platform::complex64(0.0, 0.0); + } + static paddle::platform::complex64 lowest() { + return paddle::platform::complex64(0.0, 0.0); + } + static paddle::platform::complex64(max)() { + return paddle::platform::complex64(0.0, 0.0); + } + static paddle::platform::complex64 epsilon() { + return paddle::platform::complex64(0.0, 0.0); + } + static paddle::platform::complex64 round_error() { + return paddle::platform::complex64(0.0, 0.0); + } + static paddle::platform::complex64 infinity() { + return paddle::platform::complex64(0.0, 0.0); + } + static paddle::platform::complex64 quiet_NaN() { + return paddle::platform::complex64(0.0, 0.0); + } + static paddle::platform::complex64 signaling_NaN() { + return paddle::platform::complex64(0.0, 0.0); + } + static paddle::platform::complex64 denorm_min() { + return paddle::platform::complex64(0.0, 0.0); + } +}; + +} // namespace std +namespace Eigen { + +using complex64 = paddle::platform::complex64; + +template <> +struct NumTraits : GenericNumTraits> { + typedef float Real; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * 
NumTraits::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { + return NumTraits::dummy_precision(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; + +namespace numext { + +template <> +HOSTDEVICE inline bool(isnan)(const complex64& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const complex64& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const complex64& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline complex64 exp(const complex64& a) { + float com = ::expf(a.real); + float res_real = com * ::cosf(a.imag); + float res_imag = com * ::sinf(a.imag); + return complex64(res_real, res_imag); +} + +template <> +HOSTDEVICE inline complex64 log(const complex64& a) { + return paddle::platform::log(a); +} + +template <> +HOSTDEVICE inline complex64 tanh(const complex64& a) { + return paddle::platform::tanh(a); +} + +template <> +HOSTDEVICE inline complex64 sqrt(const complex64& a) { + return paddle::platform::sqrt(a); +} + +template <> +HOSTDEVICE inline complex64 ceil(const complex64& a) { + return complex64(::ceilf(a.real), ::ceilf(a.imag)); +} + +template <> +HOSTDEVICE inline complex64 floor(const complex64& a) { + return complex64(::floorf(a.real), ::floor(a.imag)); +} + +template <> +HOSTDEVICE inline complex64 round(const complex64& a) { + return complex64(::roundf(a.real), ::roundf(a.imag)); +} + +template <> +HOSTDEVICE inline complex64 pow(const complex64& a, const complex64& b) { + return paddle::platform::pow(a, b); +} + +template <> +HOSTDEVICE inline float abs(const complex64& a) { + return paddle::platform::abs(a); +} + +} // namespace numext +} // namespace Eigen + +#define MKL_Complex8 paddle::platform::complex64 diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 202613244d..a70050bae1 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -18,6 +18,8 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. 
#define PADDLE_CUDA_FP16 #include +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -104,11 +106,54 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, return float16(__shfl_down_sync(mask, static_cast(val), static_cast(delta), width)); } + +template <> +__forceinline__ __device__ paddle::platform::complex64 CudaShuffleDownSync( + unsigned mask, paddle::platform::complex64 val, int delta, int width) { + float real = static_cast(__shfl_down_sync( + mask, static_cast(val.real), static_cast(delta), width)); + float imag = static_cast(__shfl_down_sync( + mask, static_cast(val.imag), static_cast(delta), width)); + return paddle::platform::complex64(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex128 CudaShuffleDownSync( + unsigned mask, paddle::platform::complex128 val, int delta, int width) { + double real = static_cast( + __shfl_down_sync(mask, static_cast(val.real), + static_cast(delta), width)); + double imag = static_cast( + __shfl_down_sync(mask, static_cast(val.imag), + static_cast(delta), width)); + return paddle::platform::complex128(real, imag); +} + template <> __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, float16 val, int width) { return float16(__shfl_xor_sync(mask, static_cast(val), width)); } + +template <> +__forceinline__ __device__ paddle::platform::complex64 CudaShuffleXorSync( + unsigned mask, paddle::platform::complex64 val, int width) { + float real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + float imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return paddle::platform::complex64(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync( + unsigned mask, paddle::platform::complex128 val, int width) { + double real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + double imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return paddle::platform::complex128(real, imag); +} #endif template diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 045caab748..66032075f2 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -61,8 +61,12 @@ extern void *cublas_dso_handle; __macro(cublasDcopy_v2); \ __macro(cublasSgemv_v2); \ __macro(cublasDgemv_v2); \ + __macro(cublasCgemv_v2); \ + __macro(cublasZgemv_v2); \ __macro(cublasSgemm_v2); \ __macro(cublasDgemm_v2); \ + __macro(cublasCgemm_v2); \ + __macro(cublasZgemm_v2); \ __macro(cublasHgemm); \ __macro(cublasSgemmEx); \ __macro(cublasSgeam); \ diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 9369cf131d..c3c8788c57 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -51,12 +51,20 @@ extern void* mklml_dso_handle; #define MKLML_ROUTINE_EACH(__macro) \ __macro(cblas_sgemm); \ __macro(cblas_dgemm); \ + __macro(cblas_cgemm); \ + __macro(cblas_zgemm); \ __macro(cblas_saxpy); \ __macro(cblas_daxpy); \ + __macro(cblas_caxpy); \ + __macro(cblas_zaxpy); \ __macro(cblas_scopy); \ __macro(cblas_dcopy); \ + __macro(cblas_ccopy); \ + __macro(cblas_zcopy); \ __macro(cblas_sgemv); \ __macro(cblas_dgemv); \ + __macro(cblas_cgemv); \ + __macro(cblas_zgemv); \ __macro(cblas_strsm); \ __macro(cblas_dtrsm); \ 
__macro(cblas_sgemm_alloc); \ @@ -69,6 +77,8 @@ extern void* mklml_dso_handle; __macro(cblas_dgemm_free); \ __macro(cblas_sgemm_batch); \ __macro(cblas_dgemm_batch); \ + __macro(cblas_cgemm_batch); \ + __macro(cblas_zgemm_batch); \ __macro(cblas_sdot); \ __macro(cblas_ddot); \ __macro(cblas_sasum); \ diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 97056eca41..06b3f10fef 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -185,6 +185,8 @@ void BindVarDsec(pybind11::module *m) { .value("FP32", pd::proto::VarType::FP32) .value("FP64", pd::proto::VarType::FP64) .value("BF16", pd::proto::VarType::BF16) + .value("COMPLEX64", pd::proto::VarType::COMPLEX64) + .value("COMPLEX128", pd::proto::VarType::COMPLEX128) .value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR) .value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS) .value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 012f624f67..49d68a2ad7 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -42,6 +42,8 @@ namespace detail { // print np.dtype(np.float16).num # 23 constexpr int NPY_FLOAT16_ = 23; constexpr int NPY_UINT16_ = 4; +constexpr int NPY_COMPLEX64 = 14; +constexpr int NPY_COMPLEX128 = 15; // Note: Since float16 is not a builtin type in C++, we register // paddle::platform::float16 as numpy.float16. @@ -78,6 +80,44 @@ struct npy_format_descriptor { static constexpr auto name = _("bfloat16"); }; +// we register paddle::platform::complex64 as numpy.complex64. +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX64); + return reinterpret_borrow(ptr); + } + + static std::string format() { + // Note: "F" represents complex64. + // Details at: + // https://stackoverflow.com/questions/13997087/what-are-the-available-datatypes-for-dtype-with-numpys-loadtxt-an-genfromtx + // for k, v in np.sctypeDict.iteritems(): + // print '{0:14s} : {1:40s}'.format(str(k), v) + return "F"; + } + static constexpr auto name = _("complext64"); +}; + +// we register paddle::platform::complex128 as numpy.complex128. +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX128); + return reinterpret_borrow(ptr); + } + + static std::string format() { + // Note: "D" represents complex128. 
+ // Details at: + // https://stackoverflow.com/questions/13997087/what-are-the-available-datatypes-for-dtype-with-numpys-loadtxt-an-genfromtx + // for k, v in np.sctypeDict.iteritems(): + // print '{0:14s} : {1:40s}'.format(str(k), v) + return "D"; + } + static constexpr auto name = _("complext128"); +}; + } // namespace detail } // namespace pybind11 @@ -124,6 +164,8 @@ struct ValidDTypeToPyArrayChecker { DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex64); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex128); DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); @@ -142,6 +184,10 @@ inline std::string TensorDTypeToPyDTypeStr( } else if (std::is_same::value) { \ /* NumPy character code of uint16 due to no support for bfloat16 */ \ return "H"; \ + } else if (std::is_same::value) { \ + return "F"; \ + } else if (std::is_same::value) { \ + return "D"; \ } else { \ constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ PADDLE_ENFORCE_EQ( \ @@ -284,6 +330,12 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); + } else if (py::isinstance>(array)) { + SetTensorFromPyArrayT(self, array, place, + zero_copy); + } else if (py::isinstance>(array)) { + SetTensorFromPyArrayT(self, array, place, + zero_copy); } else if (py::isinstance>(array)) { // since there is still no support for bfloat16 in NumPy, // uint16 is used for casting bfloat16 @@ -504,6 +556,10 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::BF16: return _sliceAndConcat(self, obj, dim); + case framework::proto::VarType::COMPLEX64: + return _sliceAndConcat(self, obj, dim); + case framework::proto::VarType::COMPLEX128: + return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 5da83da33b..8a68ad9d54 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -47,6 +47,10 @@ def convert_dtype(dtype): return 'int64' elif dtype == core.VarDesc.VarType.UINT8: return 'uint8' + elif dtype == core.VarDesc.VarType.COMPLEX64: + return 'complex64' + elif dtype == core.VarDesc.VarType.COMPLEX128: + return 'complex128' elif isinstance(dtype, type): if dtype in [ np.bool, np.float16, np.float32, np.float64, np.int8, np.int16, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3a2d99085b..5e4f6394e1 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -643,6 +643,10 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.UINT8 elif dtype == np.int8: return core.VarDesc.VarType.INT8 + elif dtype == np.complex64: + return core.VarDesc.VarType.COMPLEX64 + elif dtype == np.complex128: + return core.VarDesc.VarType.COMPLEX128 else: raise ValueError("Not supported numpy dtype %s" % dtype) diff --git a/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py b/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py index adf597704f..25b885214c 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py +++ 
b/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py @@ -26,6 +26,13 @@ layers = { "div": cpx.elementwise_div, } +fluid_layers = { + "add": fluid.layers.elementwise_add, + "sub": fluid.layers.elementwise_sub, + "mul": fluid.layers.elementwise_mul, + "div": fluid.layers.elementwise_div, +} + class TestComplexElementwiseLayers(unittest.TestCase): def setUp(self): @@ -40,6 +47,22 @@ class TestComplexElementwiseLayers(unittest.TestCase): var_y = dg.to_variable(y) return layers[layer_type](var_x, var_y).numpy() + def fuild_calc(self, x, y, layer_type, place): + with dg.guard(place): + var_x = fluid.core.VarBase( + value=x, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + var_y = fluid.core.VarBase( + value=y, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + return fluid_layers[layer_type](var_x, var_y).numpy() + def compare(self, x, y): for place in self._places: self.assertTrue(np.allclose(self.calc(x, y, "add", place), x + y)) @@ -47,6 +70,17 @@ class TestComplexElementwiseLayers(unittest.TestCase): self.assertTrue(np.allclose(self.calc(x, y, "mul", place), x * y)) self.assertTrue(np.allclose(self.calc(x, y, "div", place), x / y)) + def compare_1(self, x, y): + for place in self._places: + self.assertTrue( + np.allclose(self.fuild_calc(x, y, "add", place), x + y)) + self.assertTrue( + np.allclose(self.fuild_calc(x, y, "sub", place), x - y)) + self.assertTrue( + np.allclose(self.fuild_calc(x, y, "mul", place), x * y)) + self.assertTrue( + np.allclose(self.fuild_calc(x, y, "div", place), x / y)) + def compare_op(self, x, y): for place in self._places: with dg.guard(place): @@ -57,6 +91,26 @@ class TestComplexElementwiseLayers(unittest.TestCase): self.assertTrue(var_x * var_y, x * y) self.assertTrue(var_x / var_y, x / y) + def compare_op_1(self, x, y): + for place in self._places: + with dg.guard(place): + var_x = fluid.core.VarBase( + value=x, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + var_y = fluid.core.VarBase( + value=y, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + self.assertTrue(np.allclose((var_x + var_y).numpy(), x + y)) + self.assertTrue(np.allclose((var_x - var_y).numpy(), x - y)) + self.assertTrue(np.allclose((var_x * var_y).numpy(), x * y)) + self.assertTrue(np.allclose((var_x / var_y).numpy(), x / y)) + def test_complex_xy(self): x = rand([2, 3, 4, 5]).astype(self._dtype) + 1j * rand( [2, 3, 4, 5]).astype(self._dtype) @@ -64,6 +118,8 @@ class TestComplexElementwiseLayers(unittest.TestCase): [2, 3, 4, 5]).astype(self._dtype) self.compare(x, y) self.compare_op(x, y) + self.compare_1(x, y) + self.compare_op_1(x, y) def test_complex_x_real_y(self): x = rand([2, 3, 4, 5]).astype(self._dtype) + 1j * rand( @@ -78,6 +134,14 @@ class TestComplexElementwiseLayers(unittest.TestCase): self.compare(x, y) self.compare_op(x, y) + def test_complex64_xy(self): + x = rand([2, 3, 4, 5]).astype("float32") + 1j * rand( + [2, 3, 4, 5]).astype("float32") + y = rand([2, 3, 4, 5]).astype("float32") + 1j * rand( + [2, 3, 4, 5]).astype("float32") + self.compare_1(x, y) + self.compare_op_1(x, y) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_complex_getitem.py b/python/paddle/fluid/tests/unittests/test_complex_getitem.py index d6b54bbdc4..2396244808 100644 --- 
a/python/paddle/fluid/tests/unittests/test_complex_getitem.py +++ b/python/paddle/fluid/tests/unittests/test_complex_getitem.py @@ -36,6 +36,18 @@ class TestComplexGetitemLayer(unittest.TestCase): np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + for place in self._places: + with dg.guard(place): + x_var = fluid.core.VarBase( + value=x_np, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + x_var_slice = x_var[0] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + def test_case2(self): x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) x_np_slice = x_np[0][1] @@ -47,6 +59,18 @@ class TestComplexGetitemLayer(unittest.TestCase): np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + for place in self._places: + with dg.guard(place): + x_var = fluid.core.VarBase( + value=x_np, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + x_var_slice = x_var[0][1] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + def test_case3(self): x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) x_np_slice = x_np[0][1][2] @@ -58,6 +82,18 @@ class TestComplexGetitemLayer(unittest.TestCase): np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + for place in self._places: + with dg.guard(place): + x_var = fluid.core.VarBase( + value=x_np, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + x_var_slice = x_var[0][1][2] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + def test_case4(self): x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) x_np_slice = x_np[0][1][0:3] @@ -69,6 +105,18 @@ class TestComplexGetitemLayer(unittest.TestCase): np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + for place in self._places: + with dg.guard(place): + x_var = fluid.core.VarBase( + value=x_np, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + x_var_slice = x_var[0][1][0:3] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + def test_case5(self): x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) x_np_slice = x_np[0][1][0:4:2] @@ -80,6 +128,18 @@ class TestComplexGetitemLayer(unittest.TestCase): np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + for place in self._places: + with dg.guard(place): + x_var = fluid.core.VarBase( + value=x_np, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + x_var_slice = x_var[0][1][0:4:2] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + def test_case6(self): x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) x_np_slice = x_np[0][1:3][0:4:2] @@ -90,6 +150,17 @@ class TestComplexGetitemLayer(unittest.TestCase): x_var_slice = x_var[0][1:3][0:4:2] np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + for place in self._places: + with dg.guard(place): + x_var = fluid.core.VarBase( + value=x_np, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + x_var_slice = x_var[0][1:3][0:4:2] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_complex_matmul.py b/python/paddle/fluid/tests/unittests/test_complex_matmul.py index 9accbdfca5..bfe69f48ff 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_matmul.py +++ 
b/python/paddle/fluid/tests/unittests/test_complex_matmul.py @@ -34,6 +34,25 @@ class TestComplexMatMulLayer(unittest.TestCase): np_result = np.matmul(x, y) self.assertTrue(np.allclose(result.numpy(), np_result)) + def compare_1(self, x, y): + for place in self._places: + with dg.guard(place): + x_var = fluid.core.VarBase( + value=x, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + y_var = fluid.core.VarBase( + value=y, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + result = paddle.matmul(x_var, y_var) + np_result = np.matmul(x, y) + self.assertTrue(np.allclose(result.numpy(), np_result)) + def compare_op(self, x, y): for place in self._places: with dg.guard(place): @@ -43,6 +62,25 @@ class TestComplexMatMulLayer(unittest.TestCase): np_result = np.matmul(x, y) self.assertTrue(np.allclose(result.numpy(), np_result)) + def compare_op_1(self, x, y): + for place in self._places: + with dg.guard(place): + x_var = fluid.core.VarBase( + value=x, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + y_var = fluid.core.VarBase( + value=y, + place=fluid.framework._current_expected_place(), + persistable=False, + zero_copy=None, + name='') + result = x_var.matmul(y_var) + np_result = np.matmul(x, y) + self.assertTrue(np.allclose(result.numpy(), np_result)) + def test_complex_xy(self): x = np.random.random( (2, 3, 4, 5)).astype("float32") + 1J * np.random.random( @@ -52,6 +90,8 @@ class TestComplexMatMulLayer(unittest.TestCase): (2, 3, 5, 4)).astype("float32") self.compare(x, y) self.compare_op(x, y) + self.compare_1(x, y) + self.compare_op_1(x, y) def test_complex_x(self): x = np.random.random( @@ -68,6 +108,52 @@ class TestComplexMatMulLayer(unittest.TestCase): (2, 3, 5, 4)).astype("float32") self.compare(x, y) + def test_complex128_xy(self): + x = np.random.random( + (2, 3, 4, 5)).astype("float64") + 1J * np.random.random( + (2, 3, 4, 5)).astype("float64") + y = np.random.random( + (2, 3, 5, 4)).astype("float64") + 1J * np.random.random( + (2, 3, 5, 4)).astype("float64") + self.compare_1(x, y) + self.compare_op_1(x, y) + + def test_complex_xy_gemv(self): + x = np.random.random( + (2, 1, 100)).astype("float32") + 1J * np.random.random( + (2, 1, 100)).astype("float32") + y = np.random.random((100)).astype("float32") + 1J * np.random.random( + (100)).astype("float32") + self.compare_1(x, y) + self.compare_op_1(x, y) + + x = np.random.random( + (2, 1, 100)).astype("float64") + 1J * np.random.random( + (2, 1, 100)).astype("float64") + y = np.random.random((100)).astype("float64") + 1J * np.random.random( + (100)).astype("float64") + self.compare_1(x, y) + self.compare_op_1(x, y) + + def test_complex_xy_gemm(self): + x = np.random.random( + (1, 2, 50)).astype("float32") + 1J * np.random.random( + (1, 2, 50)).astype("float32") + y = np.random.random( + (1, 50, 2)).astype("float32") + 1J * np.random.random( + (1, 50, 2)).astype("float32") + self.compare_1(x, y) + self.compare_op_1(x, y) + + x = np.random.random( + (1, 2, 50)).astype("float64") + 1J * np.random.random( + (1, 2, 50)).astype("float64") + y = np.random.random( + (1, 50, 2)).astype("float64") + 1J * np.random.random( + (1, 50, 2)).astype("float64") + self.compare_1(x, y) + self.compare_op_1(x, y) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_complex_variable.py b/python/paddle/fluid/tests/unittests/test_complex_variable.py index 
ee69be1d0e..f29cb463da 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_variable.py
@@ -16,6 +16,9 @@ import unittest
 import numpy as np
 import paddle
 import paddle.fluid.dygraph as dg
+import paddle.fluid.core as core
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+from paddle.fluid.data_feeder import convert_dtype
 
 
 class TestComplexVariable(unittest.TestCase):
@@ -43,6 +46,20 @@
         self._dtype = "complex128"
         self.compare()
 
+    def test_convert_np_dtype_to_dtype(self):
+        self.assertEqual(
+            convert_np_dtype_to_dtype_(np.complex64),
+            core.VarDesc.VarType.COMPLEX64)
+        self.assertEqual(
+            convert_np_dtype_to_dtype_(np.complex128),
+            core.VarDesc.VarType.COMPLEX128)
+
+    def test_convert_dtype(self):
+        self.assertEqual(
+            convert_dtype(core.VarDesc.VarType.COMPLEX64), "complex64")
+        self.assertEqual(
+            convert_dtype(core.VarDesc.VarType.COMPLEX128), "complex128")
+
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab
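
The patch itself adds only kernels, bindings, and unit tests; it contains no end-to-end usage note. The snippet below is a rough illustrative sketch (not part of the diff) of how the new complex64 support is exercised in dygraph mode, pieced together from the unit tests above: complex NumPy arrays fed through fluid.core.VarBase, elementwise +, -, *, / and paddle.matmul dispatching to the new complex kernels, and indexing going through the new slice kernels. It assumes a Paddle build that includes this change; the helpers crand and make_var are introduced here purely for brevity, and the VarBase constructor arguments simply mirror those used in test_complex_matmul.py.

# Illustrative sketch only -- assumes a Paddle build containing this patch.
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg


def crand(*shape):
    # complex64 test data, same pattern as test_complex_elementwise_layers.py
    return (np.random.randn(*shape) +
            1j * np.random.randn(*shape)).astype("complex64")


def make_var(arr):
    # VarBase construction copied from the new unit tests
    return fluid.core.VarBase(
        value=arr,
        place=fluid.framework._current_expected_place(),
        persistable=False,
        zero_copy=None,
        name='')


with dg.guard(fluid.CPUPlace()):
    x_np = crand(2, 3, 4)
    y_np = crand(2, 3, 4)
    x, y = make_var(x_np), make_var(y_np)

    # +, -, *, / now dispatch to the complex elementwise kernels
    np.testing.assert_allclose((x + y).numpy(), x_np + y_np, rtol=1e-5)
    np.testing.assert_allclose((x / y).numpy(), x_np / y_np, rtol=1e-5)

    # matmul uses the new complex GEMM/GEMV paths
    b_np = crand(2, 4, 5)
    z = paddle.matmul(x, make_var(b_np))
    np.testing.assert_allclose(
        z.numpy(), np.matmul(x_np, b_np), rtol=1e-5, atol=1e-6)

    # indexing/slicing goes through the new complex slice kernels
    np.testing.assert_allclose(x[0][1].numpy(), x_np[0][1])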