diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 40f35896323b98543364428c99b20d03571dbbd7..aae11294b0be0899480504fd734d44e584abbe5b 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -176,12 +176,12 @@ template PADDLE_API uint8_t *Tensor::mutable_data(); template PADDLE_API int8_t *Tensor::mutable_data(); template PADDLE_API int16_t *Tensor::mutable_data(); template PADDLE_API bool *Tensor::mutable_data(); -template PADDLE_API paddle::platform::complex - *Tensor::mutable_data>(); -template PADDLE_API paddle::platform::complex - *Tensor::mutable_data>(); -template PADDLE_API paddle::platform::float16 * -Tensor::mutable_data(); +template PADDLE_API pten::dtype::complex + *Tensor::mutable_data>(); +template PADDLE_API pten::dtype::complex + *Tensor::mutable_data>(); +template PADDLE_API pten::dtype::float16 * +Tensor::mutable_data(); template T *Tensor::mutable_data(const PlaceType &place) { @@ -214,12 +214,12 @@ template PADDLE_API int8_t *Tensor::mutable_data( template PADDLE_API int16_t *Tensor::mutable_data( const PlaceType &place); template PADDLE_API bool *Tensor::mutable_data(const PlaceType &place); -template PADDLE_API paddle::platform::complex * -Tensor::mutable_data>(const PlaceType &place); -template PADDLE_API paddle::platform::complex * -Tensor::mutable_data>(const PlaceType &place); -template PADDLE_API paddle::platform::float16 * -Tensor::mutable_data(const PlaceType &place); +template PADDLE_API pten::dtype::complex + *Tensor::mutable_data>(const PlaceType &place); +template PADDLE_API pten::dtype::complex + *Tensor::mutable_data>(const PlaceType &place); +template PADDLE_API pten::dtype::float16 * +Tensor::mutable_data(const PlaceType &place); template const T *Tensor::data() const { @@ -241,14 +241,14 @@ template PADDLE_API const uint8_t *Tensor::data() const; template PADDLE_API const int8_t *Tensor::data() const; template PADDLE_API const int16_t *Tensor::data() const; template PADDLE_API const bool *Tensor::data() const; -template PADDLE_API const paddle::platform::complex - *Tensor::data>() const; -template PADDLE_API const paddle::platform::complex - *Tensor::data>() const; -template PADDLE_API const paddle::platform::float16 * -Tensor::data() const; -template PADDLE_API const paddle::platform::bfloat16 * -Tensor::data() const; +template PADDLE_API const pten::dtype::complex + *Tensor::data>() const; +template PADDLE_API const pten::dtype::complex + *Tensor::data>() const; +template PADDLE_API const pten::dtype::float16 * +Tensor::data() const; +template PADDLE_API const pten::dtype::bfloat16 * +Tensor::data() const; template T *Tensor::data() { @@ -267,12 +267,11 @@ template PADDLE_API uint8_t *Tensor::data(); template PADDLE_API int8_t *Tensor::data(); template PADDLE_API int16_t *Tensor::data(); template PADDLE_API bool *Tensor::data(); -template PADDLE_API paddle::platform::complex - *Tensor::data>(); -template PADDLE_API paddle::platform::complex - *Tensor::data>(); -template PADDLE_API paddle::platform::float16 * -Tensor::data(); +template PADDLE_API pten::dtype::complex + *Tensor::data>(); +template PADDLE_API pten::dtype::complex + *Tensor::data>(); +template PADDLE_API pten::dtype::float16 *Tensor::data(); // TODO(chenweihang): replace slice impl by API Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { @@ -328,12 +327,12 @@ template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( +template PADDLE_API Tensor Tensor::copy_to>( const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( +template PADDLE_API Tensor Tensor::copy_to>( const PlaceType &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const PlaceType &target_place) const; Tensor Tensor::copy_to(Backend backend, bool blocking) const { return experimental::copy_to(*this, backend, blocking); diff --git a/paddle/pten/core/compat/convert_utils.h b/paddle/pten/core/compat/convert_utils.h index 0db71b577de515e8fe690f81d0054f6650bcd134..fba2243808a9764ef7f7636331643943705d24ee 100644 --- a/paddle/pten/core/compat/convert_utils.h +++ b/paddle/pten/core/compat/convert_utils.h @@ -20,11 +20,6 @@ limitations under the License. */ #include "paddle/pten/common/place.h" #include "paddle/pten/core/tensor_meta.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/data_type.h" - -// TODO(chenweihang): this file may need to be removed - namespace pten { std::string TransToPtenKernelName(const std::string& fluid_op_name); diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 36d56212e216af208362c6b78612cbaf5c058628..82150e10bb313065f53b2c24e2c5ed3c5a0a0100 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -202,12 +202,12 @@ DATA_MEMBER_FUNC_INSTANTIATION(int32_t); DATA_MEMBER_FUNC_INSTANTIATION(uint32_t); DATA_MEMBER_FUNC_INSTANTIATION(int64_t); DATA_MEMBER_FUNC_INSTANTIATION(uint64_t); -DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16); -DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16); +DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::bfloat16); +DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::float16); DATA_MEMBER_FUNC_INSTANTIATION(float); DATA_MEMBER_FUNC_INSTANTIATION(double); -DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64); -DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128); +DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::complex); +DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::complex); #undef DATA_MEMBER_FUNC_INSTANTIATION diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 44232930e5f0d7b8e150a12d054b669c765dc0f8..280af941403a1e5633307af249b6b4b7f1a5f7de 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -20,9 +20,6 @@ limitations under the License. */ #include "paddle/pten/core/tensor_base.h" #include "paddle/pten/core/tensor_meta.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/data_type.h" - /* @jim19930609: Move to MKLDNN_Tensor in the future */ #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/pten/core/dense_tensor_impl.cc b/paddle/pten/core/dense_tensor_impl.cc index dfde62618d01ceae578035a0f596da601f096cf2..7237f03dccf96c337668169a2a8455f5d02ad552 100644 --- a/paddle/pten/core/dense_tensor_impl.cc +++ b/paddle/pten/core/dense_tensor_impl.cc @@ -40,14 +40,14 @@ size_t DenseTensor::memory_size() const { } void DenseTensor::check_memory_size() const { - PADDLE_ENFORCE_NOT_NULL(holder_, - paddle::platform::errors::PreconditionNotMet( - "Tensor holds no memory. " - "Call Tensor::mutable_data firstly.")); + PADDLE_ENFORCE_NOT_NULL( + holder_, + pten::errors::PreconditionNotMet("Tensor holds no memory. " + "Call Tensor::mutable_data firstly.")); PADDLE_ENFORCE_LE( numel() * SizeOf(dtype()), memory_size(), - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "Tensor's dimension is out of bound." "Tensor's dimension must be equal or less than the size of its " "memory." @@ -56,10 +56,10 @@ void DenseTensor::check_memory_size() const { memory_size())); } -const paddle::platform::Place& DenseTensor::place() const { +const Place& DenseTensor::place() const { PADDLE_ENFORCE_NOT_NULL( holder_, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "Tensor not initialized yet when DenseTensor::place() is called.")); return holder_->place(); } @@ -82,7 +82,7 @@ void DenseTensor::ResetHolder(const std::shared_ptr& holder) { numel() * static_cast(SizeOf(dtype())) + static_cast(meta_.offset), static_cast(holder->size()), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The size of Holder is not enough to store the Tensor.")); } holder_ = holder; @@ -99,14 +99,14 @@ void DenseTensor::set_type(paddle::experimental::DataType type) { meta_.dtype = type; } -void* DenseTensor::mutable_data(const paddle::platform::Place& place, +void* DenseTensor::mutable_data(const Place& place, paddle::experimental::DataType type, size_t requested_size) { set_type(type); PADDLE_ENFORCE_GE( numel(), 0, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The Tensor's element number must be equal or greater than zero. " "The Tensor's shape is [", dims(), @@ -127,19 +127,18 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place, meta_.offset); } -void* DenseTensor::mutable_data(const paddle::platform::Place& place, - size_t requested_size) { +void* DenseTensor::mutable_data(const Place& place, size_t requested_size) { return mutable_data(place, type(), requested_size); } -void* DenseTensor::mutable_data(const paddle::platform::Place& place, +void* DenseTensor::mutable_data(const Place& place, paddle::experimental::DataType type, const pten::Stream& stream) { set_type(type); PADDLE_ENFORCE_GE( numel(), 0, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The Tensor's element number must be equal or greater than zero. " "The Tensor's shape is [", dims(), @@ -149,7 +148,7 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place, /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + meta_.offset || - !(paddle::platform::is_gpu_place(place) && + !(place.GetType() == pten::AllocationType::GPU && paddle::memory::InSameStream(holder_, stream))) { holder_.reset(); holder_ = paddle::memory::AllocShared(place, size, stream); @@ -166,7 +165,7 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place, */ template inline T* DenseTensor::mutable_data(const DDim& dims, - const paddle::platform::Place& place, + const Place& place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); meta_.dims = dims; @@ -174,8 +173,7 @@ inline T* DenseTensor::mutable_data(const DDim& dims, } template -inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, - size_t requested_size) { +inline T* DenseTensor::mutable_data(const Place& place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); return reinterpret_cast( mutable_data(place, @@ -189,13 +187,11 @@ void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { meta_.dtype = tensor.dtype(); } -#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ - template dtype* DenseTensor::mutable_data( \ - const DDim& dims, \ - const paddle::platform::Place& place, \ - size_t requested_size); \ - template dtype* DenseTensor::mutable_data( \ - const paddle::platform::Place& place, size_t requested_size); +#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DenseTensor::mutable_data( \ + const DDim& dims, const Place& place, size_t requested_size); \ + template dtype* DenseTensor::mutable_data(const Place& place, \ + size_t requested_size); LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool) LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t) @@ -205,10 +201,10 @@ LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int32_t) LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int64_t) LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(float) LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(double) -LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16) -LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16) -LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64) -LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::bfloat16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::float16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::complex) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::complex) #undef LEGACY_DATA_MEMBER_FUNC_INSTANTIATION @@ -234,7 +230,7 @@ std::pair DenseTensor::lod_element(size_t level, PADDLE_ENFORCE_LT( level, NumLevels(), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The input level of LoD is invalid, it should be less than LoD " "size. The input level is %zu, the LoD size is %zu.", level, @@ -242,7 +238,7 @@ std::pair DenseTensor::lod_element(size_t level, PADDLE_ENFORCE_LT(elem, NumElements(level), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The input element of LoD is invalid, it should be " "less than the number of elements in its level." "The input element is %zu, the number of elements in " @@ -259,7 +255,7 @@ size_t DenseTensor::NumElements(size_t level) const { PADDLE_ENFORCE_LT( level, NumLevels(), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The input level of LoD is invalid, it should be less than LoD " "size. The input level is %zu, the LoD size is %zu.", level, @@ -276,20 +272,20 @@ DenseTensor& DenseTensor::Resize(const DDim& dims) { DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, - 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, - meta_.dims[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); + PADDLE_ENFORCE_GE( + begin_idx, + 0, + pten::errors::OutOfRange("The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE( + end_idx, + meta_.dims[0], + pten::errors::OutOfRange("The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The start row index must be less than the end row index." "But received the start index = %d, the end index = %d.", begin_idx, @@ -317,13 +313,13 @@ std::vector DenseTensor::Split(int64_t split_size, PADDLE_ENFORCE_GE(meta_.dims.size(), 0, - paddle::platform::errors::OutOfRange( + pten::errors::OutOfRange( "split expects at least a 1-dimensional tensor")); PADDLE_ENFORCE_GE( split_size, 0, - paddle::platform::errors::OutOfRange( + pten::errors::OutOfRange( "split expects split_size be non-negative, but got split_size is %d", split_size)); @@ -350,12 +346,12 @@ std::vector DenseTensor::Chunk(int64_t chunks, check_memory_size(); PADDLE_ENFORCE_GE(meta_.dims.size(), 0, - paddle::platform::errors::OutOfRange( + pten::errors::OutOfRange( "split expects at least a 1-dimensional tensor")); PADDLE_ENFORCE_GE( chunks, 0, - paddle::platform::errors::OutOfRange( + pten::errors::OutOfRange( "chunks expects to be greater than 0, but got chunks is %d", chunks)); int64_t numel_size = meta_.dims[axis]; @@ -376,7 +372,7 @@ DenseTensor& DenseTensor::ShareInplaceVersionCounterWith( const DenseTensor& src) { PADDLE_ENFORCE_NOT_NULL( inplace_version_counter_, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "Tensor does not hold inplace_version_counter_.")); inplace_version_counter_ = src.inplace_version_counter_; diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index 8bc125c50bed6b04cf549ab592b24b68f3ce8712..01632b7e586723d6eea74b59fe56ee179fad438b 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -233,7 +233,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(pten::dtype::float16); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); diff --git a/paddle/pten/core/utils/data_type.h b/paddle/pten/core/utils/data_type.h index ca0c678e0623d7b7a38b8d87170fc448798f7ea6..924040bf890f640d3e614808ddea3ef8bc62dfd9 100644 --- a/paddle/pten/core/utils/data_type.h +++ b/paddle/pten/core/utils/data_type.h @@ -26,23 +26,23 @@ namespace pten { #define _PtenForEachDataTypeHelper_(callback, cpp_type, data_type) \ callback(cpp_type, data_type); -#define _PtenForEachDataType_(callback) \ - _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ - _PtenForEachDataTypeHelper_( \ - callback, ::paddle::platform::float16, DataType::FLOAT16); \ - _PtenForEachDataTypeHelper_( \ - callback, ::paddle::platform::bfloat16, DataType::BFLOAT16); \ - _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ - _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ - _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ - _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ - _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ - _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ - _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ - _PtenForEachDataTypeHelper_( \ - callback, ::paddle::platform::complex, DataType::COMPLEX64); \ - _PtenForEachDataTypeHelper_( \ - callback, ::paddle::platform::complex, DataType::COMPLEX128); +#define _PtenForEachDataType_(callback) \ + _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ + _PtenForEachDataTypeHelper_( \ + callback, ::pten::dtype::float16, DataType::FLOAT16); \ + _PtenForEachDataTypeHelper_( \ + callback, ::pten::dtype::bfloat16, DataType::BFLOAT16); \ + _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ + _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ + _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ + _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ + _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ + _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ + _PtenForEachDataTypeHelper_( \ + callback, ::pten::dtype::complex, DataType::COMPLEX64); \ + _PtenForEachDataTypeHelper_( \ + callback, ::pten::dtype::complex, DataType::COMPLEX128); template inline void VisitDataType(pten::DataType type, Visitor visitor) { diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index a9b81ad4eb2b3914b005caf348d65bf87e788dca..20067eb5b047018b498175d1f2d6906e524cbfd0 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -15,14 +15,10 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) -set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel pten_transpose_cpu) -if(WITH_GPU OR WITH_ROCM) - set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_gpu) -endif() - # auto build kernel targets by cmake -register_kernels(EXCLUDES math_kernel DEPS ${COMMON_KERNEL_DEPS}) -kernel_library(math_kernel DEPS ${MATH_KERNEL_DEPS}) +register_kernels(DEPS ${COMMON_KERNEL_DEPS}) + +# pten sparse kernels add_subdirectory(sparse) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index ab1cb59872a0430b122faa0bcd9816d801314bc4..867af865fe0fd7658b80c819e3ab3f28cab674db 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -25,12 +25,12 @@ template void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); // If T is complex -template >::value || - std::is_same>::value, - bool> = true> +template < + typename T, + typename Context, + std::enable_if_t>::value || + std::is_same>::value, + bool> = true> DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { auto dense_out = pten::Empty(dev_ctx); MetaTensor meta_out(&dense_out); @@ -40,12 +40,12 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { } // If T is not complex -template >::value && - !std::is_same>::value, - bool> = true> +template < + typename T, + typename Context, + std::enable_if_t>::value && + !std::is_same>::value, + bool> = true> DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc index 24371ca7690de6ff45020499a9ca667e42934bae..7303028cd7a16b92d13d74fc5e2f3f1312d7a0f4 100644 --- a/paddle/pten/kernels/cpu/cast_kernel.cc +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -69,9 +69,9 @@ PT_REGISTER_KERNEL(cast, int16_t, bool, uint8_t, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) { + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/cpu/complex_kernel.cc b/paddle/pten/kernels/cpu/complex_kernel.cc index 6cdba15620fe36e23a2e231d32d695712365fa36..e1bbe1ff00ed0a82433d13d762a72c3f860687c0 100644 --- a/paddle/pten/kernels/cpu/complex_kernel.cc +++ b/paddle/pten/kernels/cpu/complex_kernel.cc @@ -25,8 +25,8 @@ PT_REGISTER_KERNEL(conj, CPU, ALL_LAYOUT, pten::ConjKernel, - paddle::platform::complex, - paddle::platform::complex, + pten::dtype::complex, + pten::dtype::complex, float, double, int, diff --git a/paddle/pten/kernels/cpu/concat_kernel.cc b/paddle/pten/kernels/cpu/concat_kernel.cc index c4aed7679bd72c42d1d0b46d3ebf195d1c35298b..a9ecd15e68d875741cfedca5d060ac5cf1a2a26e 100644 --- a/paddle/pten/kernels/cpu/concat_kernel.cc +++ b/paddle/pten/kernels/cpu/concat_kernel.cc @@ -120,6 +120,6 @@ PT_REGISTER_KERNEL(concat, int64_t, int, uint8_t, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/dot_grad_kernel.cc b/paddle/pten/kernels/cpu/dot_grad_kernel.cc index 91202cf836df5dc33269d09d40959d86441df434..2705c0667941c318dfa53dc36cf19eaf585e89cf 100644 --- a/paddle/pten/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_grad_kernel.cc @@ -28,5 +28,5 @@ PT_REGISTER_KERNEL(dot_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index 5cef8d0bdd56d08731d617f0bd9c732fe1688af5..5166d9c061f95fead0c86f6ad029968b1e2e1f77 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -46,8 +46,8 @@ void DotKernel(const Context& dev_ctx, } // namespace pten -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; PT_REGISTER_KERNEL(dot, CPU, diff --git a/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc b/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc index d3d3aa79edb390d682f90c328874bba8f3c77921..002b575341a1601d093680d6854ed2cfc262e788 100644 --- a/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc @@ -134,8 +134,8 @@ PT_REGISTER_KERNEL(add_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(add_double_grad, CPU, @@ -145,8 +145,8 @@ PT_REGISTER_KERNEL(add_double_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(add_triple_grad, CPU, @@ -156,8 +156,8 @@ PT_REGISTER_KERNEL(add_triple_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(subtract_grad, CPU, @@ -167,8 +167,8 @@ PT_REGISTER_KERNEL(subtract_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(subtract_double_grad, CPU, @@ -178,5 +178,5 @@ PT_REGISTER_KERNEL(subtract_double_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/full_kernel.cc b/paddle/pten/kernels/cpu/full_kernel.cc index 919471d86ac5340ec8cc1c25746b99508db35ae4..62e1bbf1d9d9c3cb219f2c96338a53ab20b14082 100644 --- a/paddle/pten/kernels/cpu/full_kernel.cc +++ b/paddle/pten/kernels/cpu/full_kernel.cc @@ -29,10 +29,10 @@ PT_REGISTER_KERNEL(full, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(full_like, CPU, @@ -43,4 +43,4 @@ PT_REGISTER_KERNEL(full_like, int, int64_t, bool, - paddle::platform::float16) {} + pten::dtype::float16) {} diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index d4987e7a3606987ab64449e1346c788431895788..70e90587123fe882279fd5cec37717132abd6f09 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -113,11 +113,11 @@ DEFINE_CPU_ELEMENTWISE_OP(Multiply) } // namespace pten -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 -// using bfloat16 = ::paddle::platform::bfloat16; +// using bfloat16 = ::pten::dtype::bfloat16; PT_REGISTER_KERNEL(add_raw, CPU, ALL_LAYOUT, @@ -166,7 +166,7 @@ PT_REGISTER_KERNEL(sum_raw, bool, float, double, - paddle::platform::float16, + pten::dtype::float16, int, int64_t, complex64, diff --git a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc index 955f2b017b0dccf6bef6d551aaa9f3044ee221cf..fa0fd0c8d4c0a95e15bff89b093b9b182a084cb5 100644 --- a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc @@ -25,8 +25,8 @@ PT_REGISTER_KERNEL(matmul_grad, pten::MatmulGradKernel, float, double, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(matmul_double_grad, CPU, @@ -34,8 +34,8 @@ PT_REGISTER_KERNEL(matmul_double_grad, pten::MatmulDoubleGradKernel, float, double, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(matmul_triple_grad, CPU, @@ -43,5 +43,5 @@ PT_REGISTER_KERNEL(matmul_triple_grad, pten::MatmulTripleGradKernel, float, double, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/matmul_kernel.cc b/paddle/pten/kernels/cpu/matmul_kernel.cc index 51def07d4031f91a7ad4843f1b62af34c8f68d27..46e2c49ab115b50c8b2eb710a51d53b2e3a1ee55 100644 --- a/paddle/pten/kernels/cpu/matmul_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_kernel.cc @@ -26,5 +26,5 @@ PT_REGISTER_KERNEL(matmul, pten::MatmulKernel, float, double, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index bdf9e65f541886900c58163bda4604e7f338c0c0..0882e13a8522b19e6161b6e19b6566c6ccf8cb07 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -23,7 +23,7 @@ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/funcs/eigen/common.h" -#include "paddle/pten/kernels/funcs/transpose.h" +#include "paddle/pten/kernels/funcs/math_function.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" namespace pten { @@ -80,7 +80,7 @@ void ReduceFunctor(const DeviceContext& context, inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims, const std::vector& reduced_dims, - std::vector* perm_axis) { + std::vector* perm_axis) { // check if it's a reduced dim std::vector src_dims_check(src_dims.size(), false); size_t src_size = src_dims.size(); @@ -115,13 +115,13 @@ void GetShuffledInput(const DeviceContext& dev_ctx, pten::DenseTensor* shuffled_input, const std::vector& dims) { DDim shuffled_dims(input.dims()); - std::vector perm_axis(input.dims().size()); + std::vector perm_axis(input.dims().size()); GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); shuffled_input->ResizeAndAllocate(shuffled_dims); dev_ctx.template Alloc(shuffled_input); - pten::math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(dev_ctx, input, shuffled_input, perm_axis); } diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index e1a1788815ebfef75ac29e332da3e76f3d2a5d52..03fe240a88b13a3bb38fab72ea6c317aaa7a5ccc 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -45,10 +45,10 @@ PT_REGISTER_KERNEL(empty, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(empty_like, CPU, @@ -61,10 +61,10 @@ PT_REGISTER_KERNEL(empty_like, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PT_REGISTER_KERNEL(empty, @@ -78,9 +78,9 @@ PT_REGISTER_KERNEL(empty, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(empty_like, GPU, @@ -93,8 +93,8 @@ PT_REGISTER_KERNEL(empty_like, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} #endif diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc index cbbf62f1993e2d2ce3999e189685452d0a856e11..7850f1170b832d54b6cfe9e8691888d88abca641 100644 --- a/paddle/pten/kernels/flatten_grad_kernel.cc +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -49,7 +49,7 @@ PT_REGISTER_KERNEL(flatten_grad, ALL_LAYOUT, pten::FlattenGradKernel, float, - paddle::platform::float16, + pten::dtype::float16, double, uint8_t, int8_t, @@ -64,7 +64,7 @@ PT_REGISTER_KERNEL(flatten_grad, ALL_LAYOUT, pten::FlattenGradKernel, float, - paddle::platform::float16, + pten::dtype::float16, int8_t, int, int64_t) {} diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index b0d05803ac351c4169b952495bd4c246ccf8f062..0ae6cd1b9c35efa57b20e8d4f6a07408544e8166 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -76,7 +76,7 @@ PT_REGISTER_KERNEL(flatten, ALL_LAYOUT, pten::FlattenKernel, float, - paddle::platform::float16, + pten::dtype::float16, double, uint8_t, int8_t, @@ -88,7 +88,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, ALL_LAYOUT, pten::FlattenWithXShape, float, - paddle::platform::float16, + pten::dtype::float16, double, uint8_t, int8_t, @@ -102,7 +102,7 @@ PT_REGISTER_KERNEL(flatten, ALL_LAYOUT, pten::FlattenKernel, float, - paddle::platform::float16, + pten::dtype::float16, int8_t, int, int64_t) {} @@ -112,7 +112,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, ALL_LAYOUT, pten::FlattenWithXShape, float, - paddle::platform::float16, + pten::dtype::float16, int8_t, int, int64_t) {} diff --git a/paddle/pten/kernels/funcs/CMakeLists.txt b/paddle/pten/kernels/funcs/CMakeLists.txt index e4dd437629a9b2dd0f476b2a6839a80e83a0a5d9..844464a52dcbfb7908cc5bc7615ce64d88643a6a 100644 --- a/paddle/pten/kernels/funcs/CMakeLists.txt +++ b/paddle/pten/kernels/funcs/CMakeLists.txt @@ -1,12 +1,5 @@ add_subdirectory(eigen) -cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context) -if(WITH_GPU) - nv_library(pten_transpose_gpu SRCS transpose.cu DEPS dense_tensor malloc pten_context) -elseif(WITH_ROCM) - hip_library(pten_transpose_gpu SRCS transpose.cu DEPS dense_tensor malloc pten_context) -endif() - function(math_library TARGET) # math_library is a function to create math library. # The interface is the same as cc_library. @@ -47,10 +40,3 @@ function(math_library TARGET) endfunction() math_library(math_function DEPS blas dense_tensor tensor) -cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) -if(WITH_GPU) - nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) -endif() -if(WITH_ROCM) - hip_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) -endif() diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h index e751f85b50f24bdddb475653e5e706975333242c..c947771900304efcbfbf24be20d51d73566eea98 100644 --- a/paddle/pten/kernels/funcs/common_shape.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/eigen_function.h" namespace pten { namespace funcs { diff --git a/paddle/pten/kernels/funcs/math_function.cc b/paddle/pten/kernels/funcs/math_function.cc index dec89e79565dea863b1f2837334db372ed415522..facb26a552019df6e485c2cdbfb5ddda77dc6be5 100644 --- a/paddle/pten/kernels/funcs/math_function.cc +++ b/paddle/pten/kernels/funcs/math_function.cc @@ -36,12 +36,12 @@ limitations under the License. */ namespace pten { namespace funcs { -using float16 = paddle::platform::float16; +using float16 = pten::dtype::float16; template struct SetConstant; + pten::dtype::float16>; template struct SetConstant; + pten::dtype::bfloat16>; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -50,12 +50,12 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant>; + pten::dtype::complex>; template struct SetConstant>; + pten::dtype::complex>; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -63,15 +63,14 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; +template struct SetConstant>; +template struct SetConstant>; #ifdef PADDLE_WITH_XPU template struct SetConstant; + pten::dtype::float16>; template struct SetConstant; + pten::dtype::bfloat16>; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -80,17 +79,17 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant>; + pten::dtype::complex>; template struct SetConstant>; + pten::dtype::complex>; #endif #define DEFINE_CPU_TRANS(RANK) \ template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ @@ -107,10 +106,26 @@ template struct SetConstant; \ template struct Transpose; \ template struct Transpose, \ + pten::dtype::complex, \ RANK>; \ template struct Transpose, \ + pten::dtype::complex, \ + RANK>; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ + template struct Transpose, \ RANK>; DEFINE_CPU_TRANS(1); @@ -120,41 +135,41 @@ DEFINE_CPU_TRANS(4); DEFINE_CPU_TRANS(5); DEFINE_CPU_TRANS(6); -template -struct TransposeNormal { - void operator()(const paddle::platform::CPUDeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = paddle::framework::stride(in.dims()); - auto out_stride = paddle::framework::stride(out->dims()); - const T* in_ptr = in.data(); - T* out_ptr = out->data(); - - auto transpose_helper = [&](int64_t beg, int64_t end) { - for (int64_t out_idx = beg; out_idx < end; ++out_idx) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - // calculate the input index - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride[i]; - tmp_idx -= coordinate * out_stride[i]; - in_idx += coordinate * in_stride[axis[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; +template +void TransposeNormal::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = paddle::framework::stride(in.dims()); + auto out_stride = paddle::framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; } - }; - transpose_helper(0, out->numel()); - } -}; + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + transpose_helper(0, out->numel()); +} // define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal; \ + template struct TransposeNormal -DEFINE_CPU_TRANS_NORMAL(paddle::platform::float16); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(pten::dtype::float16); +DEFINE_CPU_TRANS_NORMAL(pten::dtype::bfloat16); DEFINE_CPU_TRANS_NORMAL(float); DEFINE_CPU_TRANS_NORMAL(double); DEFINE_CPU_TRANS_NORMAL(int); @@ -163,8 +178,8 @@ DEFINE_CPU_TRANS_NORMAL(bool); DEFINE_CPU_TRANS_NORMAL(int16_t); DEFINE_CPU_TRANS_NORMAL(uint8_t); DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_CPU_TRANS_NORMAL(pten::dtype::complex); +DEFINE_CPU_TRANS_NORMAL(pten::dtype::complex); struct TensorSetConstantCPU { TensorSetConstantCPU(paddle::framework::Tensor* tensor, float value) @@ -343,7 +358,7 @@ struct ElementwiseAddTo { }; template struct ElementwiseAddTo; + pten::dtype::float16>; } // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function.cu b/paddle/pten/kernels/funcs/math_function.cu index 8ed72dbd1c1278d320ccebfd7463e83f7c101065..d019a382d77173185e9ce0a7e76d7c6ae5fcf773 100644 --- a/paddle/pten/kernels/funcs/math_function.cu +++ b/paddle/pten/kernels/funcs/math_function.cu @@ -27,13 +27,13 @@ limitations under the License. */ namespace pten { namespace funcs { -using float16 = paddle::platform::float16; -using bfloat16 = paddle::platform::bfloat16; +using float16 = pten::dtype::float16; +using bfloat16 = pten::dtype::bfloat16; template struct SetConstant; + pten::dtype::float16>; template struct SetConstant; + pten::dtype::bfloat16>; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -42,12 +42,12 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant>; + pten::dtype::complex>; template struct SetConstant>; + pten::dtype::complex>; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -55,14 +55,13 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; +template struct SetConstant>; +template struct SetConstant>; template struct SetConstant; + pten::dtype::float16>; template struct SetConstant; + pten::dtype::bfloat16>; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -71,9 +70,9 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant>; + pten::dtype::complex>; template struct SetConstant>; + pten::dtype::complex>; #define DEFINE_GPU_TRANS(RANK) \ template struct Transpose; \ @@ -97,10 +96,24 @@ template struct SetConstant; \ template struct Transpose, \ + pten::dtype::complex, \ RANK>; \ template struct Transpose, \ + pten::dtype::complex, \ + RANK>; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ + template struct Transpose, \ RANK>; DEFINE_GPU_TRANS(1); @@ -133,60 +146,53 @@ __global__ void TransposeNormalKernel(const T* in_ptr, } } -template -struct TransposeNormal { - void operator()(const paddle::platform::CUDADeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = paddle::framework::stride(in.dims()); - auto out_stride = paddle::framework::stride(out->dims()); - auto* in_ptr = in.data(); - auto* out_ptr = out->data(); - - // copy in_stride, out_stride, axis to gpu device - const paddle::platform::CUDAPlace& cuda_place = context.GetPlace(); - paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); - size_t size = 3 * rank * sizeof(int64_t); - auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); - auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); - REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); - REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); - for (int i = 0; i < rank; ++i) { - cpu_buf[i] = in_stride[i]; - cpu_buf[rank + i] = out_stride[i]; - cpu_buf[2 * rank + i] = axis[i]; - } - paddle::memory::Copy( - cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); - REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); - REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); - REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); +template +void TransposeNormal::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = paddle::framework::stride(in.dims()); + auto out_stride = paddle::framework::stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); - const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); - const int MAX_GRID_DIM = - context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; - int64_t elements = in.numel(); - int block_size = (elements >= MAX_BLOCK_DIM) - ? MAX_BLOCK_DIM - : (1 << static_cast(std::log2(elements))); - int grid_size = elements / block_size; - grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; - TransposeNormalKernel<<>>( - in_ptr, - out_ptr, - elements, - in_stride_ptr, - out_stride_ptr, - axis_ptr, - rank); + // copy in_stride, out_stride, axis to gpu device + const paddle::platform::CUDAPlace& cuda_place = context.GetPlace(); + paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); + auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; } -}; + paddle::memory::Copy( + cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, rank); +} // define transpose normal -#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal +#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal; \ + template struct TransposeNormal DEFINE_GPU_TRANS_NORMAL(float16); DEFINE_GPU_TRANS_NORMAL(bfloat16); @@ -198,8 +204,8 @@ DEFINE_GPU_TRANS_NORMAL(bool); DEFINE_GPU_TRANS_NORMAL(int16_t); DEFINE_GPU_TRANS_NORMAL(uint8_t); DEFINE_GPU_TRANS_NORMAL(int8_t); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_GPU_TRANS_NORMAL(pten::dtype::complex); +DEFINE_GPU_TRANS_NORMAL(pten::dtype::complex); struct TensorSetConstantGPU { TensorSetConstantGPU(const paddle::platform::DeviceContext& context, @@ -374,7 +380,7 @@ struct ElementwiseAddTo { }; template struct ElementwiseAddTo; + pten::dtype::float16>; } // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc deleted file mode 100644 index 7d4dc3c7ce8f00fece82e5a27af5347b5d5cfabf..0000000000000000000000000000000000000000 --- a/paddle/pten/kernels/funcs/transpose.cc +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/kernels/funcs/transpose.h" -#include "paddle/pten/backends/cpu/cpu_context.h" -#include "paddle/pten/core/ddim.h" -#include "paddle/pten/core/dense_tensor.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/pten/common/bfloat16.h" -#include "paddle/pten/common/complex.h" -#include "paddle/pten/common/float16.h" - -namespace pten { -namespace math { - -template -struct TransposeNormal { - // for dims >= 7 situation - void operator()(const CPUContext& dev_ctx, - const pten::DenseTensor& in, - pten::DenseTensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = pten::framework::stride(in.dims()); - auto out_stride = pten::framework::stride(out->dims()); - const T* in_ptr = in.data(); - T* out_ptr = dev_ctx.template Alloc(out); - - auto transpose_helper = [&](int64_t beg, int64_t end) { - for (int64_t out_idx = beg; out_idx < end; ++out_idx) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - // calculate the input index - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride[i]; - tmp_idx -= coordinate * out_stride[i]; - in_idx += coordinate * in_stride[axis[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; - } - }; - transpose_helper(0, out->numel()); - } -}; - -// define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal - -DEFINE_CPU_TRANS_NORMAL(bool); -DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(uint8_t); -DEFINE_CPU_TRANS_NORMAL(int16_t); -DEFINE_CPU_TRANS_NORMAL(int32_t); -DEFINE_CPU_TRANS_NORMAL(int64_t); -DEFINE_CPU_TRANS_NORMAL(float); -DEFINE_CPU_TRANS_NORMAL(double); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::float16); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::bfloat16); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); - -} // namespace math -} // namespace pten diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu deleted file mode 100644 index 09baa2c6e023a5f67036306e91df00c47e4b1b1d..0000000000000000000000000000000000000000 --- a/paddle/pten/kernels/funcs/transpose.cu +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/core/ddim.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/funcs/transpose.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/pten/common/bfloat16.h" -#include "paddle/pten/common/complex.h" -#include "paddle/pten/common/float16.h" - -namespace pten { - -namespace math { - -#define REINTERPRET(T, DST_PTR, SRC_PTR) \ - T* DST_PTR = reinterpret_cast(SRC_PTR) - -template -__global__ void TransposeNormalKernel(const T* in_ptr, - T* out_ptr, - int64_t element, - const int64_t* in_stride_ptr, - const int64_t* out_stride_ptr, - const int64_t* axis_ptr, - int rank) { - CUDA_KERNEL_LOOP(out_idx, element) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride_ptr[i]; - tmp_idx -= coordinate * out_stride_ptr[i]; - in_idx += coordinate * in_stride_ptr[axis_ptr[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; - } -} - -template -struct TransposeNormal { - // for dims >= 7 situation - void operator()(const GPUContext& dev_ctx, - const pten::DenseTensor& in, - pten::DenseTensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = pten::framework::stride(in.dims()); - auto out_stride = pten::framework::stride(out->dims()); - auto* in_ptr = in.data(); - T* out_ptr = dev_ctx.template Alloc(out); - - // copy in_stride, out_stride, axis to gpu device - const paddle::platform::CUDAPlace& cuda_place = dev_ctx.GetPlace(); - paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); - size_t size = 3 * rank * sizeof(int64_t); - auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); - auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); - REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); - REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); - for (int i = 0; i < rank; ++i) { - cpu_buf[i] = in_stride[i]; - cpu_buf[rank + i] = out_stride[i]; - cpu_buf[2 * rank + i] = axis[i]; - } - paddle::memory::Copy( - cuda_place, cuda_buf, cpu_place, cpu_buf, size, dev_ctx.stream()); - REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); - REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); - REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); - - const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock(); - const int MAX_GRID_DIM = - dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; - int64_t elements = in.numel(); - int block_size = (elements >= MAX_BLOCK_DIM) - ? MAX_BLOCK_DIM - : (1 << static_cast(std::log2(elements))); - int grid_size = elements / block_size; - grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; - TransposeNormalKernel<<>>( - in_ptr, - out_ptr, - elements, - in_stride_ptr, - out_stride_ptr, - axis_ptr, - rank); - } -}; - -// define transpose normal -#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal - -DEFINE_GPU_TRANS_NORMAL(bool); -DEFINE_GPU_TRANS_NORMAL(int8_t); -DEFINE_GPU_TRANS_NORMAL(uint8_t); -DEFINE_GPU_TRANS_NORMAL(int16_t); -DEFINE_GPU_TRANS_NORMAL(int32_t); -DEFINE_GPU_TRANS_NORMAL(int64_t); -DEFINE_GPU_TRANS_NORMAL(float); -DEFINE_GPU_TRANS_NORMAL(double); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::float16); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::bfloat16); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); - -} // namespace math -} // namespace pten diff --git a/paddle/pten/kernels/funcs/transpose.h b/paddle/pten/kernels/funcs/transpose.h deleted file mode 100644 index 0cb2b4289fe6ef9f383eb20d241fb71430fb3634..0000000000000000000000000000000000000000 --- a/paddle/pten/kernels/funcs/transpose.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/pten/core/ddim.h" -#include "paddle/pten/core/dense_tensor.h" - -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/pten/kernels/funcs/eigen/common.h" - -namespace pten { - -namespace math { - -template -struct TransposeNormal { - // for dims >= 7 situation - void operator()(const DeviceContext& dev_ctx, - const pten::DenseTensor& in, - pten::DenseTensor* out, - const std::vector& axis); -}; - -template -struct Transpose { - void operator()(const DeviceContext& dev_ctx, - const DenseTensor& in, - DenseTensor* out, - const std::vector& axis) { - Eigen::array permute; - for (int i = 0; i < Rank; i++) { - permute[i] = axis[i]; - } - auto eigen_in = pten::EigenTensor::From(in); - auto eigen_out = pten::EigenTensor::From(*out); - auto* dev = dev_ctx.eigen_device(); - // use 32bit index to speed up computation - bool use_32bit_index = eigen_out.size() < Eigen::NumTraits::highest(); - bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace()); - if (use_32bit_index && is_gpu_place) { - To32BitIndex(eigen_out).device(*dev) = - To32BitIndex(eigen_in).shuffle(permute); - } else { - eigen_out.device(*dev) = eigen_in.shuffle(permute); - } - } -}; - -} // namespace math -} // namespace pten diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 8fba5bc6fba659d194cc29eba0de773600b21c58..006b4f3687c44bcefe5f710a113abb9cd5a3ffc9 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -72,16 +72,16 @@ void CastKernel(const Context& dev_ctx, int16_t, \ bool, \ uint8_t, \ - paddle::platform::float16, \ - paddle::platform::complex, \ - paddle::platform::complex, \ + pten::dtype::float16, \ + pten::dtype::complex, \ + pten::dtype::complex, \ ##__VA_ARGS__) { \ kernel->OutputAt(0).SetDataType( \ paddle::experimental::DataType::UNDEFINED); \ } #if !defined(PADDLE_WITH_HIP) -PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16) +PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, pten::dtype::bfloat16) #else PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) #endif diff --git a/paddle/pten/kernels/gpu/complex_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu index cd9c95de2ab017042714f347064c6d9ad564820e..f52159b987b4f03c16868d6aa8f02e624cb5b988 100644 --- a/paddle/pten/kernels/gpu/complex_kernel.cu +++ b/paddle/pten/kernels/gpu/complex_kernel.cu @@ -25,9 +25,9 @@ PT_REGISTER_KERNEL(conj, GPU, ALL_LAYOUT, pten::ConjKernel, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex, + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex, float, double, int, diff --git a/paddle/pten/kernels/gpu/concat_kernel.cu b/paddle/pten/kernels/gpu/concat_kernel.cu index 093af0d54f6eb36633e52f9aef90068275dea3dd..784812861fd184d525a58efd264c1c9a5afbf64f 100644 --- a/paddle/pten/kernels/gpu/concat_kernel.cu +++ b/paddle/pten/kernels/gpu/concat_kernel.cu @@ -120,7 +120,7 @@ PT_REGISTER_KERNEL(concat, int64_t, int, uint8_t, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/gpu/dot_grad_kernel.cu b/paddle/pten/kernels/gpu/dot_grad_kernel.cu index 90c37ea1b006120b92d018f6e3e3aa13d72a79f1..a371daf79157c46527f0a0b708519b92d0249b2c 100644 --- a/paddle/pten/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_grad_kernel.cu @@ -28,5 +28,5 @@ PT_REGISTER_KERNEL(dot_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 24bd034fb15a0df3e19c60bfebb56f90c72da75a..7881dfa9b1bda1e2b3cc290cc638ce7b12ed0541 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -49,8 +49,8 @@ void DotKernel(const Context& dev_ctx, } // namespace pten -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; PT_REGISTER_KERNEL(dot, GPU, diff --git a/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu b/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu index f1b3f53b809dfcbf44359d50a2ee97be599c988b..3be02106afef3435c705f1056d4631e358d0cb40 100644 --- a/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu @@ -128,9 +128,9 @@ PT_REGISTER_KERNEL(add_grad, double, int, int64_t, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(add_double_grad, GPU, @@ -140,9 +140,9 @@ PT_REGISTER_KERNEL(add_double_grad, double, int, int64_t, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(add_triple_grad, GPU, @@ -152,9 +152,9 @@ PT_REGISTER_KERNEL(add_triple_grad, double, int, int64_t, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(subtract_grad, GPU, @@ -164,9 +164,9 @@ PT_REGISTER_KERNEL(subtract_grad, double, int, int64_t, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(subtract_double_grad, GPU, @@ -176,6 +176,6 @@ PT_REGISTER_KERNEL(subtract_double_grad, double, int, int64_t, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/gpu/expand_grad_kernel.cu b/paddle/pten/kernels/gpu/expand_grad_kernel.cu index 49f8718c483ce0e0a7bbd126edb01560c4d9fbfd..b4e89de892449779488bf8ec9a71a5406eced5af 100644 --- a/paddle/pten/kernels/gpu/expand_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/expand_grad_kernel.cu @@ -24,6 +24,6 @@ PT_REGISTER_KERNEL(expand_grad, pten::ExpandGradKernel, float, double, - paddle::platform::float16, + pten::dtype::float16, int, int64_t) {} diff --git a/paddle/pten/kernels/gpu/expand_kernel.cu b/paddle/pten/kernels/gpu/expand_kernel.cu index e0d8536d6ab34ce0b0d9e5790477cfe7305f9e43..455eb6ef14cb5192675a490e4bb51f462d6f7f1b 100644 --- a/paddle/pten/kernels/gpu/expand_kernel.cu +++ b/paddle/pten/kernels/gpu/expand_kernel.cu @@ -25,7 +25,7 @@ PT_REGISTER_KERNEL(expand, pten::ExpandKernel, float, double, - paddle::platform::float16, + pten::dtype::float16, int, int64_t, bool) {} diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu index 6ea1f1282ddc1052afbeb32880443049b4dc44ed..7f600fb3134721dac12829390e95f08a3c4533ea 100644 --- a/paddle/pten/kernels/gpu/full_kernel.cu +++ b/paddle/pten/kernels/gpu/full_kernel.cu @@ -106,9 +106,9 @@ PT_REGISTER_KERNEL(full, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(full_like, GPU, @@ -119,4 +119,4 @@ PT_REGISTER_KERNEL(full_like, int, int64_t, bool, - paddle::platform::float16) {} + pten::dtype::float16) {} diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 1a549087e4221b0f74362fe9fe653d5fbe91c738..387defc9f418699238acea74b2a926758d266d46 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -91,9 +91,9 @@ DEFINE_CUDA_ELEMENTWISE_OP(Divide) } // namespace pten -using float16 = paddle::platform::float16; -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; +using float16 = pten::dtype::float16; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; PT_REGISTER_KERNEL(add_raw, GPU, diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index 7df99260aa1614a29325ed1d0834400566e28139..306fe5540e37220920da25a33055202b4eeb0ebf 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -25,10 +25,10 @@ PT_REGISTER_KERNEL(matmul_grad, pten::MatmulGradKernel, float, double, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(matmul_double_grad, GPU, @@ -36,9 +36,9 @@ PT_REGISTER_KERNEL(matmul_double_grad, pten::MatmulDoubleGradKernel, float, double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(matmul_triple_grad, GPU, @@ -46,6 +46,6 @@ PT_REGISTER_KERNEL(matmul_triple_grad, pten::MatmulTripleGradKernel, float, double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index b365581e949c103be511e4849a45b4fd9a024f77..ebb17963ab0df088c29d5960ff3bf9a4365c982e 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -26,7 +26,7 @@ PT_REGISTER_KERNEL(matmul, pten::MatmulKernel, float, double, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/gpu/norm_grad_kernel.cu b/paddle/pten/kernels/gpu/norm_grad_kernel.cu index 35701d349ad3c39ab35386b8cd7f8f15746c7e72..4c2cc5347d8e64a61e68fd80874954ed085d5876 100644 --- a/paddle/pten/kernels/gpu/norm_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/norm_grad_kernel.cu @@ -117,4 +117,4 @@ PT_REGISTER_KERNEL(norm_grad, pten::NormGradKernel, float, double, - paddle::platform::float16) {} + pten::dtype::float16) {} diff --git a/paddle/pten/kernels/gpu/norm_kernel.cu b/paddle/pten/kernels/gpu/norm_kernel.cu index 6e2ee65231973079a42131f55e7eec8ba753755f..66383f53b5853abdfdd8443cfe3fcdd234af5802 100644 --- a/paddle/pten/kernels/gpu/norm_kernel.cu +++ b/paddle/pten/kernels/gpu/norm_kernel.cu @@ -130,4 +130,4 @@ PT_REGISTER_KERNEL(norm, pten::NormKernel, float, double, - paddle::platform::float16) {} + pten::dtype::float16) {} diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index c83662c03c7b0e314c1842a8b243d5681211b760..bca8d455623c0c7dceb3559f19e8a7f607321611 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -1004,15 +1004,14 @@ template class ReduceOp, typename TransformOp> -static - typename std::enable_if::value, - void>::type - CubTensorReduceImpl(const Tx* x_data, - Ty* y_data, - const TransformOp& transform, - int reduce_num, - const paddle::platform::Place& place, - gpuStream_t stream) { +static typename std::enable_if::value, + void>::type +CubTensorReduceImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const paddle::platform::Place& place, + gpuStream_t stream) { auto reducer = ReduceOp(); cub::TransformInputIterator trans_x(x_data, transform); @@ -1048,15 +1047,14 @@ template class ReduceOp, typename TransformOp> -static - typename std::enable_if::value, - void>::type - CubTensorReduceImpl(const Tx* x_data, - Ty* y_data, - const TransformOp& transform, - int reduce_num, - const paddle::platform::Place& place, - gpuStream_t stream) { +static typename std::enable_if::value, + void>::type +CubTensorReduceImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const paddle::platform::Place& place, + gpuStream_t stream) { PADDLE_THROW(pten::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } @@ -1099,7 +1097,7 @@ void TensorReduceImpl(const pten::GPUContext& dev_ctx, } config.SetOutputData(y_data, x.place(), &tmp); - constexpr bool kIsTxFP16 = std::is_same::value; + constexpr bool kIsTxFP16 = std::is_same::value; bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; if (use_cub_reduce) { CubTensorReduceImpl( diff --git a/paddle/pten/kernels/gpu/sign_kernel.cu b/paddle/pten/kernels/gpu/sign_kernel.cu index 2a96ff653035a0fd8f48b2bfbaa0907a04a2054a..d479d6a2b2d5187eda0a2a13b15ead0cf6201e48 100644 --- a/paddle/pten/kernels/gpu/sign_kernel.cu +++ b/paddle/pten/kernels/gpu/sign_kernel.cu @@ -21,7 +21,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/pten/common/float16.h" -using float16 = paddle::platform::float16; +using float16 = pten::dtype::float16; PT_REGISTER_KERNEL( sign, GPU, ALL_LAYOUT, pten::SignKernel, float, double, float16) {} diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 4fbe9f34e5b4d9e683db4f623fe6195f21469f8d..4c018e34e2ebc2d3d4f61e339c7ca3a010d10c2b 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -47,10 +47,9 @@ void FullLikeKernel(const Context& dev_ctx, auto value = val.to(); using CommonType = typename std::common_type< float, - typename std::conditional< - std::is_same::value, - float, - T>::type>::type; + typename std::conditional::value, + float, + T>::type>::type; auto common_type_value = static_cast(value); diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index b346acb6e25c6ab2936980034794cf750a248468..f84187484b194cab40d3417ee318b19636073135 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -90,7 +90,7 @@ static DenseTensor FoldHeadAndLastDims(const Context& dev_ctx, DenseTensor output = EmptyLike(dev_ctx, input); output.Resize({in_dims[1], in_dims[0], in_dims[2]}); std::vector axis = {1, 0, 2}; - math::Transpose trans; + funcs::Transpose trans; trans(dev_ctx, input, &output, axis); output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); return output; diff --git a/paddle/pten/kernels/math_kernel.cc b/paddle/pten/kernels/math_kernel.cc index 29a2b48fa7c4f12558c47dc1d6d87c758f0c492e..2356fb34bf1b7148d69ce154e4bc38b37aaf3eef 100644 --- a/paddle/pten/kernels/math_kernel.cc +++ b/paddle/pten/kernels/math_kernel.cc @@ -78,8 +78,8 @@ void MultiplyKernel(const Context& dev_ctx, } // namespace pten -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; PT_REGISTER_KERNEL( mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {} @@ -91,7 +91,7 @@ PT_REGISTER_KERNEL(sum, bool, float, double, - paddle::platform::float16, + pten::dtype::float16, int, int64_t, complex64, @@ -149,7 +149,7 @@ PT_REGISTER_KERNEL(mean, float, double, bool, - paddle::platform::float16) {} + pten::dtype::float16) {} PT_REGISTER_KERNEL(sum, GPU, ALL_LAYOUT, @@ -157,7 +157,7 @@ PT_REGISTER_KERNEL(sum, bool, float, double, - paddle::platform::float16, + pten::dtype::float16, int, int64_t, complex64, @@ -172,7 +172,7 @@ PT_REGISTER_KERNEL(add, double, int, int64_t, - paddle::platform::float16, + pten::dtype::float16, complex64, complex128) {} PT_REGISTER_KERNEL(subtract, @@ -183,7 +183,7 @@ PT_REGISTER_KERNEL(subtract, double, int, int64_t, - paddle::platform::float16, + pten::dtype::float16, complex64, complex128) {} PT_REGISTER_KERNEL(divide, @@ -194,7 +194,7 @@ PT_REGISTER_KERNEL(divide, double, int, int64_t, - paddle::platform::float16, + pten::dtype::float16, complex64, complex128) {} PT_REGISTER_KERNEL(multiply, @@ -206,7 +206,7 @@ PT_REGISTER_KERNEL(multiply, int, int64_t, bool, - paddle::platform::float16, + pten::dtype::float16, complex64, complex128) {} #endif diff --git a/paddle/pten/kernels/transfer_layout_kernel.cc b/paddle/pten/kernels/transfer_layout_kernel.cc index c21ab7c304d9e7b1253d2ed3157119662089615a..281d40221dc9597e4a61773c86e86930f1a08649 100644 --- a/paddle/pten/kernels/transfer_layout_kernel.cc +++ b/paddle/pten/kernels/transfer_layout_kernel.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/funcs/transpose.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace pten { @@ -42,7 +42,7 @@ void CastDataLayout(const Context& dev_ctx, const DenseTensor& x, const std::vector& axis, DenseTensor* out) { - math::Transpose trans4; + funcs::Transpose trans4; trans4(dev_ctx, x, out, axis); } diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h index f4f9ec3c84758aa48d0253d1a6661d4766783da6..5403f5932ea18abd61cb8e53ed58d921b6cc7838 100644 --- a/paddle/pten/tests/api/scale_api.h +++ b/paddle/pten/tests/api/scale_api.h @@ -162,7 +162,7 @@ static void ScaleGPU(DataType kernel_dtype, break; } case pten::DataType::FLOAT16: { - pten::ScaleKernel( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } diff --git a/paddle/pten/tests/kernels/CMakeLists.txt b/paddle/pten/tests/kernels/CMakeLists.txt index 15a1cab5f0dd473498ebb23e564ce88400af9713..9682e063471dfac551aa2c844506878f1c97ef46 100644 --- a/paddle/pten/tests/kernels/CMakeLists.txt +++ b/paddle/pten/tests/kernels/CMakeLists.txt @@ -13,3 +13,11 @@ cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS pten pten_api_utils) cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS pten pten_api_utils) cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS pten pten_api_utils) cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS pten pten_api_utils) + +cc_test(test_math_function SRCS test_math_function.cc DEPS math_function) +if(WITH_GPU) + nv_test(test_math_function_gpu SRCS test_math_function.cu DEPS math_function) +endif() +if(WITH_ROCM) + hip_test(test_math_function_gpu SRCS test_math_function.cu DEPS math_function) +endif() diff --git a/paddle/pten/kernels/funcs/math_function_test.cc b/paddle/pten/tests/kernels/test_math_function.cc similarity index 99% rename from paddle/pten/kernels/funcs/math_function_test.cc rename to paddle/pten/tests/kernels/test_math_function.cc index 6ef8c6b689d2ceb47d1d2b61e4f49a77463b63ee..0d53ff6c637ba41799c68b8d0a6715b228cf2d62 100644 --- a/paddle/pten/kernels/funcs/math_function_test.cc +++ b/paddle/pten/tests/kernels/test_math_function.cc @@ -11,9 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pten/kernels/funcs/math_function.h" + #include "gtest/gtest.h" #include "paddle/fluid/operators/math/blas.h" +#include "paddle/pten/kernels/funcs/math_function.h" + +namespace pten { +namespace tests { template inline paddle::operators::math::BlasT @@ -348,3 +352,6 @@ TEST(math_function, gemm_warp) { GemmWarpTest(8, 5, 6, 1.0, 0.0); GemmWarpTest(8, 5, 6, 2.0, 1.0); } + +} // namespace tests +} // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function_test.cu b/paddle/pten/tests/kernels/test_math_function.cu similarity index 85% rename from paddle/pten/kernels/funcs/math_function_test.cu rename to paddle/pten/tests/kernels/test_math_function.cu index 87f11c47a4433b66576d7c1fd14dcffaf3bbdc77..69ea874408ec03503d770ffc5dde0f9a771be365 100644 --- a/paddle/pten/kernels/funcs/math_function_test.cu +++ b/paddle/pten/tests/kernels/test_math_function.cu @@ -11,12 +11,16 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "gtest/gtest.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/kernels/funcs/math_function.h" -void fill_fp16_data(paddle::platform::float16* in_ptr, +namespace pten { +namespace tests { + +void fill_fp16_data(pten::dtype::float16* in_ptr, size_t size, const std::vector& data) { PADDLE_ENFORCE_EQ( @@ -28,7 +32,7 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size, data.size())); for (size_t i = 0; i < data.size(); ++i) { - in_ptr[i] = paddle::platform::float16(data[i]); + in_ptr[i] = pten::dtype::float16(data[i]); } } @@ -95,27 +99,26 @@ TEST(math_function, notrans_mul_trans_fp16) { return; } - paddle::platform::float16* input1_ptr = - input1.mutable_data({2, 3}, cpu_place); + pten::dtype::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); - out_gpu.mutable_data({2, 2}, gpu_place); + out_gpu.mutable_data({2, 2}, gpu_place); - GetBlas(context).MatMul( - input1_gpu, - false, - input2_gpu, - true, - paddle::platform::float16(1), - &out_gpu, - paddle::platform::float16(0)); + GetBlas(context).MatMul(input1_gpu, + false, + input2_gpu, + true, + pten::dtype::float16(1), + &out_gpu, + pten::dtype::float16(0)); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); - paddle::platform::float16* out_ptr = out.data(); + pten::dtype::float16* out_ptr = out.data(); context.Wait(); EXPECT_EQ(static_cast(out_ptr[0]), 5); EXPECT_EQ(static_cast(out_ptr[1]), 14); @@ -185,27 +188,26 @@ TEST(math_function, trans_mul_notrans_fp16) { return; } - paddle::platform::float16* input1_ptr = - input1.mutable_data({2, 3}, cpu_place); + pten::dtype::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); - out_gpu.mutable_data({3, 3}, gpu_place); + out_gpu.mutable_data({3, 3}, gpu_place); - GetBlas(context).MatMul( - input1_gpu, - true, - input2_gpu, - false, - paddle::platform::float16(1), - &out_gpu, - paddle::platform::float16(0)); + GetBlas(context).MatMul(input1_gpu, + true, + input2_gpu, + false, + pten::dtype::float16(1), + &out_gpu, + pten::dtype::float16(0)); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); - paddle::platform::float16* out_ptr = out.data(); + pten::dtype::float16* out_ptr = out.data(); context.Wait(); EXPECT_EQ(static_cast(out_ptr[0]), 9); EXPECT_EQ(static_cast(out_ptr[1]), 12); @@ -300,37 +302,37 @@ TEST(math_function, gemm_notrans_cublas_fp16) { int m = 2; int n = 3; int k = 3; - paddle::platform::float16* input1_ptr = - input1.mutable_data({2, 3}, cpu_place); + pten::dtype::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - paddle::platform::float16* input2_ptr = - input2.mutable_data({3, 4}, cpu_place); + pten::dtype::float16* input2_ptr = + input2.mutable_data({3, 4}, cpu_place); fill_fp16_data( input2_ptr, input2.numel(), {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); - paddle::platform::float16* input3_ptr = - input3.mutable_data({2, 4}, cpu_place); + pten::dtype::float16* input3_ptr = + input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu); - paddle::platform::float16* a = input1_gpu.data(); - paddle::platform::float16* b = input2_gpu.data(); - paddle::platform::float16* c = - input3_gpu.mutable_data(gpu_place); + pten::dtype::float16* a = input1_gpu.data(); + pten::dtype::float16* b = input2_gpu.data(); + pten::dtype::float16* c = + input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM( + GetBlas(context).GEMM( false, false, m, n, k, - static_cast(1), + static_cast(1), a, 3, b + 1, 4, - static_cast(1), + static_cast(1), c + 1, 4); @@ -429,37 +431,37 @@ TEST(math_function, gemm_trans_cublas_fp16) { int m = 2; int n = 3; int k = 3; - paddle::platform::float16* input1_ptr = - input1.mutable_data({2, 3}, cpu_place); + pten::dtype::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - paddle::platform::float16* input2_ptr = - input2.mutable_data({4, 3}, cpu_place); + pten::dtype::float16* input2_ptr = + input2.mutable_data({4, 3}, cpu_place); fill_fp16_data( input2_ptr, input2.numel(), {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); - paddle::platform::float16* input3_ptr = - input3.mutable_data({2, 4}, cpu_place); + pten::dtype::float16* input3_ptr = + input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu); - paddle::platform::float16* a = input1_gpu.data(); - paddle::platform::float16* b = input2_gpu.data(); - paddle::platform::float16* c = - input3_gpu.mutable_data(gpu_place); + pten::dtype::float16* a = input1_gpu.data(); + pten::dtype::float16* b = input2_gpu.data(); + pten::dtype::float16* c = + input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM( + GetBlas(context).GEMM( false, true, m, n, k, - static_cast(1), + static_cast(1), a, 3, b + 3, 3, - static_cast(1), + static_cast(1), c + 1, 4); @@ -547,3 +549,6 @@ TEST(math_function, gemv) { GemvTest(3, 13, true); GemvTest(3, 13, true); } + +} // namespace tests +} // namespace pten