diff --git a/paddle/fluid/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h similarity index 50% rename from paddle/fluid/platform/details/device_ptr_cast.h rename to paddle/fluid/platform/details/cuda_transform_iterator_cast.h index 1c502a19c056c7fe434e68d568a0f59bf6315b95..06afc44c257bbeb0729323e1a42e1eead23ff075 100644 --- a/paddle/fluid/platform/details/device_ptr_cast.h +++ b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h @@ -18,16 +18,22 @@ limitations under the License. */ #error device_ptr_cast must be include by .cu file #endif -#include +#include // For std::remove_pointer and std::is_pointer. + +#include "thrust/device_ptr.h" namespace paddle { namespace platform { namespace details { + +// PointerToThrustDevicePtr has two speicalizations, one casts a (CUDA +// device) pointer into thrust::device_ptr, the other keeps rest types +// un-casted. template -struct DevicePtrCast; +struct PointerToThrustDevicePtr; template -struct DevicePtrCast { +struct PointerToThrustDevicePtr { using ELEM = typename std::remove_pointer::type; using RTYPE = thrust::device_ptr; @@ -37,17 +43,26 @@ struct DevicePtrCast { }; template -struct DevicePtrCast { +struct PointerToThrustDevicePtr { using RTYPE = T; inline RTYPE operator()(RTYPE it) const { return it; } }; -// Cast T to thrust::device_ptr if T is a pointer. -// Otherwise, e.g., T is a iterator, return T itself. +// CastToCUDATransformIterator casts a pointer to thrust::device_ptr +// so it could be used as the iterator of thrust::transform. It +// doesn't cast other types. +// +// We need CastToCUDATransformIterator because it is often that we +// want to use device memory pointers as transform iterators, e.g., to +// transform a block of float32 to float16. In this case, we want +// CastToCUDATransformIterator to cast float16/32 pointers to +// thrust::device_ptr, otherwise they cannot work as the iterator +// required by thrust::transform. At the same time, we don't want to +// cast thrust::device_ptr to thrust::device_ptr repeatedly. template -auto DevPtrCast(T t) -> - typename DevicePtrCast::value>::RTYPE { - DevicePtrCast::value> cast; +auto CastToCUDATransformIterator(T t) -> + typename PointerToThrustDevicePtr::value>::RTYPE { + PointerToThrustDevicePtr::value> cast; return cast(t); } diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 917c48b47f8d70cd821d45dfbc6bafa494710ffa..7877d3e41c1c993662f5d91b263cbcb71db74c36 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -14,29 +14,44 @@ limitations under the License. */ #pragma once +#include +#include + #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/place.h" -#include -#include #ifdef __NVCC__ #include #include -#include "paddle/fluid/platform/details/device_ptr_cast.h" +#include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h" #endif namespace paddle { namespace platform { -// Transform on host or device. It provides the same API in std library. +// Transform applys a unary or a binary functor on each element in a +// range defined by a pair of iterators. +// +// - The specialization for CPU calls std::transform. +// - The specialization for CUDA calls thrust::tranform. +// +// NOTE: We need to define InputIter and OutputIter defined as +// different types, because the InputIter points op's inputs and +// OutputIter pints to op's outputs. +// +// NOTE: We don't assume that InputIter to be const InputType* and +// OutputIter to be OutputType*, because we might use a iterator +// class, paddle::fluid::operators::RowwiseTRansformIterator. template struct Transform { + // The unary version. template void operator()(const DeviceContext& context, InputIter first, InputIter last, OutputIter result, UnaryOperation op); + // The binary version. template void operator()(const DeviceContext& context, InputIter1 first1, @@ -70,8 +85,9 @@ struct Transform { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); thrust::transform(thrust::cuda::par.on(context.stream()), - details::DevPtrCast(first), details::DevPtrCast(last), - details::DevPtrCast(result), op); + details::CastToCUDATransformIterator(first), + details::CastToCUDATransformIterator(last), + details::CastToCUDATransformIterator(result), op); } template { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); thrust::transform(thrust::cuda::par.on(context.stream()), - details::DevPtrCast(first1), details::DevPtrCast(last1), - details::DevPtrCast(first2), details::DevPtrCast(result), - op); + details::CastToCUDATransformIterator(first1), + details::CastToCUDATransformIterator(last1), + details::CastToCUDATransformIterator(first2), + details::CastToCUDATransformIterator(result), op); } }; #endif