Unverified · Commit 64f1485a · Authored by: Zhang Ting · Committed by: GitHub

replace implementation with cuda kernel (#39795)

Parent: bbe5228c
@@ -36,6 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/functors.h"
 
 namespace paddle {
 namespace operators {
@@ -270,32 +271,38 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
                                 const Tensor& mask, int64_t size,
                                 Tensor* grad_x, bool is_test = false) {
   using MT = typename details::MPTypeTrait<T>::Type;
-  auto dX = EigenVector<T>::Flatten(*grad_x);
-  auto dY = EigenVector<T>::Flatten(grad_y);
-
-  auto& place = *dev_ctx.eigen_device();
+  auto stream = dev_ctx.stream();
+  MT factor;
   if (is_test) {
     if (dropout_implementation == "upscale_in_train") {
-      dX.device(place) = static_cast<T>(1) * dY;
+      factor = static_cast<MT>(1.0f);
     } else {
-      dX.device(place) = dY * static_cast<T>(1.0f - dropout_prob);
+      factor = static_cast<MT>(1.0f - dropout_prob);
     }
+    std::vector<const framework::Tensor*> ins = {&grad_y};
+    std::vector<framework::Tensor*> outs = {grad_x};
+    auto functor = phi::funcs::ScaleFunctor<T>(factor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                              &outs, functor);
   } else {
-    auto M = EigenVector<uint8_t>::Flatten(mask);
+    std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
+    std::vector<framework::Tensor*> outs = {grad_x};
     if (dropout_implementation == "upscale_in_train") {
       if (dropout_prob == 1.0f) {
-        dX.device(place) = static_cast<T>(0) * dY;
+#ifdef PADDLE_WITH_HIP
+        hipMemset(grad_x->data<T>(), 0, size * sizeof(T));
+#else
+        cudaMemset(grad_x->data<T>(), 0, size * sizeof(T));
+#endif
       } else {
-        auto factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
-        auto stream = dev_ctx.stream();
-        std::vector<const framework::Tensor*> ins = {&grad_y, &mask};
-        std::vector<framework::Tensor*> outs = {grad_x};
-        auto functor = CudaDropoutGradFunctor<T, uint8_t>(factor);
+        factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
         paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
-            dev_ctx, ins, &outs, functor);
+            dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
       }
     } else {
-      dX.device(place) = dY * M.cast<T>();
+      factor = static_cast<MT>(1.0f);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
+          dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
     }
   }
 }
......
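Note: CudaDropoutGradFunctor is defined elsewhere in this file and is not shown in the diff. As a hedged sketch only (the name below and everything inside it are assumptions inferred from the call sites above, not code from this commit), a functor of the shape consumed by LaunchSameDimsElementwiseCudaKernel would multiply the upstream gradient by the dropout mask and the scaling factor, doing the arithmetic in the wider type MT:

// Sketch only -- not part of this commit. Models what the dropout-grad functor
// passed to LaunchSameDimsElementwiseCudaKernel is assumed to compute:
// dx = dy * mask * factor, accumulated in the higher-precision type MT.
template <typename T, typename MaskType>
struct DropoutGradFunctorSketch {
  using MT = typename details::MPTypeTrait<T>::Type;
  explicit DropoutGradFunctorSketch(const MT factor) : factor_(factor) {}

  HOSTDEVICE inline T operator()(const T dout, const MaskType mask) const {
    return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
                          factor_);
  }

 private:
  MT factor_;
};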
@@ -38,12 +38,15 @@ struct AddGradFunctor {
 
 template <typename T>
 struct ScaleFunctor {
-  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
+  using MT = typename paddle::operators::details::MPTypeTrait<T>::Type;
+  explicit ScaleFunctor(const MT coeff) : coeff_(coeff) {}
 
-  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
+  inline HOSTDEVICE T operator()(T ele) {
+    return static_cast<T>(static_cast<MT>(ele) * coeff_);
+  }
 
  private:
-  T coeff_;
+  MT coeff_;
 };
 
 template <typename T>
......
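For context on the ScaleFunctor change above: MPTypeTrait promotes low-precision element types to a wider compute type, so the coefficient is now stored and multiplied in that wider type and only the result is cast back to T. A minimal usage sketch, assuming T = paddle::platform::float16 so that MT resolves to float; keep_prob, in, and out are placeholder names, not identifiers from this commit:

// Illustrative only (assumes T = platform::float16, so MT = float).
// The coefficient is held as a float and the multiply happens in float.
float keep_prob = 0.9f;
phi::funcs::ScaleFunctor<paddle::platform::float16> scale(keep_prob);
paddle::platform::float16 in(0.5f);
paddle::platform::float16 out = scale(in);  // float16(float(in) * keep_prob)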