未验证 提交 9e06a641 编写于 作者: R ronnywang 提交者: GitHub

[ROCM] fix softmax_with_cross_entropy_op (#31982)

上级 4490e8af
...@@ -66,18 +66,23 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> { ...@@ -66,18 +66,23 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
int batch_size = prob->dims()[0]; int batch_size = prob->dims()[0];
int class_num = prob->dims()[1]; int class_num = prob->dims()[1];
#ifdef __HIPCC__
constexpr int kMaxBlockDim = 256;
#else
constexpr int kMaxBlockDim = 512;
#endif
if (softLabel) { if (softLabel) {
const T* label_data = labels->data<T>(); const T* label_data = labels->data<T>();
int block = class_num > 512 int block = class_num > kMaxBlockDim
? 512 ? kMaxBlockDim
: pow(2, static_cast<int>(std::log2(class_num))); : pow(2, static_cast<int>(std::log2(class_num)));
SoftCrossEntropyKernel<T><<<batch_size, block, 0, ctx.stream()>>>( SoftCrossEntropyKernel<T><<<batch_size, block, 0, ctx.stream()>>>(
loss_data, prob_data, label_data, class_num); loss_data, prob_data, label_data, class_num);
} else { } else {
const int64_t* label_data = labels->data<int64_t>(); const int64_t* label_data = labels->data<int64_t>();
int block = 512; int block = kMaxBlockDim;
int grid = (batch_size + block - 1) / block; int grid = (batch_size + block - 1) / block;
CrossEntropyKernel<T><<<grid, block, 0, ctx.stream()>>>( CrossEntropyKernel<T><<<grid, block, 0, ctx.stream()>>>(
loss_data, prob_data, label_data, batch_size, class_num, loss_data, prob_data, label_data, batch_size, class_num,
......
...@@ -54,10 +54,11 @@ void SoftmaxCUDNNFunctor<T>::operator()( ...@@ -54,10 +54,11 @@ void SoftmaxCUDNNFunctor<T>::operator()(
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
miopenTensorDescriptor_t cudnn_y_desc = miopenTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2(
context.cudnn_handle(), CudnnDataType<T>::kOne(), cudnn_x_desc, context.cudnn_handle(), CudnnDataType<T>::kOne(), cudnn_x_desc,
X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc, X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
Y->mutable_data<T>(context.GetPlace()))); Y->mutable_data<T>(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#else #else
cudnnTensorDescriptor_t cudnn_x_desc = cudnnTensorDescriptor_t cudnn_x_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims); xDesc.descriptor<T>(layout, cudnn_tensor_dims);
...@@ -96,11 +97,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()( ...@@ -96,11 +97,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
dxDesc.descriptor<T>(layout, cudnn_tensor_dims); dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
miopenTensorDescriptor_t cudnn_ygrad_desc = miopenTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims); dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2(
context.cudnn_handle(), CudnnDataType<T>::kOne(), cudnn_y_desc, context.cudnn_handle(), CudnnDataType<T>::kOne(), cudnn_y_desc,
Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(), Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
CudnnDataType<T>::kZero(), cudnn_xgrad_desc, CudnnDataType<T>::kZero(), cudnn_xgrad_desc,
XGrad->mutable_data<T>(context.GetPlace()))); XGrad->mutable_data<T>(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#else #else
cudnnTensorDescriptor_t cudnn_y_desc = cudnnTensorDescriptor_t cudnn_y_desc =
yDesc.descriptor<T>(layout, cudnn_tensor_dims); yDesc.descriptor<T>(layout, cudnn_tensor_dims);
......
...@@ -672,7 +672,11 @@ template <typename T> ...@@ -672,7 +672,11 @@ template <typename T>
static void SoftmaxWithCrossEntropyFusedKernel( static void SoftmaxWithCrossEntropyFusedKernel(
const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data, const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data,
int64_t n, int64_t d, int axis_dim, gpuStream_t stream) { int64_t n, int64_t d, int axis_dim, gpuStream_t stream) {
#ifdef __HIPCC__
constexpr int kMaxBlockDim = 256;
#else
constexpr int kMaxBlockDim = 512; constexpr int kMaxBlockDim = 512;
#endif
int64_t block_dim = axis_dim >= kMaxBlockDim int64_t block_dim = axis_dim >= kMaxBlockDim
? kMaxBlockDim ? kMaxBlockDim
: (1 << static_cast<int>(std::log2(axis_dim))); : (1 << static_cast<int>(std::log2(axis_dim)));
......
...@@ -116,7 +116,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -116,7 +116,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenPoolingForward); \ __macro(miopenPoolingForward); \
__macro(miopenPoolingBackward); \ __macro(miopenPoolingBackward); \
__macro(miopenSoftmaxBackward); \ __macro(miopenSoftmaxBackward); \
__macro(miopenSoftmaxBackward_V2); \
__macro(miopenSoftmaxForward); \ __macro(miopenSoftmaxForward); \
__macro(miopenSoftmaxForward_V2); \
__macro(miopenCreateDropoutDescriptor); \ __macro(miopenCreateDropoutDescriptor); \
__macro(miopenDestroyDropoutDescriptor); \ __macro(miopenDestroyDropoutDescriptor); \
__macro(miopenRestoreDropoutDescriptor); \ __macro(miopenRestoreDropoutDescriptor); \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册