Unverified commit e312a1ff, authored by Qi Li, committed by GitHub

[ROCM] update fluid operators for rocm (part9), test=develop (#31338)

Parent 6626c6a6
@@ -13,7 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
+#ifdef __NVCC__
#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
#include "paddle/fluid/operators/p_norm_op.h"
namespace paddle {
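The `namespace cub = hipcub;` alias is what lets CUB-based call sites compile unchanged under HIPCC. A minimal sketch of the pattern, not taken from this patch (the kernel and its names are illustrative only), assuming hipCUB mirrors the CUB BlockReduce API:

#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;  // hipCUB mirrors the CUB API, so cub:: names resolve on ROCm
#endif

// Illustrative block-wide sum; the same source builds with nvcc and hipcc.
template <typename T, int BlockDim>
__global__ void BlockSumKernel(const T* in, T* out, int n) {
  using BlockReduce = cub::BlockReduce<T, BlockDim>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  T val = threadIdx.x < n ? in[threadIdx.x] : static_cast<T>(0);
  T sum = BlockReduce(temp_storage).Sum(val);
  if (threadIdx.x == 0) {
    out[blockIdx.x] = sum;
  }
}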
......
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prroi_pool_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
@@ -29,22 +28,6 @@ static inline int NumBlocks(const int N) {
                kNumMaximumNumBlocks);
}
-template <typename T>
-DEVICE void PrRoIPoolingDistributeDiffCUDA(T* diff, const T top_diff,
-                                           const int h, const int w,
-                                           const int height, const int width,
-                                           const T coeff) {
-  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
-  if (!overflow) {
-    paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
-  }
-}
-template <typename T>
-DEVICE void GPUAccumulateRois(T* offset, T data) {
-  paddle::platform::CudaAtomicAdd(offset, data);
-}
template <typename T>
__global__ void GPUPRROIPoolForward(
    const int nthreads, const T* input_data, const T* input_rois,
@@ -170,25 +153,23 @@ __global__ void GPUPRROIPoolBackward(
    for (int w_iter = s_w; w_iter < e_w; ++w_iter) {
      for (int h_iter = s_h; h_iter < e_h; ++h_iter) {
-        PrRoIPoolingMatDistributeDiff(
+        PrRoIPoolingMatDistributeDiff<T>(
            offset_input_grad_data, sum_out, h_iter, w_iter, h_iter + 1,
            w_iter + 1, max(win_start_h, static_cast<T>(h_iter)),
            max(win_start_w, static_cast<T>(w_iter)),
            min(win_end_h, static_cast<T>(h_iter) + static_cast<T>(1.0)),
            min(win_end_w, static_cast<T>(w_iter) + static_cast<T>(1.0)),
-            height, width, PrRoIPoolingDistributeDiffCUDA<T>);
+            height, width);
      }
    }
    const T* offset_out_data = out_data + i;
    const T* offset_in_data = in_data + input_offset;
-    PrRoIPoolingCoorBackward(
+    PrRoIPoolingCoorBackward<T>(
        s_w, e_w, s_h, e_h, width, height, win_start_w, win_start_h, win_end_w,
        win_end_h, pw, ph, pooled_width, pooled_height, win_size, spatial_scale,
        offset_in_data, offset_out_data, offset_input_roi_grad_data,
-        offset_output_grad_data, GPUAccumulateRois<T>,
-        [](const T x, const T y) { return max(x, y); },
-        [](const T x, const T y) { return min(x, y); });
+        offset_output_grad_data);
  }
}
......
@@ -16,6 +16,9 @@ limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
+#if defined(__NVCC__) || defined(__HIPCC__)
+#include "paddle/fluid/platform/cuda_primitives.h"
+#endif
namespace paddle {
namespace operators {
@@ -73,6 +76,17 @@ inline HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data,
  return sum_out;
}
+#if defined(__NVCC__) || defined(__HIPCC__)
+template <typename T>
+DEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff, const int h,
+                                       const int w, const int height,
+                                       const int width, const T coeff) {
+  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
+  if (!overflow) {
+    paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff);
+  }
+}
+#else
template <typename T>
inline HOSTDEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff,
                                                  const int h, const int w,
@@ -84,12 +98,15 @@ inline HOSTDEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff,
    *(diff + h * width + w) += top_diff * coeff;
  }
}
+#endif
-template <typename T, typename Functor>
-HOSTDEVICE void PrRoIPoolingMatDistributeDiff(
-    T* diff, const T top_diff, const int s_h, const int s_w, const int e_h,
-    const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,
-    const int w0, Functor functor) {
+template <typename T>
+HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, const T top_diff,
+                                              const int s_h, const int s_w,
+                                              const int e_h, const int e_w,
+                                              const T y0, const T x0,
+                                              const T y1, const T x1,
+                                              const int h0, const int w0) {
  T alpha, beta, lim_alpha, lim_beta, tmp;
  alpha = x0 - static_cast<T>(s_w);
@@ -99,14 +116,14 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  functor(diff, top_diff, s_h, s_w, h0, w0, tmp);
+  PrRoIPoolingDistributeDiff<T>(diff, top_diff, s_h, s_w, h0, w0, tmp);
  alpha = static_cast<T>(e_w) - x1;
  lim_alpha = static_cast<T>(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  functor(diff, top_diff, s_h, e_w, h0, w0, tmp);
+  PrRoIPoolingDistributeDiff<T>(diff, top_diff, s_h, e_w, h0, w0, tmp);
  alpha = x0 - static_cast<T>(s_w);
  beta = static_cast<T>(e_h) - y1;
@@ -115,20 +132,47 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  functor(diff, top_diff, e_h, s_w, h0, w0, tmp);
+  PrRoIPoolingDistributeDiff<T>(diff, top_diff, e_h, s_w, h0, w0, tmp);
  alpha = static_cast<T>(e_w) - x1;
  lim_alpha = static_cast<T>(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
-  functor(diff, top_diff, e_h, e_w, h0, w0, tmp);
+  PrRoIPoolingDistributeDiff<T>(diff, top_diff, e_h, e_w, h0, w0, tmp);
}
+#if defined(__NVCC__) || defined(__HIPCC__)
+template <typename T>
+DEVICE void AccumulateRois(T* offset, T data) {
+  paddle::platform::CudaAtomicAdd(offset, data);
+}
+#else
template <typename T>
-inline HOSTDEVICE void CPUAccumulateRois(T* offset, T data) {
+inline HOSTDEVICE void AccumulateRois(T* offset, T data) {
  *offset += data;
}
+#endif
+#if defined(__NVCC__) || defined(__HIPCC__)
+template <typename T>
+DEVICE T MaxFunctor(const T x, const T y) {
+  return max(x, y);
+}
+template <typename T>
+DEVICE T MinFunctor(const T x, const T y) {
+  return min(x, y);
+}
+#else
+template <typename T>
+inline HOSTDEVICE T MaxFunctor(const T x, const T y) {
+  return std::max(x, y);
+}
+template <typename T>
+inline HOSTDEVICE T MinFunctor(const T x, const T y) {
+  return std::min(x, y);
+}
+#endif
template <typename T>
inline HOSTDEVICE static T PrRoIPoolingGetCoeff(T dh, T dw) {
@@ -172,15 +216,13 @@ inline HOSTDEVICE T PrRoIPoolingSingleCoorIntegral(T s, T t, T c1, T c2) {
         (t - 0.5f * t * t - s + 0.5f * s * s) * c1;
}
-template <typename T, typename Functor, typename MaxFunctor,
-          typename MinFunctor>
+template <typename T>
inline HOSTDEVICE void PrRoIPoolingCoorBackward(
    int s_w, int e_w, int s_h, int e_h, int width, int height, T win_start_w,
    T win_start_h, T win_end_w, T win_end_h, int pw, int ph,
    const int pooled_width, const int pooled_height, T win_size,
    const float spatial_scale, const T* this_bottom_data,
-    const T* this_top_data, T* this_data_grad, const T* this_out_grad,
-    Functor functor, MaxFunctor maxFunctor, MinFunctor minFunctor) {
+    const T* this_top_data, T* this_data_grad, const T* this_out_grad) {
  T g_x1_y = 0.f;
  T g_x2_y = 0.f;
  T g_x_y1 = 0.f;
@@ -188,16 +230,16 @@ inline HOSTDEVICE void PrRoIPoolingCoorBackward(
  for (int h_iter = s_h; h_iter < e_h; ++h_iter) {
    g_x1_y += PrRoIPoolingSingleCoorIntegral(
-        maxFunctor(win_start_h, static_cast<T>(h_iter)) - h_iter,
-        minFunctor(win_end_h, static_cast<T>(h_iter + 1)) - h_iter,
+        MaxFunctor<T>(win_start_h, static_cast<T>(h_iter)) - h_iter,
+        MinFunctor<T>(win_end_h, static_cast<T>(h_iter + 1)) - h_iter,
        PrRoIPoolingInterpolation(this_bottom_data, h_iter, win_start_w, height,
                                  width),
        PrRoIPoolingInterpolation(this_bottom_data, h_iter + 1, win_start_w,
                                  height, width));
    g_x2_y += PrRoIPoolingSingleCoorIntegral(
-        maxFunctor(win_start_h, static_cast<T>(h_iter)) - h_iter,
-        minFunctor(win_end_h, static_cast<T>(h_iter + 1)) - h_iter,
+        MaxFunctor<T>(win_start_h, static_cast<T>(h_iter)) - h_iter,
+        MinFunctor<T>(win_end_h, static_cast<T>(h_iter + 1)) - h_iter,
        PrRoIPoolingInterpolation(this_bottom_data, h_iter, win_end_w, height,
                                  width),
        PrRoIPoolingInterpolation(this_bottom_data, h_iter + 1, win_end_w,
@@ -206,16 +248,16 @@ inline HOSTDEVICE void PrRoIPoolingCoorBackward(
  for (int w_iter = s_w; w_iter < e_w; ++w_iter) {
    g_x_y1 += PrRoIPoolingSingleCoorIntegral(
-        maxFunctor(win_start_w, static_cast<T>(w_iter)) - w_iter,
-        minFunctor(win_end_w, static_cast<T>(w_iter + 1)) - w_iter,
+        MaxFunctor<T>(win_start_w, static_cast<T>(w_iter)) - w_iter,
+        MinFunctor<T>(win_end_w, static_cast<T>(w_iter + 1)) - w_iter,
        PrRoIPoolingInterpolation(this_bottom_data, win_start_h, w_iter, height,
                                  width),
        PrRoIPoolingInterpolation(this_bottom_data, win_start_h, w_iter + 1,
                                  height, width));
    g_x_y2 += PrRoIPoolingSingleCoorIntegral(
-        maxFunctor(win_start_w, static_cast<T>(w_iter)) - w_iter,
-        minFunctor(win_end_w, static_cast<T>(w_iter + 1)) - w_iter,
+        MaxFunctor<T>(win_start_w, static_cast<T>(w_iter)) - w_iter,
+        MinFunctor<T>(win_end_w, static_cast<T>(w_iter + 1)) - w_iter,
        PrRoIPoolingInterpolation(this_bottom_data, win_end_h, w_iter, height,
                                  width),
        PrRoIPoolingInterpolation(this_bottom_data, win_end_h, w_iter + 1,
@@ -232,22 +274,24 @@ inline HOSTDEVICE void PrRoIPoolingCoorBackward(
  partial_y1 = partial_y1 / win_size * spatial_scale;
  partial_y2 = partial_y2 / win_size * spatial_scale;
-  functor(this_data_grad + 0,
-          (partial_x1 * (1.0 - static_cast<T>(pw) / pooled_width) +
-           partial_x2 * (1.0 - static_cast<T>(pw + 1) / pooled_width)) *
-              (*this_out_grad));
-  functor(this_data_grad + 1,
-          (partial_y1 * (1.0 - static_cast<T>(ph) / pooled_height) +
-           partial_y2 * (1.0 - static_cast<T>(ph + 1) / pooled_height)) *
-              (*this_out_grad));
-  functor(this_data_grad + 2,
-          (partial_x2 * static_cast<T>(pw + 1) / pooled_width +
-           partial_x1 * static_cast<T>(pw) / pooled_width) *
-              (*this_out_grad));
-  functor(this_data_grad + 3,
-          (partial_y2 * static_cast<T>(ph + 1) / pooled_height +
-           partial_y1 * static_cast<T>(ph) / pooled_height) *
-              (*this_out_grad));
+  AccumulateRois<T>(
+      this_data_grad + 0,
+      (partial_x1 * (1.0 - static_cast<T>(pw) / pooled_width) +
+       partial_x2 * (1.0 - static_cast<T>(pw + 1) / pooled_width)) *
+          (*this_out_grad));
+  AccumulateRois<T>(
+      this_data_grad + 1,
+      (partial_y1 * (1.0 - static_cast<T>(ph) / pooled_height) +
+       partial_y2 * (1.0 - static_cast<T>(ph + 1) / pooled_height)) *
+          (*this_out_grad));
+  AccumulateRois<T>(this_data_grad + 2,
+                    (partial_x2 * static_cast<T>(pw + 1) / pooled_width +
+                     partial_x1 * static_cast<T>(pw) / pooled_width) *
+                        (*this_out_grad));
+  AccumulateRois<T>(this_data_grad + 3,
+                    (partial_y2 * static_cast<T>(ph + 1) / pooled_height +
+                     partial_y1 * static_cast<T>(ph) / pooled_height) *
+                        (*this_out_grad));
}
template <typename DeviceContext, typename T>
@@ -516,7 +560,7 @@ class CPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
        for (int w_iter = s_w; w_iter < e_w; ++w_iter) {
          for (int h_iter = s_h; h_iter < e_h; ++h_iter) {
-            PrRoIPoolingMatDistributeDiff(
+            PrRoIPoolingMatDistributeDiff<T>(
                offset_input_grad_data, sum_out, h_iter, w_iter, h_iter + 1,
                w_iter + 1, std::max(win_start_h, static_cast<T>(h_iter)),
                std::max(win_start_w, static_cast<T>(w_iter)),
@@ -524,19 +568,16 @@ class CPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
                         static_cast<T>(h_iter) + static_cast<T>(1.0)),
                std::min(win_end_w,
                         static_cast<T>(w_iter) + static_cast<T>(1.0)),
-                height, width, PrRoIPoolingDistributeDiff<T>);
+                height, width);
          }
        }
        const T* offset_in_data = in_data + input_offset;
-        PrRoIPoolingCoorBackward(
+        PrRoIPoolingCoorBackward<T>(
            s_w, e_w, s_h, e_h, width, height, win_start_w, win_start_h,
            win_end_w, win_end_h, pw, ph, pooled_width, pooled_height, win_size,
            spatial_scale, offset_in_data, offset_out_data,
-            offset_input_roi_grad_data, offset_output_grad_data,
-            CPUAccumulateRois<T>,
-            [](const T x, const T y) { return std::max(x, y); },
-            [](const T x, const T y) { return std::min(x, y); });
+            offset_input_roi_grad_data, offset_output_grad_data);
      }
    }
  }
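On the CPU build the new MaxFunctor/MinFunctor simply forward to std::max/std::min. A small host-only check, illustrative and not part of the patch, makes the intended window-clamping behaviour concrete:

#include <algorithm>
#include <cassert>

// Host-side stand-ins for the functors above (assumed semantics: max and min).
template <typename T>
inline T MaxFunctor(const T x, const T y) { return std::max(x, y); }
template <typename T>
inline T MinFunctor(const T x, const T y) { return std::min(x, y); }

int main() {
  // Clamp the integration window [0.3, 2.7] to the unit cell [1.0, 2.0].
  assert(MaxFunctor<double>(0.3, 1.0) == 1.0);
  assert(MinFunctor<double>(2.7, 2.0) == 2.0);  // requires min, not max
  return 0;
}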
......
@@ -47,7 +47,8 @@ static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) {
  box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths,
                      hidden_size, 0);
#endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
  auto hidden_size = ctx.Attr<int>("size");
  auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance();
  gpu_ps_ptr->PullSparse(ctx.GetPlace(), 0, all_keys, all_values, slot_lengths,
@@ -90,7 +91,8 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) {
  box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values,
                          slot_lengths, hidden_size, 0, batch_size);
#endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
  auto hidden_size = ctx.Attr<int>("size");
  auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance();
  gpu_ps_ptr->PushSparseGrad(ctx.GetPlace(), 0, all_keys, all_grad_values,
......
@@ -18,7 +18,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <thrust/random.h>
#endif
@@ -36,7 +36,7 @@ struct Random<platform::CPUDeviceContext> {
  using UniformIntDist = std::uniform_int_distribution<T>;
};
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct Random<platform::CUDADeviceContext> {
  using Engine = thrust::minstd_rand;
......
@@ -50,7 +50,7 @@ __global__ void expand_input_by_rank_kernel(
}
template <typename T>
-void expand_rank_attention_input(cudaStream_t stream, const T* input,
+void expand_rank_attention_input(gpuStream_t stream, const T* input,
                                 int input_row, int input_col, T* output,
                                 int output_row, int output_col,
                                 const int* rank_offset, int rank_offset_row,
@@ -93,7 +93,7 @@ __global__ void expand_rank_attention_param_kernel(
}
template <typename T>
-void expand_rank_attention_param(cudaStream_t stream, const T* input,
+void expand_rank_attention_param(gpuStream_t stream, const T* input,
                                 int input_row, int input_col,
                                 const int* rank_offset, int rank_offset_row,
                                 int rank_offset_col, const T* param,
@@ -133,7 +133,7 @@ __global__ void merge_param_gradient_kernel(
}
template <typename T>
-void merge_rank_attention_param_grad(cudaStream_t stream, T* expanded_grad,
+void merge_rank_attention_param_grad(gpuStream_t stream, T* expanded_grad,
                                     int expanded_grad_row,
                                     int expanded_grad_col, T* param_grad,
                                     int param_grad_row, int param_grad_col,
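Switching the parameter type to `gpuStream_t` lets these helpers accept either a CUDA or a HIP stream. The alias is defined elsewhere in Paddle's platform headers and is not shown in this diff; conceptually it amounts to something like the following sketch, where the header names and placement are assumptions:

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// Callers then pass whichever stream type the build provides, e.g.
// expand_rank_attention_input<float>(stream, input, ...);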
......
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <cublas.h>
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
......
@@ -654,7 +654,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(
    ops::ReshapeDoubleGradKernel, paddle::platform::complex128,
    ops::ReshapeDoubleGradKernel);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                ops::ReshapeKernel, int, ops::ReshapeKernel,
                                uint8_t, ops::ReshapeKernel, int64_t,
......
@@ -16,7 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/utils.h"
+#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
namespace paddle {
namespace operators {
@@ -28,7 +33,11 @@ class RNNDescriptors {
 public:
  RNNDescriptors(int seq_length, int batch_size, int input_size,
                 int hidden_size, int num_layers, float dropout_prob, int seed,
+#ifdef PADDLE_WITH_HIP
+                 int weight_numel, miopenRNNMode_t mode, bool is_bidirec,
+#else
                 int weight_numel, cudnnRNNMode_t mode, bool is_bidirec,
+#endif
                 bool is_test)
      : seq_length_(seq_length),
        batch_size_(batch_size),
@@ -40,15 +49,23 @@ class RNNDescriptors {
        weight_numel_(weight_numel),
        mode_(mode),
        is_bidirec_(is_bidirec),
-        is_test_(is_test) {}
+        is_test_(is_test) {
+  }
  template <typename T>
+#ifdef PADDLE_WITH_HIP
+  void Create(const miopenHandle_t &handle, const platform::Place &place,
+#else
  void Create(const cudnnHandle_t &handle, const platform::Place &place,
+#endif
              const std::vector<int> &sequence_length, size_t *workspace_size,
              size_t *reserve_size, framework::Tensor *dropout_state) {
    int numDirections = is_bidirec_ ? 2 : 1;
+#ifdef PADDLE_WITH_HIP
+    miopenDataType_t cudnn_type = platform::CudnnDataType<T>::type;
+#else
    cudnnDataType_t cudnn_type = platform::CudnnDataType<T>::type;
+#endif
    // ------------------- cudnn x, y descriptors ---------------------
    std::vector<int> dims_x = {batch_size_, input_size_, 1};
    std::vector<int> strides_x = {input_size_, 1, 1};
@@ -59,7 +76,7 @@ class RNNDescriptors {
      y_descs_.emplace_back(y_desc_.descriptor<T>(dims_y, strides_y));
    }
-#if CUDNN_VERSION >= 7201
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
    if (!sequence_length.empty()) {
      x_seq_desc_.descriptor<T>(seq_length_, batch_size_, input_size_, true,
                                sequence_length);
@@ -82,17 +99,29 @@ class RNNDescriptors {
    size_t state_size;
    bool is_initialized = dropout_state->IsInitialized();
    if (!is_test_ && !is_initialized) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenDropoutGetStatesSize(handle, &state_size));
+      dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
+                                           place);
+#else
      PADDLE_ENFORCE_CUDA_SUCCESS(
          platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
      dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
                                           place);
+#endif
    }
    dropout_desc_.descriptor(handle, place, is_initialized, dropout_prob_,
                             is_test_ ? nullptr : dropout_state, seed_,
                             state_size);
    // ------------------- cudnn rnn descriptors ---------------------
-#if CUDNN_VERSION >= 6000
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor(
+        rnn_desc_.desc(), hidden_size_, num_layers_, miopenRNNlinear,
+        is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, mode_,
+        miopenRNNNoBias, miopenRNNdefault, cudnn_type));
+#elif CUDNN_VERSION >= 6000
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
        handle, rnn_desc_.desc(), hidden_size_, num_layers_,
        dropout_desc_.desc(), CUDNN_LINEAR_INPUT,
@@ -106,7 +135,7 @@ class RNNDescriptors {
        cudnn_type));
#endif
-#if CUDNN_VERSION >= 7201
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
    if (!sequence_length.empty()) {
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
          rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
@@ -115,8 +144,13 @@ class RNNDescriptors {
    // ------------------- cudnn weights_size ---------------------
    size_t weights_size_;
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize(
+        handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type));
+#else
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
        handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type));
+#endif
    PADDLE_ENFORCE_EQ(
        weights_size_, sizeof(T) * weight_numel_,
        platform::errors::InvalidArgument(
@@ -126,7 +160,16 @@ class RNNDescriptors {
    int dim_tmp = weights_size_ / sizeof(T);
    std::vector<int> dim_w = {dim_tmp, 1, 1};
    weight_desc_.descriptor<T>(layout, dim_w);
    // ------------------- cudnn workspace, reserve size ---------------------
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize(
+        handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
+        workspace_size));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::miopenGetRNNTrainingReserveSize(
+            handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
+            reserve_size));
+#else
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
        handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
        workspace_size));
@@ -134,7 +177,19 @@ class RNNDescriptors {
        platform::dynload::cudnnGetRNNTrainingReserveSize(
            handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
            reserve_size));
+#endif
  }
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t *x_descs() { return x_descs_.data(); }
+  miopenTensorDescriptor_t *y_descs() { return y_descs_.data(); }
+  miopenTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); }
+  miopenTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); }
+  miopenTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); }
+  miopenTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); }
+  miopenRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); }
+  miopenDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); }
+  miopenTensorDescriptor_t weight_desc() { return weight_desc_.desc(); }
+#else
  cudnnTensorDescriptor_t *x_descs() { return x_descs_.data(); }
  cudnnTensorDescriptor_t *y_descs() { return y_descs_.data(); }
#if CUDNN_VERSION >= 7201
@@ -148,6 +203,7 @@ class RNNDescriptors {
  cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); }
  cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); }
  cudnnFilterDescriptor_t weight_desc() { return weight_desc_.desc(); }
+#endif
 private:
  int seq_length_;
@@ -158,15 +214,24 @@ class RNNDescriptors {
  float dropout_prob_;
  int seed_;
  int weight_numel_;
+#ifdef PADDLE_WITH_HIP
+  miopenRNNMode_t mode_;
+#else
  cudnnRNNMode_t mode_;
+#endif
  bool is_bidirec_;
  bool is_test_;
+#ifdef PADDLE_WITH_HIP
+  std::vector<miopenTensorDescriptor_t> x_descs_;
+  std::vector<miopenTensorDescriptor_t> y_descs_;
+#else
  std::vector<cudnnTensorDescriptor_t> x_descs_;
  std::vector<cudnnTensorDescriptor_t> y_descs_;
+#endif
  platform::ScopedTensorDescriptor x_desc_;
  platform::ScopedTensorDescriptor y_desc_;
-#if CUDNN_VERSION >= 7201
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
  platform::ScopedRNNTensorDescriptor x_seq_desc_;
  platform::ScopedRNNTensorDescriptor y_seq_desc_;
#endif
@@ -193,7 +258,7 @@ bool is_continuous(const Type &weight_list) {
}
template <typename T>
-void weight_to_tensor(const platform::Place &place, cudaStream_t stream,
+void weight_to_tensor(const platform::Place &place, gpuStream_t stream,
                      const std::vector<const Tensor *> &weight_list,
                      Tensor *weight) {
  auto weight_data = weight->data<T>();
@@ -211,7 +276,7 @@ void weight_to_tensor(const platform::Place &place, cudaStream_t stream,
}
template <typename T>
-void weight_to_tensor_list(const platform::Place &place, cudaStream_t stream,
+void weight_to_tensor_list(const platform::Place &place, gpuStream_t stream,
                           std::vector<Tensor *> *weight_grad,
                           const std::vector<const Tensor *> &weight_input,
                           const Tensor *weight) {
@@ -247,6 +312,17 @@ class RNNCudnnKernel : public framework::OpKernel<T> {
    int hidden_size = ctx.Attr<int>("hidden_size");
    int num_layers = ctx.Attr<int>("num_layers");
    auto mode = ctx.Attr<std::string>("mode");
+#ifdef PADDLE_WITH_HIP
+    miopenRNNMode_t rnn_mode = miopenLSTM;
+    if (mode == "LSTM")
+      rnn_mode = miopenLSTM;
+    else if (mode == "GRU")
+      rnn_mode = miopenGRU;
+    else if (mode == "RNN_RELU")
+      rnn_mode = miopenRNNRELU;
+    else if (mode == "RNN_TANH")
+      rnn_mode = miopenRNNTANH;
+#else
    cudnnRNNMode_t rnn_mode = CUDNN_LSTM;
    if (mode == "LSTM")
      rnn_mode = CUDNN_LSTM;
@@ -256,6 +332,7 @@ class RNNCudnnKernel : public framework::OpKernel<T> {
      rnn_mode = CUDNN_RNN_RELU;
    else if (mode == "RNN_TANH")
      rnn_mode = CUDNN_RNN_TANH;
+#endif
    else
      PADDLE_THROW(platform::errors::InvalidArgument(
          "rnn_mode should be LSTM, GRU, RNN_RELU or RNN_TANH, but received: "
@@ -285,7 +362,11 @@ class RNNCudnnKernel : public framework::OpKernel<T> {
    T *out_data = out->mutable_data<T>(ctx.GetPlace());
    T *last_h_data = state[0]->mutable_data<T>(ctx.GetPlace());
    T *last_c_data = nullptr;
+#ifdef PADDLE_WITH_HIP
+    if (rnn_mode == miopenLSTM) {
+#else
    if (rnn_mode == CUDNN_LSTM) {
+#endif
      init_c_data = pre_state[1]->data<T>();
      last_c_data = state[1]->mutable_data<T>(ctx.GetPlace());
    }
@@ -362,8 +443,17 @@ class RNNCudnnKernel : public framework::OpKernel<T> {
                 &workspace_data_, workspace_size);
    } else {
      if (!has_seq_length) {
        // for train
        // This interface is used when the input/output is unpadded.
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardTraining(
+            handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data,
+            rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
+            rnn.weight_desc(), w_data, rnn.y_descs(), out_data,
+            rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data,
+            workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
+            reserve_size));
+#else
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
            handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data,
            rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
@@ -371,8 +461,9 @@ class RNNCudnnKernel : public framework::OpKernel<T> {
            rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data,
            workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
            reserve_size));
+#endif
      } else {
-#if CUDNN_VERSION >= 7201
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
        // for train
        // This interface is used when the input/output is padded.
        PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -394,23 +485,36 @@ class RNNCudnnKernel : public framework::OpKernel<T> {
    }
  }
+#ifdef PADDLE_WITH_HIP
+  void RNNInferece(const bool &has_seq_length, const miopenHandle_t &handle,
+#else
  void RNNInferece(const bool &has_seq_length, const cudnnHandle_t &handle,
+#endif
                   const int &seq_length, RNNDescriptors *rnn, const T *x_data,
                   const T *init_h_data, const T *init_c_data, const T *w_data,
                   T *out_data, T *last_h_data, T *last_c_data,
                   framework::Tensor *workspace_data,
                   const size_t &workspace_size) const {
    if (!has_seq_length) {
      // for inference
      // This interface is used when the input/output is unpadded.
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardInference(
+          handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data,
+          rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data,
+          rnn->weight_desc(), w_data, rnn->y_descs(), out_data,
+          rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data,
+          workspace_data->data<uint8_t>(), workspace_size));
+#else
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
          handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data,
          rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data,
          rnn->weight_desc(), w_data, rnn->y_descs(), out_data,
          rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data,
          workspace_data->data<uint8_t>(), workspace_size));
+#endif
    } else {
-#if CUDNN_VERSION >= 7201
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
      // for inference
      // This interface is used when the input/output is padded.
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx(
@@ -457,6 +561,17 @@ class RNNGradCudnnKernel : public framework::OpKernel<T> {
    int hidden_size = ctx.Attr<int>("hidden_size");
    int num_layers = ctx.Attr<int>("num_layers");
    auto mode = ctx.Attr<std::string>("mode");
+#ifdef PADDLE_WITH_HIP
+    miopenRNNMode_t rnn_mode = miopenLSTM;
+    if (mode == "LSTM")
+      rnn_mode = miopenLSTM;
+    else if (mode == "GRU")
+      rnn_mode = miopenGRU;
+    else if (mode == "RNN_RELU")
+      rnn_mode = miopenRNNRELU;
+    else if (mode == "RNN_TANH")
+      rnn_mode = miopenRNNTANH;
+#else
    cudnnRNNMode_t rnn_mode = CUDNN_LSTM;
    if (mode == "LSTM")
      rnn_mode = CUDNN_LSTM;
@@ -466,6 +581,7 @@ class RNNGradCudnnKernel : public framework::OpKernel<T> {
      rnn_mode = CUDNN_RNN_RELU;
    else if (mode == "RNN_TANH")
      rnn_mode = CUDNN_RNN_TANH;
+#endif
    else
      PADDLE_THROW(platform::errors::InvalidArgument(
          "rnn_mode should be LSTM, GRU, RNN_RELU or RNN_TANH, but received: "
@@ -532,7 +648,11 @@ class RNNGradCudnnKernel : public framework::OpKernel<T> {
                  ? pre_state_grad[0]->mutable_data<T>(ctx.GetPlace())
                  : nullptr;
    T *init_c_grad_data = nullptr;
+#ifdef PADDLE_WITH_HIP
+    if (rnn_mode == miopenLSTM) {
+#else
    if (rnn_mode == CUDNN_LSTM) {
+#endif
      init_c_data = pre_state[1]->data<T>();
      // last_c_data = state[1]->data<T>();
      last_c_grad_data = state_grad[1]->data<T>();
@@ -579,6 +699,17 @@ class RNNGradCudnnKernel : public framework::OpKernel<T> {
    if (!has_seq_length) {
      if (in_grad) {
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardData(
+            handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data,
+            rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data,
+            rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data,
+            rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
+            rnn.x_descs(), in_grad_data, rnn.init_h_desc(), init_h_grad_data,
+            rnn.init_c_desc(), init_c_grad_data,
+            workspace_data_.data<uint8_t>(), workspace_size,
+            const_cast<uint8_t *>(reserve_data), reserve_size));
+#else
        // This interface is used when the input/output is unpadded.
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
            handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data,
@@ -589,17 +720,27 @@ class RNNGradCudnnKernel : public framework::OpKernel<T> {
            rnn.init_c_desc(), init_c_grad_data,
            workspace_data_.data<uint8_t>(), workspace_size,
            const_cast<uint8_t *>(reserve_data), reserve_size));
+#endif
      }
      if (!weight_grad_list.empty()) {
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardWeights(
+            handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data<T>(),
+            rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data<T>(),
+            rnn.weight_desc(), weight_grad_data,
+            workspace_data_.data<uint8_t>(), workspace_size,
+            const_cast<uint8_t *>(reserve_data), reserve_size));
+#else
        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
            handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data<T>(),
            rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data<T>(),
            workspace_data_.data<uint8_t>(), workspace_size, rnn.weight_desc(),
            weight_grad_data, const_cast<uint8_t *>(reserve_data),
            reserve_size));
+#endif
      }
    } else {
-#if CUDNN_VERSION >= 7201
+#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
      // for train
      // This interface is used when the input/output is padded.
      if (in_grad) {
@@ -638,7 +779,13 @@ class RNNGradCudnnKernel : public framework::OpKernel<T> {
}  // namespace paddle
namespace ops = paddle::operators;
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+REGISTER_OP_CUDA_KERNEL(rnn, ops::RNNCudnnKernel<float>);
+REGISTER_OP_CUDA_KERNEL(rnn_grad, ops::RNNGradCudnnKernel<float>);
+#else
REGISTER_OP_CUDA_KERNEL(rnn, ops::RNNCudnnKernel<float>,
                        ops::RNNCudnnKernel<double>);
REGISTER_OP_CUDA_KERNEL(rnn_grad, ops::RNNGradCudnnKernel<float>,
                        ops::RNNGradCudnnKernel<double>);
+#endif
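The LSTM/GRU/RNN_RELU/RNN_TANH string-to-enum mapping now appears twice, once in the forward kernel and once in the grad kernel. As a hedged sketch only (the helper name `StringToRnnMode` is hypothetical, and it assumes `<string>` plus Paddle's enforce and MIOpen headers are already included), the MIOpen-side mapping could be factored into one function:

#ifdef PADDLE_WITH_HIP
inline miopenRNNMode_t StringToRnnMode(const std::string &mode) {
  // Same mapping as the kernels above, expressed once.
  if (mode == "LSTM") return miopenLSTM;
  if (mode == "GRU") return miopenGRU;
  if (mode == "RNN_RELU") return miopenRNNRELU;
  if (mode == "RNN_TANH") return miopenRNNTANH;
  PADDLE_THROW(platform::errors::InvalidArgument(
      "rnn_mode should be LSTM, GRU, RNN_RELU or RNN_TANH, but received: %s.",
      mode));
}
#endif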
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <cuda.h>
#include "paddle/fluid/operators/seed_op.h" #include "paddle/fluid/operators/seed_op.h"
namespace paddle { namespace paddle {
......
@@ -63,7 +63,7 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) {
    auto& dev_ctx = context.template device_context<DeviceContext>();
    set_zero(dev_ctx, output, static_cast<T>(0));
  }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (!cpu_place) {
    Tensor length;
    length.mutable_data<IndexT>(framework::make_ddim({1}),
@@ -71,9 +71,15 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) {
    IndexT* length_data = length.data<IndexT>();
    const IndexT* segment_ids = segment->data<IndexT>();
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT),
+                  hipMemcpyDeviceToHost));
+#else
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT),
                   cudaMemcpyDeviceToHost));
+#endif
    IndexT length_host = length_data[0];
    length_host++;
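The hipMemcpy/cudaMemcpy pair above follows the same guard pattern used throughout this change. As a sketch only (this helper does not exist in the patch; it assumes Paddle's enforce header is available for PADDLE_ENFORCE_CUDA_SUCCESS), the device-to-host copy could be wrapped once so call sites stay identical on both platforms:

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Hypothetical helper: copy `bytes` from device memory to host memory.
inline void GpuMemcpyD2H(void *dst, const void *src, size_t bytes) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipMemcpy(dst, src, bytes, hipMemcpyDeviceToHost));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));
#endif
}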
......
@@ -37,7 +37,7 @@ inline int GetBranchNumber(const framework::LoDTensor &mask) {
  }
  // when platform::is_gpu_place(mask.place()) is true
  std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get());
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet(
......
@@ -33,7 +33,7 @@ namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
using Vector = framework::Vector<T>;
#else
......
@@ -11,7 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#ifdef __NVCC__
#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
......
@@ -16,7 +16,11 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_cuda_utils.h"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#else
#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
#include "paddle/fluid/platform/gpu_launch_config.h"
namespace paddle {
@@ -388,18 +392,30 @@ class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
      ScopedTensorDescriptor desc;
      std::vector<int> tensor_dims = {N, dim, D, 1};
      DataLayout layout = DataLayout::kNCHW;
+#ifdef PADDLE_WITH_HIP
+      miopenTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
+#else
      cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
+#endif
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      auto handle = dev_ctx.cudnn_handle();
+#ifdef PADDLE_WITH_HIP
+      auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
+                                   : MIOPEN_SOFTMAX_MODE_CHANNEL;
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward(
+          handle, platform::CudnnDataType<T>::kOne(), desc_, x->data<T>(),
+          platform::CudnnDataType<T>::kZero(), desc_, out_data));
+#else
      auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
                                   : CUDNN_SOFTMAX_MODE_CHANNEL;
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward(
          handle, CUDNN_SOFTMAX_ACCURATE, mode,
          platform::CudnnDataType<T>::kOne(), desc_, x->data<T>(),
          platform::CudnnDataType<T>::kZero(), desc_, out_data));
+#endif
    }
  }
};
@@ -496,19 +512,32 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
      ScopedTensorDescriptor desc;
      std::vector<int> tensor_dims = {N, dim, D, 1};
      DataLayout layout = DataLayout::kNCHW;
+#ifdef PADDLE_WITH_HIP
+      miopenTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
+#else
      cudnnTensorDescriptor_t desc_ = desc.descriptor<T>(layout, tensor_dims);
+#endif
      auto& dev_ctx =
          ctx.template device_context<platform::CUDADeviceContext>();
      auto handle = dev_ctx.cudnn_handle();
+#ifdef PADDLE_WITH_HIP
+      auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
+                                   : MIOPEN_SOFTMAX_MODE_CHANNEL;
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward(
+          handle, platform::CudnnDataType<T>::kOne(), desc_, out->data<T>(),
+          desc_, dout->data<T>(), platform::CudnnDataType<T>::kZero(), desc_,
+          dx_data));
+#else
      auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
                                   : CUDNN_SOFTMAX_MODE_CHANNEL;
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
          handle, CUDNN_SOFTMAX_ACCURATE, mode,
          platform::CudnnDataType<T>::kOne(), desc_, out->data<T>(), desc_,
          dout->data<T>(), platform::CudnnDataType<T>::kZero(), desc_,
          dx_data));
+#endif
    }
  }
};
@@ -518,6 +547,15 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
+                   ops::SoftmaxCUDNNKernel<float>,
+                   ops::SoftmaxCUDNNKernel<plat::float16>);
+REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
+                   ops::SoftmaxGradCUDNNKernel<float>,
+                   ops::SoftmaxGradCUDNNKernel<plat::float16>);
+#else
REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
                   ops::SoftmaxCUDNNKernel<float>,
                   ops::SoftmaxCUDNNKernel<double>,
@@ -526,3 +564,4 @@ REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
                   ops::SoftmaxGradCUDNNKernel<float>,
                   ops::SoftmaxGradCUDNNKernel<double>,
                   ops::SoftmaxGradCUDNNKernel<plat::float16>);
+#endif
@@ -22,6 +22,10 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
@@ -66,7 +70,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::CanCUDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kCUDNN;
    }
@@ -190,7 +194,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
    auto input_data_type = OperatorWithKernel::IndicateVarDataType(
        ctx, framework::GradVarName("Out"));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::CanCUDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kCUDNN;
    }
......
@@ -82,7 +82,7 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
            platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
            src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
      } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        auto stream = ctx.cuda_device_context().stream();
        memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
                     platform::CUDAPlace(),
......
@@ -98,7 +98,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
      memory::Copy(cpu_place, dst + i * dst_after, cpu_place,
                   src + i * src_after, sizeof(T) * size);
    } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      auto& gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place);
      auto& cuda_ctx =
          reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
......
@@ -72,7 +72,7 @@ TEST(StridedMemcpy, CPUConcat) {
  }
}
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(StridedMemcpy, GPUCrop) {
  // clang-format off
  int src[] = {
......