PaddlePaddle / Paddle · commit 3b9db171 (unverified)
Authored Mar 03, 2021 by Qi Li; committed via GitHub on Mar 03, 2021.

[ROCM] update fluid operators for rocm (part7), test=develop (#31307)

Parent commit: db50fb67

Showing 19 changed files with 167 additions and 37 deletions (+167, -37):
paddle/fluid/operators/CMakeLists.txt                       +9  -4
paddle/fluid/operators/bmm_op.cu                            +1  -1
paddle/fluid/operators/cholesky_op.cu                       +5  -0
paddle/fluid/operators/clip_op.h                            +2  -2
paddle/fluid/operators/coalesce_tensor_op.cc                +1  -1
paddle/fluid/operators/correlation_op.cu                    +5  -0
paddle/fluid/operators/cudnn_lstm_op.cu.cc                  +62 -11
paddle/fluid/operators/cumsum_op.cu                         +7  -1
paddle/fluid/operators/data_norm_op.cu                      +6  -2
paddle/fluid/operators/diag_embed_op.h                      +1  -1
paddle/fluid/operators/dot_op.h                             +2  -2
paddle/fluid/operators/dropout_op.cu                        +51 -3
paddle/fluid/operators/dropout_op.h                         +2  -2
paddle/fluid/operators/fake_quantize_op.cu                  +4  -0
paddle/fluid/operators/fill_constant_batch_size_like_op.h   +1  -1
paddle/fluid/operators/fill_constant_op.h                   +2  -2
paddle/fluid/operators/filter_by_instag_op.h                +1  -1
paddle/fluid/operators/gelu_op.h                            +4  -2
paddle/fluid/operators/get_tensor_from_selected_rows_op.cc  +1  -1
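
Most of the hunks below apply one recurring pattern: compile guards that previously matched only the CUDA toolchain are widened so the same code also builds under ROCm. As a rough C++ sketch of the guard pairs (illustrative only, not taken verbatim from the patch):

// Host-side code keys off Paddle's build flags.
// Before: compiled only in CUDA builds.
#ifdef PADDLE_WITH_CUDA
// ... kernel registration, allocation, stream setup ...
#endif
// After: compiled in CUDA and ROCm builds alike.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// ... the same code, with per-backend calls #ifdef'd inside ...
#endif

// Device-side code keys off the compiler-defined macros instead
// (__NVCC__ for nvcc, __HIPCC__ for hipcc), widened the same way.
#if defined(__NVCC__) || defined(__HIPCC__)
// ... __global__ kernels and their launch sites ...
#endif

Where the two runtimes genuinely diverge (cuRAND vs hipRAND, cuDNN vs MIOpen, NCCL vs RCCL), the hunks keep both call sequences behind an inner #ifdef PADDLE_WITH_HIP, as the per-file diffs show.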
paddle/fluid/operators/CMakeLists.txt

@@ -73,9 +73,11 @@ register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
-if (WITH_GPU)
+if (WITH_GPU OR WITH_ROCM)
+  if (WITH_ROCM)
+    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
   # warpctc_op needs cudnn 7 above
-  if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+  elseif (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
   else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)

@@ -108,7 +110,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
-if (WITH_GPU)
+if (WITH_GPU OR WITH_ROCM)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
 endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)

@@ -139,9 +141,12 @@ cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_t
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
-nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor generator)
 if (WITH_GPU)
+  nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor generator)
   nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3)
+elseif (WITH_ROCM)
+  hip_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor generator)
+  hip_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3)
 else()
   cc_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc DEPS tensor device_context eigen3)
 endif()

paddle/fluid/operators/bmm_op.cu

@@ -11,7 +11,7 @@
 #include "paddle/fluid/operators/bmm_op.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     bmm, ops::BmmKernel<paddle::platform::CUDADeviceContext, float>,

paddle/fluid/operators/cholesky_op.cu

@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifndef PADDLE_WITH_HIP
+// HIP not support cusolver
 #include <thrust/device_vector.h>
 #include <algorithm>
 #include <vector>

@@ -164,3 +167,5 @@ REGISTER_OP_CUDA_KERNEL(
     cholesky_grad,
     ops::CholeskyGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::CholeskyGradKernel<paddle::platform::CUDADeviceContext, double>);
+#endif  // not PADDLE_WITH_HIP

paddle/fluid/operators/clip_op.h

@@ -25,7 +25,7 @@ namespace operators {
 using framework::Tensor;
 using platform::Transform;
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename UnaryOperation>
 __global__ void ClipCudaKernel(const T* input, T* out, int num,
                                UnaryOperation op) {

@@ -105,7 +105,7 @@ class ClipKernel : public framework::OpKernel<T> {
     const T* x_data = x->data<T>();
     int64_t numel = x->numel();
     if (platform::is_gpu_place(context.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       int threads = 256;
       int blocks = (numel + threads - 1) / threads;
       ClipCudaKernel<T, ClipFunctor<T>><<<
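
A side note on the clip_op.h change: hipcc accepts the same triple-chevron launch syntax as nvcc, which is why widening the __NVCC__ guard to also match __HIPCC__ is the entire port here. A minimal self-contained sketch of the same structure (hypothetical kernel, not Paddle code):

#if defined(__NVCC__) || defined(__HIPCC__)
// Grid-stride elementwise kernel; compiles unchanged under nvcc and hipcc.
template <typename T, typename UnaryOp>
__global__ void ElementwiseKernel(const T* in, T* out, int num, UnaryOp op) {
  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < num;
       idx += blockDim.x * gridDim.x) {
    out[idx] = op(in[idx]);
  }
}

template <typename T, typename UnaryOp>
void LaunchElementwise(const T* in, T* out, int num, UnaryOp op) {
  int threads = 256;
  int blocks = (num + threads - 1) / threads;
  // HIP supports the CUDA <<<grid, block>>> launch form, so no #ifdef needed.
  ElementwiseKernel<T, UnaryOp><<<blocks, threads>>>(in, out, num, op);
}
#endif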

paddle/fluid/operators/coalesce_tensor_op.cc

@@ -289,7 +289,7 @@ REGISTER_OP_CPU_KERNEL(
     ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>,
     ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, double>);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 REGISTER_OP_CUDA_KERNEL(
     coalesce_tensor,
     ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext,

paddle/fluid/operators/correlation_op.cu

@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifndef PADDLE_WITH_HIP
+// HIP not supported yet
 #include <algorithm>
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"

@@ -480,3 +483,5 @@ REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel<float>,
                         ops::CorrelationCUDAKernel<double>);
 REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel<float>,
                         ops::CorrelationCUDAGradKernel<double>);
+#endif  // not PADDLE_WITH_HIP

paddle/fluid/operators/cudnn_lstm_op.cu.cc

@@ -14,9 +14,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/cudnn_lstm_cache.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/utils.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/operators/cudnn_lstm_cache.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/miopen_lstm_cache.h"
+#endif
 namespace paddle {
 namespace platform {

@@ -54,7 +59,7 @@ int size_sum(const std::vector<const Tensor *> &weight_list) {
 }
 template <typename T>
-void weight_to_tensor(const platform::Place &place, cudaStream_t stream,
+void weight_to_tensor(const platform::Place &place, gpuStream_t stream,
                       const std::vector<const Tensor *> &weight_list,
                       Tensor *weight) {
   auto weight_data = weight->data<T>();

@@ -72,7 +77,7 @@ void weight_to_tensor(const platform::Place &place, cudaStream_t stream,
 }
 template <typename T>
-void weight_to_tensor_list(const platform::Place &place, cudaStream_t stream,
+void weight_to_tensor_list(const platform::Place &place, gpuStream_t stream,
                            std::vector<Tensor *> *weight_grad,
                            const std::vector<const Tensor *> &weight_input,
                            const Tensor *weight) {

@@ -92,23 +97,36 @@ void weight_to_tensor_list(const platform::Place &place, cudaStream_t stream,
 }
 template <typename T>
+#ifdef PADDLE_WITH_HIP
+void LSTMInferece(const bool &has_seq_length, const miopenHandle_t &handle,
+#else
 void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle,
+#endif
                   const int &seq_length, ScopedRNNBase *rnn, const T *x_data,
                   const T *init_h_data, const T *init_c_data, const T *w_data,
                   T *out_data, T *last_h_data, T *last_c_data,
                   framework::Tensor *workspace_data,
                   const size_t &workspace_size) {
   if (!has_seq_length) {
-    // for inference
-    // This interface is used when the input/output is unpadded.
+// for inference
+// This interface is used when the input/output is unpadded.
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardInference(
+        handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data,
+        rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data,
+        rnn->weight_desc(), w_data, rnn->y_descs(), out_data,
+        rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data,
+        workspace_data->data<uint8_t>(), workspace_size));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
         handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data,
        rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data,
         rnn->weight_desc(), w_data, rnn->y_descs(), out_data,
         rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data,
         workspace_data->data<uint8_t>(), workspace_size));
+#endif
   } else {
-#if CUDNN_VERSION >= 7201
+#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201
     // for inference
     // This interface is used when the input/output is padded.
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx(

@@ -256,8 +274,17 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
                    last_c_data, &workspace_data_, workspace_size);
     } else {
       if (!has_seq_length) {
-        // for train
-        // This interface is used when the input/output is unpadded.
+// for train
+// This interface is used when the input/output is unpadded.
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardTraining(
+            handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data,
+            rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
+            rnn.weight_desc(), w_data, rnn.y_descs(), out_data,
+            rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data,
+            workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
+            reserve_size));
+#else
         PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
             handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data,
             rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,

@@ -265,8 +292,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
             rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data,
             workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
             reserve_size));
+#endif
       } else {
-#if CUDNN_VERSION >= 7201
+#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201
         // for train
         // This interface is used when the input/output is padded.
         PADDLE_ENFORCE_CUDA_SUCCESS(

@@ -403,7 +431,23 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     const uint8_t *reserve_data = reserve->data<uint8_t>();
     if (!has_seq_length) {
-      // This interface is used when the input/output is unpadded.
+// This interface is used when the input/output is unpadded.
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardData(
+          handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data,
+          rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data,
+          rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data,
+          rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
+          rnn.x_descs(), in_grad_data, rnn.init_h_desc(), init_h_grad_data,
+          rnn.init_c_desc(), init_c_grad_data, workspace_data_.data<uint8_t>(),
+          workspace_size, const_cast<uint8_t *>(reserve_data), reserve_size));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardWeights(
+          handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data<T>(),
+          rnn.init_h_desc(), init_h->data<T>(), rnn.y_descs(), out->data<T>(),
+          rnn.weight_desc(), weight_grad_data, workspace_data_.data<uint8_t>(),
+          workspace_size, const_cast<uint8_t *>(reserve_data), reserve_size));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
           handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data,
           rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data,

@@ -418,8 +462,9 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
           rnn.init_h_desc(), init_h->data<T>(), rnn.y_descs(), out->data<T>(),
           workspace_data_.data<uint8_t>(), workspace_size, rnn.weight_desc(),
           weight_grad_data, const_cast<uint8_t *>(reserve_data),
           reserve_size));
+#endif
     } else {
-#if CUDNN_VERSION >= 7201
+#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201
       // for train
       // This interface is used when the input/output is padded.
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx(

@@ -452,7 +497,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>);
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>);
+#else
 REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>,
                         ops::CudnnLSTMGPUKernel<double>);
 REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>,
                         ops::CudnnLSTMGPUGradKernel<double>);
+#endif
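
Two things make the cudnn_lstm port above mostly mechanical. First, the weight_to_tensor helpers switch from cudaStream_t to gpuStream_t, a backend-neutral stream alias Paddle defines in its platform headers; it presumably reduces to something like this sketch (an assumption about its shape, not the actual header):

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;   // assumed definition, for illustration
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;  // assumed definition, for illustration
#endif

Second, MIOpen mirrors cuDNN's RNN entry points closely: miopenRNNForwardInference/Training and miopenRNNBackwardData/Weights take the same descriptor-and-pointer sequence as their cudnn counterparts, so each HIP branch is nearly a textual substitution. The padded-sequence path (cudnnRNNForwardInferenceEx and friends) is excluded under HIP, hence the added !defined(PADDLE_WITH_HIP) in those version checks, and the float-only registration reflects the "MIOPEN do not support double" note in the patch.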

paddle/fluid/operators/cumsum_op.cu

@@ -16,7 +16,13 @@ limitations under the License. */
 #include <thrust/device_vector.h>
 #include <thrust/reverse.h>
 #include <thrust/scan.h>
-#include "cub/cub.cuh"
+#ifdef __NVCC__
+#include <cub/cub.cuh>
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
 #include "paddle/fluid/operators/cum_op.h"
 #include "paddle/fluid/platform/gpu_launch_config.h"
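
The namespace alias above does the heavy lifting: hipCUB mirrors CUB's namespace layout, so after "namespace cub = hipcub;" the existing cub:: call sites in cumsum_op.cu compile unchanged under ROCm. A small illustrative call site (hypothetical helper, not Paddle code):

// Query the scratch-buffer size for an inclusive scan. With the alias in
// place, cub::DeviceScan resolves to hipcub::DeviceScan in ROCm builds.
template <typename T>
size_t ScanTempStorageBytes(const T* d_in, T* d_out, int num_items) {
  size_t temp_storage_bytes = 0;
  // Passing nullptr as the temp buffer makes CUB/hipCUB only report the size.
  cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, d_in, d_out,
                                num_items);
  return temp_storage_bytes;
}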

paddle/fluid/operators/data_norm_op.cu

@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/data_norm_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

@@ -174,7 +174,7 @@ class DataNormGradKernel<platform::CUDADeviceContext, T>
                    d_batch_sum, d_batch_square_sum);
     if (need_sync_stats) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace());
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
           reinterpret_cast<const void *>(d_batch_size),

@@ -188,7 +188,11 @@ class DataNormGradKernel<platform::CUDADeviceContext, T>
           reinterpret_cast<const void *>(d_batch_square_sum),
           reinterpret_cast<void *>(d_batch_square_sum), C,
           platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream));
+#ifdef PADDLE_WITH_RCCL
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
           "PaddlePaddle should compile with GPU, and need_sync_stats connot be "
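
RCCL mirrors the NCCL API, exporting the same symbols (ncclAllReduce, ncclSum, and so on), which is why the all-reduce call above survives untouched and only the feature-test macro and the stream synchronization differ. A sketch of that one backend-specific piece as a hypothetical helper (Paddle instead inlines the #ifdef with PADDLE_ENFORCE_CUDA_SUCCESS, as shown above):

#ifdef PADDLE_WITH_RCCL
#include <hip/hip_runtime.h>
inline void SyncComputeStream(hipStream_t stream) {
  // hipStreamSynchronize mirrors cudaStreamSynchronize one-for-one.
  (void)hipStreamSynchronize(stream);
}
#else
#include <cuda_runtime.h>
inline void SyncComputeStream(cudaStream_t stream) {
  (void)cudaStreamSynchronize(stream);
}
#endif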

paddle/fluid/operators/diag_embed_op.h

@@ -100,7 +100,7 @@ class DiagEmbedKernel : public framework::OpKernel<T> {
     strides.push_back(stride[dim1_] + stride[dim2_]);
     const auto dims = vectorize(input->dims());
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     thrust::device_vector<int64_t> dims_vec(dims);
     const int64_t* dims_arr = thrust::raw_pointer_cast(dims_vec.data());
     thrust::device_vector<int64_t> strides_vec(strides);

paddle/fluid/operators/dot_op.h

@@ -45,7 +45,7 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> {
                   const Tensor* tensor_dout, Tensor* tensor_dx,
                   Tensor* tensor_dy,
                   const paddle::framework::ExecutionContext& ctx) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     if (1 == tensor_dout->dims().size()) {
       auto dout = framework::EigenVector<T>::Flatten(*tensor_dout);

@@ -249,7 +249,7 @@ class DotKernel : public framework::OpKernel<T> {
     auto* tensor_out = ctx.Output<Tensor>("Out");
     tensor_out->mutable_data<T>(ctx.GetPlace());
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     if (1 == tensor_out->dims().size()) {
       auto out = framework::EigenScalar<T>::From(*tensor_out);
       auto x = framework::EigenVector<T>::Flatten(*tensor_x);

paddle/fluid/operators/dropout_op.cu

@@ -11,8 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <curand_kernel.h>
+#include "paddle/fluid/platform/dynload/curand.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#include <hiprand_kernel.h>
+#include "paddle/fluid/platform/dynload/hiprand.h"
+#endif
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>

@@ -21,7 +30,6 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/platform/dynload/curand.h"
 #include "paddle/fluid/platform/float16.h"
 namespace paddle {

@@ -32,15 +40,24 @@ __global__ void RandomGenerator(const size_t n, uint64_t seed,
                                 const float dropout_prob, const T* src,
                                 MaskType* mask_data, T* dst,
                                 bool is_upscale_in_train, uint64_t increment) {
-  curandStatePhilox4_32_10_t state;
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
+#ifdef PADDLE_WITH_HIP
+  hiprandStatePhilox4_32_10_t state;
+  hiprand_init(seed, idx, increment, &state);
+#else
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, idx, increment, &state);
+#endif
   MaskType mask;
   T dest;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
     T s = src[idx];
+#ifdef PADDLE_WITH_HIP
+    if (hiprand_uniform(&state) < dropout_prob) {
+#else
     if (curand_uniform(&state) < dropout_prob) {
+#endif
       mask = 0;
       dest = 0;
     } else {

@@ -62,9 +79,15 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
                                           const T* src, MaskType* mask_data,
                                           T* dst, bool is_upscale_in_train,
                                           uint64_t increment) {
+#ifdef PADDLE_WITH_HIP
+  int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  hiprandStatePhilox4_32_10_t state;
+  hiprand_init(seed, idx, increment, &state);
+#else
   int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
   curandStatePhilox4_32_10_t state;
   curand_init(seed, idx, increment, &state);
+#endif
   MaskType mask;
   T dest;

@@ -75,7 +98,11 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
       T src_vec[VecSize];
       LoadT* value = reinterpret_cast<LoadT*>(&src_vec);
       *value = *reinterpret_cast<const LoadT*>(&src[i]);
+#ifdef PADDLE_WITH_HIP
+      float4 rand = hiprand_uniform4(&state);
+#else
       float4 rand = curand_uniform4(&state);
+#endif
       T dest_vec[VecSize];
       MaskType mask_vec[VecSize];

@@ -131,10 +158,17 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     auto* x_data = x->data<T>();
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     if (dropout_prob == 1.0f) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream));
+#endif
       return;
     }

@@ -180,6 +214,20 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
       increment = offset;
     }
+#ifdef __HIPCC__
+    if (vec_size == 4 && size % 4 == 0) {
+      hipLaunchKernelGGL(
+          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
+          config.block_per_grid, config.thread_per_block, 0, stream, size,
+          seed_data, dropout_prob, x_data, mask_data, y_data,
+          upscale_in_train, increment);
+    } else {
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
+                         config.block_per_grid, config.thread_per_block, 0,
+                         stream, size, seed_data, dropout_prob, x_data,
+                         mask_data, y_data, upscale_in_train, increment);
+    }
+#else
     if (vec_size == 4 && size % 4 == 0) {
       VectorizedRandomGenerator<T, uint8_t,

@@ -192,7 +240,7 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
           size, seed_data, dropout_prob, x_data, mask_data, y_data,
           upscale_in_train, increment);
     }
+#endif
    } else {
      auto X = EigenMatrix<T>::Reshape(*x, 1);
      auto Y = EigenMatrix<T>::Reshape(*y, 1);
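
The dropout hunks show the other launch idiom used in this series: on the HIP side kernels go through hipLaunchKernelGGL(kernel, grid, block, sharedMemBytes, stream, args...), the macro form of the CUDA <<<grid, block, shared, stream>>> launch, with HIP_KERNEL_NAME protecting template instantiations from the macro expansion. A compact sketch with a hypothetical kernel (not Paddle code, reusing the gpuStream_t alias sketched earlier):

// Scale each element by alpha; the kernel body is identical for both backends.
template <typename T>
__global__ void ScaleKernel(int num, T alpha, const T* in, T* out) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx < num) out[idx] = alpha * in[idx];
}

template <typename T>
void LaunchScale(int num, T alpha, const T* in, T* out, gpuStream_t stream) {
  int threads = 256;
  int blocks = (num + threads - 1) / threads;
#ifdef __HIPCC__
  // HIP_KERNEL_NAME keeps the template instantiation intact inside the macro.
  hipLaunchKernelGGL(HIP_KERNEL_NAME(ScaleKernel<T>), dim3(blocks),
                     dim3(threads), 0, stream, num, alpha, in, out);
#else
  ScaleKernel<T><<<blocks, threads, 0, stream>>>(num, alpha, in, out);
#endif
}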

paddle/fluid/operators/dropout_op.h

@@ -42,7 +42,7 @@ inline int VectorizedSize(const T* pointer) {
   return 1;
 }
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename MaskType, int VecSize>
 __global__ void DropoutGradCUDAKernel(const T* dout, const MaskType* mask,
                                       const T factor, const int64_t size,

@@ -186,7 +186,7 @@ class DropoutGradKernel : public framework::OpKernel<T> {
     int vec_size = VectorizedSize<T>(grad_y->data<T>());
     if (platform::is_gpu_place(context.GetPlace()) && vec_size == 4 &&
         size % 4 == 0) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
       auto stream = context.cuda_device_context().stream();
       platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(

paddle/fluid/operators/fake_quantize_op.cu

@@ -162,7 +162,11 @@ struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
     int grid = cout;
     int max_threads = 1024;
+#ifdef PADDLE_WITH_HIP
+    hipMemset(out_abs_max, 0, sizeof(T) * cout);
+#else
     cudaMemset(out_abs_max, 0, sizeof(T) * cout);
+#endif
     for (int i = 0; i < cin / max_threads; i++) {
       int block = max_threads;

paddle/fluid/operators/fill_constant_batch_size_like_op.h

@@ -65,7 +65,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
       functor(reinterpret_cast<const platform::CPUDeviceContext&>(dev_ctx),
               out, static_cast<T>(value));
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (!cpu_place) {
       math::SetConstant<platform::CUDADeviceContext, T> functor;
       out->mutable_data(ctx.GetPlace(), data_type);

paddle/fluid/operators/fill_constant_op.h

@@ -121,7 +121,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
       functor(reinterpret_cast<const platform::CPUDeviceContext&>(dev_ctx),
               tensor, static_cast<T>(value));
     } else if (actual_place == 1) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       tensor->mutable_data(ctx.GetPlace(), data_type);
       math::SetConstant<platform::CUDADeviceContext, T> functor;
       functor(reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx),

@@ -131,7 +131,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
           "PaddlePaddle should compile with GPU."));
 #endif
     } else if (actual_place == 2) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       tensor->mutable_data(platform::CUDAPinnedPlace(), data_type);
       math::SetConstant<platform::CPUDeviceContext, T> functor;
       functor(reinterpret_cast<const platform::CPUDeviceContext&>(dev_ctx),

paddle/fluid/operators/filter_by_instag_op.h

@@ -31,7 +31,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 using SelectedRows = framework::SelectedRows;
 using LoDTensor = framework::LoDTensor;
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T>
 using Vector = framework::Vector<T>;
 #else

paddle/fluid/operators/gelu_op.h

@@ -54,7 +54,8 @@ struct GeluFunctor {
       }
     } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
+    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \
+    !defined(PADDLE_WITH_HIP)
       auto x_data = x.data();
       auto out_data = out.data();
       int n = std::min(x.size(), out.size());

@@ -121,7 +122,8 @@ struct GeluGradFunctor {
       }
     } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
+    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \
+    !defined(PADDLE_WITH_HIP)
       auto x_data = x.data();
       auto dx_data = dx.data();
       auto dout_data = dout.data();

paddle/fluid/operators/get_tensor_from_selected_rows_op.cc

@@ -107,7 +107,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
                                ops::GetTensorFromSelectedRowsKernel, int64_t,
                                ops::GetTensorFromSelectedRowsKernel);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
                                 ops::GetTensorFromSelectedRowsKernel, double,
                                 ops::GetTensorFromSelectedRowsKernel, int,