Unverified commit f9377965, authored by Qi Li, committed by GitHub

[ROCM] fix dropout and remove hipcub, test=develop (#31455)

Parent fadabbe9
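The C++ side of this change follows one pattern throughout the hunks below: instead of keeping a parallel `#ifdef PADDLE_WITH_HIP` / `hipcub::` branch next to every `cub::` call, each file that includes `<hipcub/hipcub.hpp>` now declares the alias `namespace cub = hipcub;` once, so a single `cub::`-spelled call site compiles for both the CUDA and the ROCm toolchains. A minimal sketch of the idiom (illustrative only; the surrounding include logic is simplified, not copied from the tree):

    #ifdef __HIPCC__
    #include <hipcub/hipcub.hpp>
    namespace cub = hipcub;  // every cub:: reference below resolves to hipcub::
    #else
    #include <cub/cub.cuh>
    #endif

    // One call site serves both backends; no per-backend #ifdef is needed.
    template <typename KeyT>
    void SortDescendingSketch(const KeyT* keys_in, KeyT* keys_out,
                              const int* idx_in, int* idx_out, int num,
                              void* temp, size_t& temp_bytes) {
      cub::DeviceRadixSort::SortPairsDescending<KeyT, int>(
          temp, temp_bytes, keys_in, keys_out, idx_in, idx_out, num);
    }

With the alias in place, the hipcub-only branches become dead weight, which is why every `#ifdef PADDLE_WITH_HIP ... #else ... #endif` pair below collapses to the plain `cub::` lines.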
@@ -23,6 +23,7 @@ limitations under the License. */
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
 #include "paddle/fluid/platform/miopen_helper.h"
+namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -64,27 +65,16 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,
   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
-#ifdef PADDLE_WITH_HIP
-  hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
-      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
-#else
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
-#endif
   // Allocate temporary storage
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
   // Run sorting operation
-#ifdef PADDLE_WITH_HIP
-  hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
-      d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
-      idx_out, num);
-#else
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
       idx_out, num);
-#endif
 }
 
 template <typename T>
......
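For readers unfamiliar with the CUB/hipCUB device-wide API used in `SortDescending` above: `DeviceRadixSort::SortPairsDescending` is always called twice, first with a null workspace pointer so it only reports the required temporary-storage size, then again with the allocated workspace to actually sort. A self-contained sketch of that two-phase convention (hypothetical buffer names, raw `cudaMalloc` instead of Paddle's `memory::Alloc`):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Sorts `num` (score, index) pairs in descending score order.
    void SortPairsDescendingTwoPhase(const float* d_keys_in, float* d_keys_out,
                                     const int* d_vals_in, int* d_vals_out,
                                     int num) {
      size_t temp_bytes = 0;
      // Phase 1: null storage pointer -> only temp_bytes is written, no sorting.
      cub::DeviceRadixSort::SortPairsDescending(nullptr, temp_bytes, d_keys_in,
                                                d_keys_out, d_vals_in, d_vals_out,
                                                num);
      void* d_temp = nullptr;
      cudaMalloc(&d_temp, temp_bytes);
      // Phase 2: the same call with real storage performs the sort.
      cub::DeviceRadixSort::SortPairsDescending(d_temp, temp_bytes, d_keys_in,
                                                d_keys_out, d_vals_in, d_vals_out,
                                                num);
      cudaFree(d_temp);
    }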
@@ -14,6 +14,7 @@ limitations under the License. */
 #endif
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include <paddle/fluid/memory/allocation/allocator.h>
@@ -141,29 +142,17 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
-        nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
-        idx_out, total_roi_num);
-#else
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
        idx_out, total_roi_num);
-#endif
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort score to get corresponding index
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
-        keys_out, idx_in, idx_out, total_roi_num);
-#else
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
        keys_out, idx_in, idx_out, total_roi_num);
-#endif
     index_out_t.Resize({real_post_num});
     Tensor sorted_rois;
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
@@ -185,29 +174,17 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     out_id_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
     // Determine temporary device storage requirements
     temp_storage_bytes = 0;
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(
-        nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
-        batch_idx_in, index_out_t.data<int>(), real_post_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
        batch_idx_in, index_out_t.data<int>(), real_post_num);
-#endif
     // Allocate temporary storage
     d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort batch_id to get corresponding index
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
-        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
-#endif
     GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
......
@@ -17,6 +17,7 @@ limitations under the License. */
 #endif
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include <paddle/fluid/memory/allocation/allocator.h>
@@ -149,42 +150,24 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
-                                                 target_lvls_data, keys_out,
-                                                 idx_in, idx_out, roi_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
                                               target_lvls_data, keys_out,
                                               idx_in, idx_out, roi_num);
-#endif
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
     // Run sorting operation
     // sort target level to get corresponding index
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
-        idx_in, idx_out, roi_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
        idx_in, idx_out, roi_num);
-#endif
     int* restore_idx_data =
         restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
     // sort current index to get restore index
-#ifdef PADDLE_WITH_HIP
-    hipcub::DeviceRadixSort::SortPairs<int, int>(
-        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
-        restore_idx_data, roi_num);
-#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
        restore_idx_data, roi_num);
-#endif
     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
......
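The last `SortPairs` call in the hunk above (keys = `idx_out`, values = `idx_in`) deserves a note: sorting the permutation produced by the first sort, with the running positions as values, yields the inverse permutation, and that inverse is what `restore_idx_data` stores so the original RoI order can be recovered later. A small host-side illustration of the same trick (hypothetical example using `std::sort` in place of the device radix sort):

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      // perm[i] = original position of the element now at sorted position i.
      std::vector<int> perm = {2, 0, 3, 1};
      std::vector<int> pos(perm.size());
      std::iota(pos.begin(), pos.end(), 0);          // pos = {0, 1, 2, 3}
      // Sorting the positions with perm as the key produces the inverse
      // permutation: restore[j] = sorted slot that original element j went to.
      std::sort(pos.begin(), pos.end(),
                [&](int a, int b) { return perm[a] < perm[b]; });
      for (int v : pos) std::printf("%d ", v);       // prints: 1 3 0 2
      return 0;
    }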
@@ -17,6 +17,7 @@ limitations under the License. */
 #endif
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/group_norm_op.h"
@@ -46,18 +47,10 @@ enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
 template <typename T>
 __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
-#ifdef PADDLE_WITH_CUDA
   typedef cub::WarpReduce<T> WarpReduce;
-#else
-  typedef hipcub::WarpReduce<T> WarpReduce;
-#endif
   typename WarpReduce::TempStorage temp_storage;
   value = WarpReduce(temp_storage).Sum(value);
-#ifdef PADDLE_WITH_CUDA
   if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
-#else
-  if (hipcub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
-#endif
 }
 
 template <typename T>
......
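`CudaAtomicAddWithWarp` in the group_norm hunk above pairs a warp-level reduction with a single atomic: all lanes feed their values into `cub::WarpReduce`, and only lane 0 performs the atomic add, so each warp issues one atomic instead of one per thread. A minimal stand-alone CUDA sketch of the same pattern (hypothetical kernel; plain `atomicAdd` instead of Paddle's `platform::CudaAtomicAdd`):

    #include <cub/cub.cuh>

    // Each block sums its elements into *out with one atomicAdd per warp.
    __global__ void WarpAggregatedSum(const float* in, float* out, int n) {
      using WarpReduce = cub::WarpReduce<float>;
      // One TempStorage per warp; blockDim.x is assumed to be <= 128 here.
      __shared__ WarpReduce::TempStorage temp_storage[4];
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      float v = (i < n) ? in[i] : 0.0f;
      int warp_id = threadIdx.x / 32;
      float warp_sum = WarpReduce(temp_storage[warp_id]).Sum(v);
      if (cub::LaneId() == 0) atomicAdd(out, warp_sum);  // one atomic per warp
    }

A launch such as `WarpAggregatedSum<<<num_blocks, 128>>>(d_in, d_out, n)` with `*d_out` zero-initialized would accumulate the full sum.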
...@@ -369,19 +369,7 @@ struct KronGradOpFunctor { ...@@ -369,19 +369,7 @@ struct KronGradOpFunctor {
for_range(func); for_range(func);
// reduce_sum along aixs 1 // reduce_sum along aixs 1
#ifdef __HIPCC__ #if defined(__NVCC__) || defined(__HIPCC__)
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
dout_x, dx, {1}, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
}
if (dy) {
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
dout_y, dy, {1}, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
}
#elif defined(__NVCC__)
auto stream = dev_ctx.stream(); // it is a cuda device_context auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) { if (dx) {
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>( TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
......
@@ -45,12 +45,7 @@ template <typename DeviceContext, typename T>
 void ReduceSumForMatmulGrad(const Tensor* input, Tensor* output,
                             const std::vector<int>& reduce_dims,
                             const paddle::framework::ExecutionContext& ctx) {
-#ifdef __HIPCC__
-  auto stream = ctx.cuda_device_context().stream();
-  TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
-      *input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
-      IdentityFunctor<T>(), stream);
-#elif defined(__NVCC__)
+#if defined(__NVCC__) || defined(__HIPCC__)
   auto stream = ctx.cuda_device_context().stream();
   TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
       *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
......
@@ -213,12 +213,7 @@ class PoolKernel : public framework::OpKernel<T> {
       if (reduce_num > 0 &&
           adaptive) {  // for adaptive_avg_pool2d && output_size == 1
-#ifdef __HIPCC__
-        auto stream = dev_ctx.stream();
-        TensorReduce<T, T, hipcub::Sum, DivideFunctor<T>>(
-            *in_x, out, reduce_dim, static_cast<T>(0), hipcub::Sum(),
-            DivideFunctor<T>(reduce_num), stream);
-#elif defined(__NVCC__)
+#if defined(__HIPCC__) || defined(__NVCC__)
         auto stream = dev_ctx.stream();
         TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
             *in_x, out, reduce_dim, static_cast<T>(0), cub::Sum(),
......
@@ -174,15 +174,9 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
       reduce_dims.push_back(i);
     }
 
-#ifdef __HIPCC__
-    TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
-        dalpha_tmp, dalpha, reduce_dims, static_cast<T>(0), hipcub::Sum(),
-        IdentityFunctor<T>(), stream);
-#else
     TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
         dalpha_tmp, dalpha, reduce_dims, static_cast<T>(0), cub::Sum(),
         IdentityFunctor<T>(), stream);
-#endif
   }
 };
......
@@ -26,6 +26,7 @@
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include "paddle/fluid/framework/tensor.h"
@@ -71,12 +72,7 @@ template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
 __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
                                TransformOp transformer, Ty init,
                                int reduce_num) {
-#ifdef __HIPCC__
-  __shared__
-      typename hipcub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#else
   __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#endif
   int idx_x = blockIdx.x * reduce_num;
   int idx_y = threadIdx.x;
   Ty reduce_var = init;
@@ -85,13 +81,8 @@ __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
         reducer(reduce_var, static_cast<Ty>(transformer(x[idx_x + idx_y])));
   __syncthreads();
 
-#ifdef __HIPCC__
-  reduce_var = hipcub::BlockReduce<Ty, BlockDim>(temp_storage)
-                   .Reduce(reduce_var, reducer);
-#else
   reduce_var =
       cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-#endif
 
   if (threadIdx.x == 0) {
     y[blockIdx.x] = reduce_var;
@@ -107,12 +98,7 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
                              Array<int, ReduceRank> reduce_strides,
                              Array<int, Rank - ReduceRank> left_dim,
                              Array<int, Rank - ReduceRank> left_strides) {
-#ifdef __HIPCC__
-  __shared__
-      typename hipcub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#else
   __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#endif
   Array<int, Rank> sub_index;
   int left_idx = blockIdx.x;
   for (int i = 0; i < Rank - ReduceRank; ++i) {
@@ -144,13 +130,8 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
   }
   __syncthreads();
 
-#ifdef __HIPCC__
-  reduce_var = hipcub::BlockReduce<Ty, BlockDim>(temp_storage)
-                   .Reduce(reduce_var, reducer);
-#else
   reduce_var =
       cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-#endif
 
   if (threadIdx.x == 0) {
     y[blockIdx.x] = reduce_var;
...@@ -238,32 +219,17 @@ static void TensorReduceImpl( ...@@ -238,32 +219,17 @@ static void TensorReduceImpl(
int rank = x_strides.size(); int rank = x_strides.size();
int reduce_rank = reduce_strides.size(); int reduce_rank = reduce_strides.size();
if (rank == reduce_rank) { if (rank == reduce_rank) {
#ifdef __HIPCC__
hipcub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
#else
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x( cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer); x_data, transformer);
#endif
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
#ifdef __HIPCC__
hipcub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream);
#else
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream); reduce_num, reducer, init, stream);
#endif
framework::Tensor tmp; framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>( auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}), framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place); place);
#ifdef __HIPCC__
hipcub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x,
y_data, reduce_num, reducer, init, stream);
#else
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
reduce_num, reducer, init, stream); reduce_num, reducer, init, stream);
#endif
return; return;
} }
if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
......
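In the `rank == reduce_rank` branch of `TensorReduceImpl` above, the whole tensor collapses to one value: a `cub::TransformInputIterator` applies the element-wise transform (an `IdentityFunctor`, `DivideFunctor`, and so on) on the fly while `cub::DeviceReduce::Reduce` consumes it, again with the size-query-then-run calling convention. A self-contained sketch of that fused transform-reduce (hypothetical `Square` functor):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    struct Square {
      __host__ __device__ float operator()(float x) const { return x * x; }
    };

    // Sum of squares of n elements, computed as one fused transform + reduce.
    void SumOfSquares(const float* d_in, float* d_out, int n,
                      cudaStream_t stream) {
      cub::TransformInputIterator<float, Square, const float*> it(d_in, Square());
      size_t temp_bytes = 0;
      cub::DeviceReduce::Reduce(nullptr, temp_bytes, it, d_out, n,
                                cub::Sum(), 0.0f, stream);   // size query only
      void* d_temp = nullptr;
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceReduce::Reduce(d_temp, temp_bytes, it, d_out, n,
                                cub::Sum(), 0.0f, stream);   // actual reduction
      cudaFree(d_temp);
    }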
@@ -56,15 +56,9 @@ class ReduceMeanKernel : public framework::OpKernel<T> {
     }
 
     auto stream = context.cuda_device_context().stream();
-#ifdef PADDLE_WITH_HIP
-    TensorReduce<T, T, hipcub::Sum, DivideFunctor<T>>(
-        *input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
-        DivideFunctor<T>(reduce_num), stream);
-#else
     TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
         *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
         DivideFunctor<T>(reduce_num), stream);
-#endif
   }
 };
......
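Note how `ReduceMeanKernel` above reuses the generic sum reduction: the reducer stays `cub::Sum()` and the transform divides every element by the number of reduced elements, so the accumulated result is the mean. The transform functors are small callables along these lines (a sketch of the intended shape, not Paddle's exact definitions):

    template <typename T>
    struct IdentityFunctor {
      __host__ __device__ T operator()(const T& x) const { return x; }
    };

    template <typename T>
    struct DivideFunctor {
      explicit DivideFunctor(int n) : n_inv_(static_cast<T>(1.0 / n)) {}
      // Applied to every input element before the sum, so the accumulated
      // value is sum(x_i) / n, i.e. the mean.
      __host__ __device__ T operator()(const T& x) const { return x * n_inv_; }
     private:
      T n_inv_;
    };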
...@@ -56,25 +56,13 @@ class ReduceSumKernel : public framework::OpKernel<T> { ...@@ -56,25 +56,13 @@ class ReduceSumKernel : public framework::OpKernel<T> {
if (out_dtype >= 0) { if (out_dtype >= 0) {
framework::VisitDataTypeSmall( framework::VisitDataTypeSmall(
static_cast<framework::proto::VarType::Type>(out_dtype), static_cast<framework::proto::VarType::Type>(out_dtype),
#ifdef __HIPCC__
TensorReduceFunctor<T, hipcub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<double>(0.0),
hipcub::Sum(), IdentityFunctor<T>(), stream));
#else
TensorReduceFunctor<T, cub::Sum, IdentityFunctor<T>>( TensorReduceFunctor<T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<double>(0.0), cub::Sum(), *input, output, reduce_dims, static_cast<double>(0.0), cub::Sum(),
IdentityFunctor<T>(), stream)); IdentityFunctor<T>(), stream));
#endif
} else { } else {
#ifdef __HIPCC__
TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), hipcub::Sum(),
IdentityFunctor<T>(), stream);
#else
TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>( TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(0), cub::Sum(), *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
IdentityFunctor<T>(), stream); IdentityFunctor<T>(), stream);
#endif
} }
} }
}; };
......
@@ -20,6 +20,7 @@ limitations under the License. */
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/math.h"
@@ -31,11 +32,7 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 
 template <typename T, int BlockDim>
-#ifdef __HIPCC__
-using BlockReduce = hipcub::BlockReduce<T, BlockDim>;
-#else
 using BlockReduce = cub::BlockReduce<T, BlockDim>;
-#endif
 
 template <typename T, int BlockDim>
 using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
@@ -57,13 +54,8 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
     T ele = in_data[start + tid];
     max_ele = max_ele > ele ? max_ele : ele;
   }
-#ifdef __HIPCC__
-  max_ele =
-      BlockReduce<T, BlockDim>(temp_storage).Reduce(max_ele, hipcub::Max());
-#else
   max_ele =
       BlockReduce<T, BlockDim>(temp_storage).Reduce(max_ele, cub::Max());
-#endif
   if (threadIdx.x == 0) {
     shared_max_data = max_ele;
   }
...@@ -75,13 +67,8 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod, ...@@ -75,13 +67,8 @@ __global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod,
T ele = in_data[start + tid]; T ele = in_data[start + tid];
sum_data += real_exp(ele - shared_max_data); sum_data += real_exp(ele - shared_max_data);
} }
#ifdef __HIPCC__
sum_data =
BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, hipcub::Sum());
#else
sum_data = sum_data =
BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, cub::Sum()); BlockReduce<T, BlockDim>(temp_storage).Reduce(sum_data, cub::Sum());
#endif
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
shared_sum_data = sum_data; shared_sum_data = sum_data;
} }
...@@ -116,12 +103,7 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data, ...@@ -116,12 +103,7 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data,
T s_d = softmax_data[idx]; T s_d = softmax_data[idx];
result += s_g_d * s_d; result += s_g_d * s_d;
} }
#ifdef __HIPCC__
result =
BlockReduce<T, BlockDim>(temp_storage).Reduce(result, hipcub::Sum());
#else
result = BlockReduce<T, BlockDim>(temp_storage).Reduce(result, cub::Sum()); result = BlockReduce<T, BlockDim>(temp_storage).Reduce(result, cub::Sum());
#endif
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
shared_data = result; shared_data = result;
} }
......
@@ -43,15 +43,9 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
       auto stream = context.cuda_device_context().stream();
       std::vector<int> reduce_dims;
       reduce_dims.push_back(out->dims().size());
-#ifdef __HIPCC__
-      TensorReduce<T, T, hipcub::Sum, IdentityFunctor<T>>(
-          diag, out, reduce_dims, static_cast<T>(0), hipcub::Sum(),
-          IdentityFunctor<T>(), stream);
-#else
       TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
           diag, out, reduce_dims, static_cast<T>(0), cub::Sum(),
           IdentityFunctor<T>(), stream);
-#endif
     }
   }
 };
......
@@ -41,7 +41,11 @@ struct GpuLaunchConfig {
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
     const platform::CUDADeviceContext& context, int element_count,
+#ifdef PADDLE_WITH_HIP
+    int max_threads = 256) {
+#else
     int max_threads = 1024) {
+#endif
   PADDLE_ENFORCE_GT(element_count, 0,
                     platform::errors::InvalidArgument(
                         "element count should be greater than 0,"
......
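The `GpuLaunchConfig` hunk above lowers the default `max_threads` of `GetGpuLaunchConfig1D` from 1024 to 256 when building with HIP, presumably because a smaller default block size behaves better on the ROCm backend. To make the parameter concrete, here is a simplified sketch of how such a 1D launch configuration is typically derived (hypothetical helper, not Paddle's exact implementation):

    #include <algorithm>

    struct LaunchConfig1D {
      int threads_per_block;
      int blocks;
    };

    // Clamp the block size to max_threads, then use enough blocks to cover
    // every element; callers are expected to pass element_count > 0.
    inline LaunchConfig1D GetLaunchConfig1DSketch(int element_count,
    #ifdef PADDLE_WITH_HIP
                                                  int max_threads = 256) {
    #else
                                                  int max_threads = 1024) {
    #endif
      LaunchConfig1D cfg;
      cfg.threads_per_block = std::min(element_count, max_threads);
      cfg.blocks = (element_count + cfg.threads_per_block - 1) /
                   cfg.threads_per_block;
      return cfg;
    }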
 # A image for building paddle binaries
 # Use rocm-terminal base image for both rocm environment
 # When you modify it, please be aware of rocm version
 #
-# Build: ROCM 3.9
+# Build: ROCM 4.0.1
 # cd Paddle/tools/dockerfile
 # docker build -f Dockerfile.rocm \
-#        --build-arg ROCM_VERSION=3.9 \
-#        -t paddlepaddle/paddle-centos-rocm39-dev:latest .
+#        --build-arg ROCM_VERSION=4.0.1 \
+#        -t paddlepaddle/paddle-centos-rocm401-dev:latest .
 #
 # docker run -it --device=/dev/kfd --device=/dev/dri \
 #      --security-opt seccomp=unconfined --group-add video \
-#      paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash
+#      paddlepaddle/paddle-centos-rocm401-dev:latest /bin/bash
 
 FROM centos:7.8.2003
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
@@ -21,7 +21,8 @@ ENV LANGUAGE en_US.UTF-8
 RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlite-devel \
     zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
-    make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
+    make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel \
+    net-tools numactl-devel chrpath
 
 # Install devtoolset-7
 RUN yum install -y yum-utils centos-release-scl && \
@@ -70,7 +71,7 @@ RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
     make -j8 && make install && \
     cd .. && rm -rf git-2.17.1.tar.gz && rm -rf git-2.17.1
 
 ENV GOROOT=/usr/local/go
 ENV GOPATH=/root/gopath
 ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
@@ -82,7 +83,7 @@ RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8
     mkdir /root/gopath/src
 
 # protobuf 3.6.1
 RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/protobuf-cpp-3.6.1.tar.gz && \
     tar xzf protobuf-cpp-3.6.1.tar.gz && \
     cd protobuf-3.6.1 && ./configure && make -j4 && make install && \
     cd .. && rm -f protobuf-cpp-3.6.1.tar.gz && rm -rf protobuf-3.6.1
@@ -91,28 +92,34 @@ RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/p
 RUN cd /opt && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && chmod +x Miniconda3-latest-Linux-x86_64.sh
 RUN mkdir /opt/conda && ./Miniconda3-latest-Linux-x86_64.sh -b -f -p "/opt/conda" && rm -rf Miniconda3-latest-Linux-x86_64.sh
 ENV PATH=/opt/conda/bin:${PATH}
-RUN conda init bash && \
-    conda create -n python2.7 python=2.7 && \
-    conda create -n python3.7 python=3.7
+RUN conda init bash && conda install -n base jupyter
 
-# install paddle requirement
+# install Paddle requirement
 RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt
 RUN /opt/conda/bin/pip install -r /root/requirements.txt && \
-    /opt/conda/envs/python2.7/bin/pip install -r /root/requirements.txt && \
-    /opt/conda/envs/python3.7/bin/pip install -r /root/requirements.txt && \
     rm -rf /root/requirements.txt
 
 RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/unittest_py/requirements.txt -O /root/requirements.txt
-RUN /opt/conda/bin/pip install -r /root/requirements.txt && \
-    /opt/conda/envs/python2.7/bin/pip install -r /root/requirements.txt && \
-    /opt/conda/envs/python3.7/bin/pip install -r /root/requirements.txt && \
-    rm -rf /root/requirements.txt
+RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
+
+# install PaddleClas requirement
+RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleClas/develop/requirements.txt -O /root/requirements.txt
+RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
+
+# install PaddleDetection requirement
+RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleDetection/develop/requirements.txt -O /root/requirements.txt
+RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt
 
 # configure ssh
 RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
     sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
     sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config
 
+# clang-format 3.8
+RUN wget https://copr.fedorainfracloud.org/coprs/alonid/llvm-3.8.0/repo/epel-7/alonid-llvm-3.8.0-epel-7.repo -P /etc/yum.repos.d/
+RUN yum install -y clang-3.8.0
+ENV PATH=/opt/llvm-3.8.0/bin:${PATH}
+
 # patchelf
 RUN yum install -y patchelf && \
     yum clean all && \
......