apply gcc12 to gpups (#52960)

* apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpips * apply gcc12 to gpups * apply gcc12 to gpups * test * test * apply gcc12 to gpups * apply_gcc12_to_gpups * fix compiler bug * fix compiler bug * test * fix dangling-pointer compiler * fix dangling-pointer compiler * fix dangling-pointer compiler * apply_gcc12_to_gpups * apply gcc12 to gpups * Update cuda_streams_py.cc

apply gcc12 to gpups (#52960)
* apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpups * apply gcc12 to gpips * apply gcc12 to gpups * apply gcc12 to gpups * test * test * apply gcc12 to gpups * apply_gcc12_to_gpups * fix compiler bug * fix compiler bug * test * fix dangling-pointer compiler * fix dangling-pointer compiler * fix dangling-pointer compiler * apply_gcc12_to_gpups * apply gcc12 to gpups * Update cuda_streams_py.cc
cbfd43e4 · risemeup1 · GitHub · 328195d7 · cbfd43e4 · cbfd43e4
8 changed files
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
@@ -48,7 +48,7 @@ PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
 TEST(Benchmark, EagerScaleCUDA) {
  eager_test::InitEnv(paddle::platform::CUDAPlace());
-  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+  for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) {
    paddle::framework::DDim ddim = phi::make_ddim({2, 4, 4, 4});
    paddle::Tensor tensor = CreateTensorWithValue(ddim,
                                                  paddle::platform::CUDAPlace(),
@@ -89,7 +89,7 @@ TEST(Benchmark, EagerMatmulCUDA) {
  paddle::platform::CUDAPlace place;
  eager_test::InitEnv(place);
-  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+  for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) {
    paddle::framework::DDim ddimX = phi::make_ddim({2, 2});
    paddle::Tensor X = CreateTensorWithValue(ddimX,
                                             paddle::platform::CUDAPlace(),
@@ -143,7 +143,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) {
  tracer->SetExpectedPlace(place);
  paddle::imperative::SetCurrentTracer(tracer);
-  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+  for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) {
    paddle::framework::DDim ddimX = phi::make_ddim({2, 2});
    paddle::Tensor X = CreateTensorWithValue(ddimX,
                                             paddle::platform::CUDAPlace(),
@@ -197,7 +197,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
  tracer->SetExpectedPlace(place);
  paddle::imperative::SetCurrentTracer(tracer);
-  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+  for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) {
    paddle::framework::DDim ddimX = phi::make_ddim({MLP_M, MLP_N});
    paddle::Tensor X = CreateTensorWithValue(ddimX,
                                             paddle::platform::CUDAPlace(),

--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
@@ -53,7 +53,7 @@ TEST(Benchmark, FluidScaleCUDA) {
  platform::CUDAPlace place;
  eager_test::InitEnv(place);
-  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+  for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) {
    std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
    X->SetOverridedStopGradient(false);
@@ -108,7 +108,7 @@ TEST(Benchmark, FluidMatmulCUDA) {
  platform::CUDAPlace place;
  eager_test::InitEnv(place);
-  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+  for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) {
    std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
    X->SetOverridedStopGradient(false);
    std::shared_ptr<imperative::VarBase> Y(new imperative::VarBase(true, "Y"));
@@ -176,7 +176,7 @@ TEST(Benchmark, FluidMLPCUDA) {
  platform::CUDAPlace place;
  eager_test::InitEnv(place);
-  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+  for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) {
    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));

--- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h
+++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h
@@ -760,6 +760,10 @@ struct FeaturePushValue {
  int mf_dim;
  float mf_g[0];
+  __device__ __forceinline__ FeaturePushValue() = default;
+  __device__ __forceinline__ FeaturePushValue(const FeaturePushValue&) =
+      default;
  __device__ __forceinline__ FeaturePushValue
  operator+(const FeaturePushValue& a) const {
    FeaturePushValue out;

--- a/paddle/fluid/pybind/cuda_streams_py.cc
+++ b/paddle/fluid/pybind/cuda_streams_py.cc
@@ -257,16 +257,16 @@ void BindCudaStream(py::module *m_ptr) {
                  "Priority should be 1(high) or 2(normal) "));
            }
+            auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking;
            if (place == nullptr) {
              int curr_device_id = platform::GetCurrentDeviceId();
              auto place_tmp = platform::CUDAPlace(curr_device_id);
-              place = &place_tmp;
+              new (&self) phi::CUDAStream(place_tmp, priority - 2, stream_flag);
-            }
+            } else {
-            auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking;
              // seting priority 1(high) and 2(normal) correspond to the actual
              // cuda stream priority -1 and 0.
              new (&self) phi::CUDAStream(*place, priority - 2, stream_flag);
+            }
 #else
            PADDLE_THROW(platform::errors::Unavailable(
        "Class CUDAStream can only be initialized on the GPU platform."));

--- a/tools/dockerfile/Dockerfile.ubuntu20
+++ b/tools/dockerfile/Dockerfile.ubuntu20
+# An image for building paddle binaries
+# Use cuda devel base image for both cpu and gpu environment
+# When you modify it, please be aware of cudnn-runtime version
+FROM <baseimg>
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+# ENV variables
+ARG WITH_GPU
+ARG WITH_AVX
+ENV WITH_GPU=${WITH_GPU:-ON}
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV DEBIAN_FRONTEND=noninteractive
+<setcuda>
+ENV HOME /root
+# Add bash enhancements
+COPY paddle/scripts/docker/root/ /root/
+RUN chmod 777 /tmp
+RUN apt-key del 7fa2af80
+RUN rm /etc/apt/sources.list.d/*
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
+RUN apt-get update --allow-unauthenticated && \
+  apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \
+  apt-get update && \
+  apt-get install -y curl wget vim git unzip unrar tar xz-utils libssl-dev bzip2 gzip \ 
+    coreutils ntp language-pack-zh-hans libsm6 libxext6 libxrender-dev libgl1-mesa-glx \
+    bison graphviz libjpeg-dev zlib1g-dev automake locales swig net-tools libtool kmod
+<install_cpu_package>
+# Switch gcc and g++ to version 12.1
+WORKDIR /usr/bin 
+COPY tools/dockerfile/build_scripts /build_scripts 
+RUN bash /build_scripts/install_trt.sh
+# Older versions of patchelf limited the size of the files they could process; this was fixed in:
+# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
+# So install a newer version here.
+RUN bash /build_scripts/install_patchelf.sh
+RUN bash /build_scripts/install_gcc.sh gcc121
+RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ 
+RUN ln -s /usr/local/gcc-12.1/bin/gcc /usr/local/bin/gcc 
+RUN ln -s /usr/local/gcc-12.1/bin/g++ /usr/local/bin/g++ 
+RUN ln -s /usr/local/gcc-12.1/bin/gcc /usr/bin/gcc 
+RUN ln -s /usr/local/gcc-12.1/bin/g++ /usr/bin/g++ 
+ENV PATH=/usr/local/gcc-12.1/bin:$PATH 
+# Install cuDNN 8.4.1, then delete the build scripts that were copied to /build_scripts above.
+RUN bash /build_scripts/install_cudnn.sh cudnn841
+ENV CUDNN_VERSION=8.4.1
+#RUN bash /build_scripts/install_nccl2.sh
+RUN rm -rf /build_scripts
+# install cmake
+WORKDIR /home
+RUN wget -q https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.18.0-Linux-x86_64.tar.gz && rm cmake-3.18.0-Linux-x86_64.tar.gz
+ENV PATH=/home/cmake-3.18.0-Linux-x86_64/bin:$PATH
+RUN apt-get update && \
+  apt-get install -y python3.7 python3.7-dev python3.7-distutils\
+  python3.8 python3.8-dev python3.8-distutils \
+  python3.9 python3.9-dev python3.9-distutils && \
+  apt-get install python-is-python3 && \
+  rm /usr/bin/python && ln -s /usr/bin/python3.7 /usr/bin/python && \
+  rm /usr/bin/python3 && ln -s /usr/bin/python3.7 /usr/bin/python3
+WORKDIR /home
+RUN wget https://files.pythonhosted.org/packages/a7/e0/30642b9c2df516506d40b563b0cbd080c49c6b3f11a70b4c7a670f13a78b/setuptools-50.3.2.zip && apt-get -y install unzip && unzip setuptools-50.3.2.zip
+WORKDIR /home/setuptools-50.3.2
+RUN python3.9 setup.py build && python3.9 setup.py install && \
+  python3.8 setup.py build && python3.8 setup.py install && \
+  python3.7 setup.py build && python3.7 setup.py install
+WORKDIR /home
+RUN wget https://files.pythonhosted.org/packages/28/af/2c76c8aa46ccdf7578b83d97a11a2d1858794d4be4a1610ade0d30182e8b/pip-20.0.1.tar.gz && tar -zxvf pip-20.0.1.tar.gz
+WORKDIR pip-20.0.1
+RUN python3.9 setup.py install && \
+  python3.8 setup.py install && \
+  python3.7 setup.py install
+WORKDIR /home
+RUN rm setuptools-50.3.2.zip pip-20.0.1.tar.gz && \
+    rm -r setuptools-50.3.2 pip-20.0.1
+RUN rm /usr/local/bin/pip && ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip && \
+  rm /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip3
+# Build binutils 2.33.1 from source; remove this step once apt-get supports version 2.27 or higher.
+RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ 
+    tar -xzf binutils-2.33.1.tar.gz && \ 
+    cd binutils-2.33.1 && \
+    ./configure && make -j && make install && cd .. && rm -rf binutils-2.33.1 binutils-2.33.1.tar.gz
+# Install Go and glide
+RUN wget --no-check-certificate -qO- https://paddle-ci.gz.bcebos.com/go1.17.2.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# PATH must be set in a separate ENV instruction from GOROOT; otherwise docker build cannot resolve GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+# install glide
+RUN curl -s -q https://glide.sh/get | sh
+# git credential to skip password typing
+RUN git config --global credential.helper store
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+RUN pip3.7 --no-cache-dir install ipython==5.3.0 && \
+    pip3.7 --no-cache-dir install ipykernel==4.6.0 wheel && \
+    pip3.8 --no-cache-dir install ipython==5.3.0 && \
+    pip3.8 --no-cache-dir install ipykernel==4.6.0 wheel && \
+    pip3.9 --no-cache-dir install ipython==5.3.0 && \
+    pip3.9 --no-cache-dir install ipykernel==4.6.0 wheel
+#For docstring checker
+RUN pip3.7 --no-cache-dir install pytest astroid isort && \
+    pip3.8 --no-cache-dir install pytest astroid isort && \
+    pip3.9 --no-cache-dir install pytest astroid isort
+#For pre-commit
+RUN pip3.7 --no-cache-dir install --upgrade pip && \
+    pip3.8 --no-cache-dir install --upgrade pip && \
+    pip3.9 --no-cache-dir install --upgrade pip
+RUN pip3.7 --no-cache-dir install pre-commit==2.17.0 pylint==2.12.0 && \
+    pip3.8 --no-cache-dir install pre-commit==2.17.0 pylint==2.12.0 && \
+    pip3.9 --no-cache-dir install pre-commit==2.17.0 pylint==2.12.0 && \
+    pip3.7 --no-cache-dir install cpplint==1.6.0 clang-format==13.0.0 && \
+    pip3.8 --no-cache-dir install cpplint==1.6.0 clang-format==13.0.0 && \
+    pip3.9 --no-cache-dir install cpplint==1.6.0 clang-format==13.0.0
+COPY ./python/requirements.txt /root/
+RUN pip3.7 --no-cache-dir install -r /root/requirements.txt && \
+    pip3.8 --no-cache-dir install -r /root/requirements.txt && \
+    pip3.9 --no-cache-dir install -r /root/requirements.txt
+# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+#CMD source ~/.bashrc
+# ccache 3.7.9
+RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
+    tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \
+    ./configure -prefix=/usr/local/ccache-3.7.9 && \
+    make -j8 && make install && \
+    ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \
+    cd ../ && rm -rf ccache-3.7.9 ccache-3.7.9.tar.gz
+# clang+llvm 3.8.0
+RUN wget https://paddle-ci.cdn.bcebos.com/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ 
+    tar xf clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && cd clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04 && \
+    cp -rn * /usr/local && cd .. && rm -rf clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04 && rm -rf clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz 
+EXPOSE 22
--- a/tools/dockerfile/build_scripts/install_gcc.sh
+++ b/tools/dockerfile/build_scripts/install_gcc.sh
@@ -71,4 +71,18 @@ elif [ "$1" == "gcc122" ]; then
  ln -s /usr/local/gcc-12.2/lib64/libgfortran.so.5 ${lib_so_5} && \
  ln -s /usr/local/gcc-12.2/lib64/libstdc++.so.6 ${lib_so_6} && \
  cp /usr/local/gcc-12.2/lib64/libstdc++.so.6.0.30 ${lib_path}
+elif [ "$1" == "gcc121" ]; then
+  wget -q --no-proxy https://paddle-ci.gz.bcebos.com/gcc-12.1.0.tar.gz
+  tar -xzf gcc-12.1.0.tar.gz && \
+  cd gcc-12.1.0 && \
+  unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \
+  ./contrib/download_prerequisites && \
+  cd .. && mkdir temp_gcc121 && cd temp_gcc121 && \
+  ../gcc-12.1.0/configure --prefix=/usr/local/gcc-12.1 --enable-checking=release --enable-languages=c,c++ --disable-multilib && \
+  make -j8 && make install
+  cd .. && rm -rf temp_gcc121 gcc-12.1.0 gcc-12.1.0.tar.gz  # remove the temp_gcc121 build dir created above
+  cp ${lib_so_6} ${lib_so_6}.bak  && rm -f ${lib_so_6} && \
+  ln -s /usr/local/gcc-12.1/lib64/libgfortran.so.5 ${lib_so_5} && \
+  ln -s /usr/local/gcc-12.1/lib64/libstdc++.so.6 ${lib_so_6} && \
+  cp /usr/local/gcc-12.1/lib64/libstdc++.so.6.0.30 ${lib_path}
 fi
--- a/tools/dockerfile/build_scripts/install_nccl2.sh
+++ b/tools/dockerfile/build_scripts/install_nccl2.sh
@@ -41,6 +41,7 @@ elif [ "$VERSION" == "12.0" ]; then
 libnccl-*
    exit 0
  fi
+  DEB="nccl-local-repo-ubuntu2004-2.16.5-cuda12.0_1.0-1_amd64.deb"
 elif [ "$VERSION" == "9.0" ]; then
  DEB="nccl-repo-ubuntu1604-2.3.7-ga-cuda9.0_1-1_amd64.deb"
 else

--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
@@ -147,8 +147,8 @@ function make_ce_framework_dockcerfile(){
 function make_unbuntu18_cu117_dockerfile(){
  dockerfile_name="Dockerfile.cuda117_cudnn8_gcc82_ubuntu18_coverage"
-  sed "s#<baseimg>#nvidia/cuda:11.7.0-cudnn8-devel-ubuntu18.04#g" ./Dockerfile.ubuntu18 >${dockerfile_name}
+  sed "s#<baseimg>#nvidia/cuda:12.0.1-cudnn8-devel-ubuntu20.04#g" ./Dockerfile.ubuntu20 >${dockerfile_name}
-  sed -i "s#<setcuda>#ENV LD_LIBRARY_PATH=/usr/local/cuda-11.7/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" ${dockerfile_name}
+  sed -i "s#<setcuda>#ENV LD_LIBRARY_PATH=/usr/local/cuda-12.0/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" ${dockerfile_name}
  sed -i 's#<install_cpu_package>##g' ${dockerfile_name}
  sed -i "7i ENV TZ=Asia/Beijing" ${dockerfile_name}
  sed -i "8i RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone" ${dockerfile_name}