提交 97d6bc64 编写于 作者: H hedaoyuan

Merge branch 'develop' of https://github.com/baidu/Paddle into FunctionTest

...@@ -30,7 +30,7 @@ include(simd) ...@@ -30,7 +30,7 @@ include(simd)
################################ Configurations ####################################### ################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON) option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
......
...@@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME) ...@@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY})
if(NOT WITH_DSO OR WITH_METRIC) if(NOT WITH_DSO OR WITH_METRIC)
target_link_libraries(${TARGET_NAME} target_link_libraries(${TARGET_NAME}
${CUDNN_LIBRARY} ${CUDNN_LIBRARY}
......
...@@ -27,5 +27,6 @@ paddle train \ ...@@ -27,5 +27,6 @@ paddle train \
--num_passes=300 \ --num_passes=300 \
--save_dir=$output \ --save_dir=$output \
2>&1 | tee $log 2>&1 | tee $log
paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1
python -m paddle.utils.plotcurve -i $log > plot.png python -m paddle.utils.plotcurve -i $log > plot.png
...@@ -19,3 +19,4 @@ paddle train \ ...@@ -19,3 +19,4 @@ paddle train \
--save_dir=./output \ --save_dir=./output \
--num_passes=30 \ --num_passes=30 \
2>&1 |tee 'train.log' 2>&1 |tee 'train.log'
paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1
...@@ -27,5 +27,6 @@ paddle train \ ...@@ -27,5 +27,6 @@ paddle train \
--num_passes=100 \ --num_passes=100 \
--save_dir=$output \ --save_dir=$output \
2>&1 | tee $log 2>&1 | tee $log
paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
python -m paddle.utils.plotcurve -i $log > plot.png python -m paddle.utils.plotcurve -i $log > plot.png
...@@ -25,6 +25,7 @@ log_file="$bin_dir/train.log" ...@@ -25,6 +25,7 @@ log_file="$bin_dir/train.log"
pushd "$home_dir" pushd "$home_dir"
cfg=trainer_config.lr.py cfg=trainer_config.lr.py
paddle train \ paddle train \
--start_pserver=false \
--config=$cfg \ --config=$cfg \
--save_dir=${model_dir} \ --save_dir=${model_dir} \
--trainer_count=4 \ --trainer_count=4 \
......
...@@ -26,5 +26,7 @@ paddle train \ ...@@ -26,5 +26,7 @@ paddle train \
--init_model_path=$model \ --init_model_path=$model \
--config_args=is_predict=1 \ --config_args=is_predict=1 \
--predict_output_dir=. \ --predict_output_dir=. \
2>&1 | tee 'predict.log'
paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
mv rank-00000 result.txt mv rank-00000 result.txt
...@@ -31,3 +31,4 @@ paddle train \ ...@@ -31,3 +31,4 @@ paddle train \
--show_parameter_stats_period=100 \ --show_parameter_stats_period=100 \
--test_all_data_in_one_period=1 \ --test_all_data_in_one_period=1 \
2>&1 | tee 'train.log' 2>&1 | tee 'train.log'
paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
...@@ -22,3 +22,4 @@ paddle train \ ...@@ -22,3 +22,4 @@ paddle train \
--log_period=100 \ --log_period=100 \
--dot_period=1 \ --dot_period=1 \
--num_passes=50 2>&1 | tee 'log.txt' --num_passes=50 2>&1 | tee 'log.txt'
paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1
...@@ -38,3 +38,4 @@ paddle train \ ...@@ -38,3 +38,4 @@ paddle train \
--config_args=is_test=1 \ --config_args=is_test=1 \
--test_all_data_in_one_period=1 \ --test_all_data_in_one_period=1 \
2>&1 | tee 'test.log' 2>&1 | tee 'test.log'
paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1
...@@ -27,3 +27,4 @@ paddle train \ ...@@ -27,3 +27,4 @@ paddle train \
--load_missing_parameter_strategy=rand \ --load_missing_parameter_strategy=rand \
--test_all_data_in_one_period=1 \ --test_all_data_in_one_period=1 \
2>&1 | tee 'train.log' 2>&1 | tee 'train.log'
paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1
...@@ -37,3 +37,4 @@ paddle train --config=$net_conf \ ...@@ -37,3 +37,4 @@ paddle train --config=$net_conf \
--trainer_count=4 \ --trainer_count=4 \
--config_args=is_test=1 \ --config_args=is_test=1 \
2>&1 | tee 'test.log' 2>&1 | tee 'test.log'
paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1
...@@ -27,3 +27,4 @@ paddle train --config=$config \ ...@@ -27,3 +27,4 @@ paddle train --config=$config \
--show_parameter_stats_period=100 \ --show_parameter_stats_period=100 \
--test_all_data_in_one_period=1 \ --test_all_data_in_one_period=1 \
2>&1 | tee 'train.log' 2>&1 | tee 'train.log'
paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1
...@@ -27,3 +27,4 @@ paddle train \ ...@@ -27,3 +27,4 @@ paddle train \
--log_period=10 \ --log_period=10 \
--dot_period=5 \ --dot_period=5 \
2>&1 | tee 'paraphrase/train.log' 2>&1 | tee 'paraphrase/train.log'
paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1
...@@ -24,3 +24,4 @@ paddle train \ ...@@ -24,3 +24,4 @@ paddle train \
--test_pass=12 \ --test_pass=12 \
--trainer_count=1 \ --trainer_count=1 \
2>&1 | tee 'translation/gen.log' 2>&1 | tee 'translation/gen.log'
paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1
...@@ -25,3 +25,4 @@ paddle train \ ...@@ -25,3 +25,4 @@ paddle train \
--log_period=10 \ --log_period=10 \
--dot_period=5 \ --dot_period=5 \
2>&1 | tee 'translation/train.log' 2>&1 | tee 'translation/train.log'
paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1
...@@ -7,4 +7,6 @@ paddle train \ ...@@ -7,4 +7,6 @@ paddle train \
--dot_period=10 \ --dot_period=10 \
--log_period=1000 \ --log_period=1000 \
--test_period=0 \ --test_period=0 \
--num_passes=10 --num_passes=10 \
2>&1 | tee 'train.log'
paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
...@@ -7,3 +7,5 @@ paddle train \ ...@@ -7,3 +7,5 @@ paddle train \
--log_period=10000 \ --log_period=10000 \
--test_period=0 \ --test_period=0 \
--num_passes=10 --num_passes=10
2>&1 | tee 'train_linear.log'
paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
...@@ -286,22 +286,3 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字 ...@@ -286,22 +286,3 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
.. code-block:: bash .. code-block:: bash
paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
---------------------------------------------------
目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
.. code-block:: bash
git clone --recursive https://github.com/PaddlePaddle/Paddle.git
来获取所有源码。对于已经clone的git版本库,可以在Paddle的源码目录中执行\:
.. code-block:: bash
git submodule init
git submodule update
来获得所有第三方模块。
...@@ -11,32 +11,21 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle ...@@ -11,32 +11,21 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle
```bash ```bash
git clone https://github.com/PaddlePaddle/Paddle paddle git clone https://github.com/PaddlePaddle/Paddle paddle
cd paddle cd paddle
git submodule update --init --recursive
``` ```
If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last command of the code block above in your PaddlePaddle home directory to initialize your submodule folder.
If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
```
git submodule update --remote
```
## <span id="requirements">Requirements</span> ## <span id="requirements">Requirements</span>
To compile the source code, your computer must be equipped with the following dependencies. To compile the source code, your computer must be equipped with the following dependencies.
- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) - **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
- **CMake**: version >= 2.8 - **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X)
- **BLAS**: MKL, OpenBlas or ATLAS - **BLAS**: MKL, OpenBlas or ATLAS
- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported**
- **Python**: only python 2.7 is supported currently
**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
For CUDA 8.0, GCC versions later than 5.3 are not supported! For CUDA 8.0, GCC versions later than 5.3 are not supported!
### Options ### Options
PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. PaddlePaddle supports some build options.
<html> <html>
<table> <table>
...@@ -47,12 +36,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal ...@@ -47,12 +36,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
<tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr> <tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr> <tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
<tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr> <tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
<tr><td class="left">WITH_DOC</td><td class="left"> Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr> <tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr> <tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr> <tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
<tr><td class="left">ON_COVERALLS</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
</tbody> </tbody>
</table> </table>
</html> </html>
...@@ -64,18 +62,15 @@ PaddlePaddle supports some build options. To enable it, first you need to instal ...@@ -64,18 +62,15 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
As a simple example, consider the following: As a simple example, consider the following:
1. **Python Dependencies(optional)** 1. **BLAS Dependencies(optional)**
To compile PaddlePaddle with python predict API, make sure swig is installed and set `-DWITH_SWIG_PY=ON` as follows: Paddle will find BLAS in the system's default path, but you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
```bash ```bash
# install swig on ubuntu # specify MKL
sudo apt-get install swig cmake .. -DMKL_ROOT=<mkl_path>
# install swig on Mac OS X # or specify OpenBLAS
brew install swig cmake .. -DOPENBLAS_ROOT=<openblas_path>
# active swig in cmake
cmake .. -DWITH_SWIG_PY=ON
``` ```
2. **Doc Dependencies(optional)** 2. **Doc Dependencies(optional)**
...@@ -104,17 +99,9 @@ As a simple example, consider the following: ...@@ -104,17 +99,9 @@ As a simple example, consider the following:
```bash ```bash
# necessary # necessary
sudo apt-get update sudo apt-get update
sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
# optional sudo pip install wheel numpy
sudo apt-get install libgoogle-glog-dev sudo pip install 'protobuf>=3.0.0'
sudo apt-get install libgflags-dev
sudo apt-get install libgtest-dev
sudo pip install wheel
pushd /usr/src/gtest
cmake .
make
sudo cp *.a /usr/lib
popd
``` ```
- **GPU Dependencies (optional)** - **GPU Dependencies (optional)**
...@@ -149,51 +136,17 @@ As usual, the best option is to create build folder under paddle project directo ...@@ -149,51 +136,17 @@ As usual, the best option is to create build folder under paddle project directo
```bash ```bash
mkdir build && cd build mkdir build && cd build
cmake ..
``` ```
CMake first check PaddlePaddle's dependencies in system default path. After installing some optional Finally, you can build and install PaddlePaddle:
libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
If still not found, you can manually set it based on CMake error information from your screen.
As a simple example, consider the following:
- **Only CPU with swig**
```bash
cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
```
- **GPU with swig**
```bash
cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
```
- **GPU with doc and swig**
```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
```
Finally, you can build PaddlePaddle:
```bash ```bash
# you can add build option here, such as: # you can add build option here, such as:
cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install, if you want to install PaddlePaddle into the system # please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc # set PaddlePaddle installation path in ~/.bashrc
export PATH=<path to install>/bin:$PATH export PATH=<path to install>/bin:$PATH
``` # install PaddlePaddle Python modules.
If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
Otherwise, PaddlePaddle will automatically install python dependencies
the first time a user runs paddle commands, such as `paddle version` or `paddle train`.
It may require sudo privileges:
```bash
# you can run
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
# or just run
sudo paddle version
``` ```
...@@ -16,23 +16,13 @@ Developers can work on PaddlePaddle using Docker. This allows ...@@ -16,23 +16,13 @@ Developers can work on PaddlePaddle using Docker. This allows
developers to work on different platforms -- Linux, Mac OS X, and developers to work on different platforms -- Linux, Mac OS X, and
Windows -- in a consistent way. Windows -- in a consistent way.
The general development workflow with Docker and Bazel is as follows: The general development workflow with Docker and CMake is as follows:
1. Get the source code of Paddle: 1. Get the source code of Paddle:
.. code-block:: bash .. code-block:: bash
git clone --recursive https://github.com/PaddlePaddle/Paddle.git git clone https://github.com/PaddlePaddle/Paddle.git
Here **git clone --recursive is required** as we have a submodule `warp-ctc <https://github.com/baidu-research/warp-ctc>`_.
If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
empty, please use the following command to get the submodule.
.. code-block:: bash
git submodule update --init --recursive
2. Build a development Docker image :code:`paddle:dev` from the source 2. Build a development Docker image :code:`paddle:dev` from the source
...@@ -162,7 +152,6 @@ source code: ...@@ -162,7 +152,6 @@ source code:
cd ~ cd ~
git clone https://github.com/PaddlePaddle/Paddle.git git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle cd Paddle
git submodule update --init --recursive
docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
......
...@@ -33,7 +33,6 @@ cd Paddle ...@@ -33,7 +33,6 @@ cd Paddle
git checkout -b develop # 创建 develop 分支 git checkout -b develop # 创建 develop 分支
git remote add upstream https://github.com/PaddlePaddle/Paddle.git # 添加 upstream 到 baidu/Paddle git remote add upstream https://github.com/PaddlePaddle/Paddle.git # 添加 upstream 到 baidu/Paddle
git pull upstream develop # 更新 upstream git pull upstream develop # 更新 upstream
git submodule update --init --recursive
``` ```
然后你可以通过做一个本地开发分支开始开发 然后你可以通过做一个本地开发分支开始开发
......
...@@ -38,7 +38,6 @@ cd Paddle ...@@ -38,7 +38,6 @@ cd Paddle
git checkout -b develop # create develop branch. git checkout -b develop # create develop branch.
git remote add upstream https://github.com/PaddlePaddle/Paddle.git # add upstream to baidu/Paddle git remote add upstream https://github.com/PaddlePaddle/Paddle.git # add upstream to baidu/Paddle
git pull upstream develop # update to upstream git pull upstream develop # update to upstream
git submodule update --init --recursive
``` ```
Then you can start to develop by making a local development branch Then you can start to develop by making a local development branch
......
...@@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@" ...@@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
GFLAGS_LOCATION="@GFLAGS_LOCATION@" GFLAGS_LOCATION="@GFLAGS_LOCATION@"
CBLAS_LIBRARIES="@CBLAS_LIBRARIES@" CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@" CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
WITH_COVERALLS="@ON_COVERALLS@" WITH_COVERALLS="@ON_COVERALLS@"
...@@ -15,7 +15,6 @@ else() ...@@ -15,7 +15,6 @@ else()
endif() endif()
set(CUDA_CXX_WITH_GPU_SOURCES set(CUDA_CXX_WITH_GPU_SOURCES
src/hl_cudart_wrap.cc
src/hl_cuda_cublas.cc src/hl_cuda_cublas.cc
src/hl_cuda_cudnn.cc src/hl_cuda_cudnn.cc
src/hl_cuda_device.cc) src/hl_cuda_device.cc)
......
...@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle); ...@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle);
*/ */
void GetCudnnDsoHandle(void** dso_handle); void GetCudnnDsoHandle(void** dso_handle);
/**
* @brief load the DSO of CUDA Run Time
*
* @param **dso_handle dso handler
*
*/
void GetCudartDsoHandle(void** dso_handle);
/** /**
* @brief load the DSO of CURAND * @brief load the DSO of CURAND
* *
......
...@@ -22,10 +22,9 @@ limitations under the License. */ ...@@ -22,10 +22,9 @@ limitations under the License. */
#include <sys/time.h> #include <sys/time.h>
#include <unistd.h> #include <unistd.h>
#include <mutex> #include <mutex>
#include "hl_cuda.h"
#include "hl_cuda.ph" #include "hl_cuda.ph"
#include "hl_dso_loader.h"
#include "hl_thread.ph" #include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
// clang-format on // clang-format on
...@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) ...@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef CURAND_RAND_ROUTINE_EACH #undef CURAND_RAND_ROUTINE_EACH
#undef DYNAMIC_LOAD_CURAND_WRAP #undef DYNAMIC_LOAD_CURAND_WRAP
std::once_flag cudart_dso_flag;
void *cudart_dso_handle = nullptr;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cuda routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudart_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
void *p_##__name = dlsym(cudart_dso_handle, #__name); \
return reinterpret_cast<cudart_func>(p_##__name)(args...); \
} \
} __name; /* struct DynLoad__##__name */
#else
#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
return __name(args...); \
} \
} __name; /* struct DynLoad__##__name */
#endif
/* include all needed cuda functions in HPPL */
// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaMalloc) \
__macro(cudaHostAlloc) \
__macro(cudaFree) \
__macro(cudaFreeHost) \
__macro(cudaMemcpy) \
__macro(cudaMemset) \
__macro(cudaMemcpyAsync) \
__macro(cudaSetDevice) \
__macro(cudaGetDevice) \
__macro(cudaGetDeviceCount) \
__macro(cudaGetDeviceProperties) \
__macro(cudaDeviceSynchronize) \
__macro(cudaDeviceCanAccessPeer) \
__macro(cudaDeviceEnablePeerAccess) \
__macro(cudaStreamCreate) \
__macro(cudaStreamDestroy) \
__macro(cudaStreamSynchronize) \
__macro(cudaStreamWaitEvent) \
__macro(cudaEventCreate) \
__macro(cudaEventRecord) \
__macro(cudaEventQuery) \
__macro(cudaEventDestroy) \
__macro(cudaEventSynchronize) \
__macro(cudaEventElapsedTime) \
__macro(cudaSetDeviceFlags) \
__macro(cudaGetLastError) \
__macro(cudaFuncSetCacheConfig) \
__macro(cudaRuntimeGetVersion) \
__macro(cudaGetErrorString) \
__macro(cudaProfilerStart) \
__macro(cudaProfilerStop)
// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#undef CUDA_ROUNTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP
} /* namespace dynload */ } /* namespace dynload */
/** /**
...@@ -175,7 +102,7 @@ int g_cuda_lib_version = 0; ...@@ -175,7 +102,7 @@ int g_cuda_lib_version = 0;
do { \ do { \
cudaError_t cudaStat = cudaFunc; \ cudaError_t cudaStat = cudaFunc; \
CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
<< dynload::cudaGetErrorString(cudaStat); \ << cudaGetErrorString(cudaStat); \
} while (0) } while (0)
/** /**
...@@ -284,13 +211,13 @@ void hl_fini() { ...@@ -284,13 +211,13 @@ void hl_fini() {
tmp_stream = (char *)t_device[dev]->stream; tmp_stream = (char *)t_device[dev]->stream;
} }
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j])); CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
} }
/* free device memory */ /* free device memory */
hl_free_mem_device(t_device[dev]->gpu_mem); hl_free_mem_device(t_device[dev]->gpu_mem);
hl_free_mem_host(t_device[dev]->cpu_mem); hl_free_mem_host(t_device[dev]->cpu_mem);
CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event)); CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
} }
free(tmp); free(tmp);
...@@ -308,7 +235,7 @@ void hl_set_device(int device) { ...@@ -308,7 +235,7 @@ void hl_set_device(int device) {
CHECK(device >= 0 && device < g_system_device_num && g_device[device]) CHECK(device >= 0 && device < g_system_device_num && g_device[device])
<< "Device: " << device << " is not specified in startup."; << "Device: " << device << " is not specified in startup.";
CHECK_CUDA(dynload::cudaSetDevice(device)); CHECK_CUDA(cudaSetDevice(device));
/* switch thread stream */ /* switch thread stream */
for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) { for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
...@@ -336,7 +263,7 @@ void hl_set_device(int device) { ...@@ -336,7 +263,7 @@ void hl_set_device(int device) {
int hl_get_device() { int hl_get_device() {
int device; int device;
CHECK_CUDA(dynload::cudaGetDevice(&device)); CHECK_CUDA(cudaGetDevice(&device));
return device; return device;
} }
...@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) { ...@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) {
void *dest_d; void *dest_d;
CHECK(size) << __func__ << ": the size for device memory is 0, please check."; CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size)); CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
return dest_d; return dest_d;
} }
...@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) { ...@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) {
void hl_free_mem_device(void *dest_d) { void hl_free_mem_device(void *dest_d) {
CHECK_NOTNULL(dest_d); CHECK_NOTNULL(dest_d);
cudaError_t err = dynload::cudaFree(dest_d); cudaError_t err = cudaFree(dest_d);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
<< hl_get_device_error_string(); << hl_get_device_error_string();
} }
...@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) { ...@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) {
void *dest_h; void *dest_h;
CHECK(size) << __func__ << ": the size for device memory is 0, please check."; CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
CHECK_CUDA( CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
return dest_h; return dest_h;
} }
...@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) { ...@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) {
void hl_free_mem_host(void *dest_h) { void hl_free_mem_host(void *dest_h) {
CHECK_NOTNULL(dest_h); CHECK_NOTNULL(dest_h);
cudaError_t err = dynload::cudaFreeHost(dest_h); cudaError_t err = cudaFreeHost(dest_h);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
<< hl_get_device_error_string(); << hl_get_device_error_string();
} }
...@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) { ...@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
} }
CHECK_NOTNULL(dst); CHECK_NOTNULL(dst);
CHECK_NOTNULL(src); CHECK_NOTNULL(src);
CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault)); CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
} }
void hl_memset_device(void *dest_d, int value, size_t size) { void hl_memset_device(void *dest_d, int value, size_t size) {
CHECK_CUDA(dynload::cudaMemset(dest_d, value, size)); CHECK_CUDA(cudaMemset(dest_d, value, size));
} }
void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
...@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { ...@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
} }
CHECK_NOTNULL(src_h); CHECK_NOTNULL(src_h);
CHECK_NOTNULL(dest_d); CHECK_NOTNULL(dest_d);
CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
} }
void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
...@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { ...@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
} }
CHECK_NOTNULL(dest_h); CHECK_NOTNULL(dest_h);
CHECK_NOTNULL(src_d); CHECK_NOTNULL(src_d);
CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
} }
void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
...@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { ...@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
} }
CHECK_NOTNULL(dest_d); CHECK_NOTNULL(dest_d);
CHECK_NOTNULL(src_d); CHECK_NOTNULL(src_d);
CHECK_CUDA( CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
} }
void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
...@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { ...@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
CHECK_LT(stream, HPPL_STREAM_END); CHECK_LT(stream, HPPL_STREAM_END);
cu_stream = t_resource.stream[stream]; cu_stream = t_resource.stream[stream];
CHECK_CUDA( CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
} }
void hl_start() { void hl_start() {
...@@ -439,8 +363,7 @@ void hl_start() { ...@@ -439,8 +363,7 @@ void hl_start() {
bool hl_device_can_access_peer(int device, int peerDevice) { bool hl_device_can_access_peer(int device, int peerDevice) {
int canAccessPeer; int canAccessPeer;
CHECK_CUDA( CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
if (canAccessPeer == 1) { if (canAccessPeer == 1) {
return true; return true;
...@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) { ...@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
} }
void hl_device_enable_peer_access(int peerDevice) { void hl_device_enable_peer_access(int peerDevice) {
cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0); cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
if (cudaErrorPeerAccessAlreadyEnabled == err) { if (cudaErrorPeerAccessAlreadyEnabled == err) {
dynload::cudaGetLastError(); cudaGetLastError();
} else { } else {
CHECK_CUDA(err); CHECK_CUDA(err);
} }
...@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) { ...@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
int device = device_prop->device; int device = device_prop->device;
global_device_resources device_res = device_prop->device_resources; global_device_resources device_res = device_prop->device_resources;
CHECK_CUDA(dynload::cudaSetDevice(device)); CHECK_CUDA(cudaSetDevice(device));
/* device properties */ /* device properties */
CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device)); CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
device_prop->major = cu_prop.major; device_prop->major = cu_prop.major;
device_prop->minor = cu_prop.minor; device_prop->minor = cu_prop.minor;
...@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) { ...@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
/* create device stream */ /* create device stream */
for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) { for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j])); CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
} }
/* cublas init */ /* cublas init */
...@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) { ...@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t))); device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
pthread_mutex_init(device_res->gen_mutex, NULL); pthread_mutex_init(device_res->gen_mutex, NULL);
CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version)); CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
} }
int hl_get_cuda_version() { return g_cuda_lib_version; } int hl_get_cuda_version() { return g_cuda_lib_version; }
void hl_create_thread_resources(int device, void hl_create_thread_resources(int device,
thread_device_resources device_res) { thread_device_resources device_res) {
CHECK_CUDA(dynload::cudaSetDevice(device)); CHECK_CUDA(cudaSetDevice(device));
/* create thread stream */ /* create thread stream */
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j])); CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
} }
/* allocation device memory */ /* allocation device memory */
...@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device, ...@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device,
/* allocation host memory */ /* allocation host memory */
device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event)); CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
} }
void hl_specify_devices_start(int *device, int number) { void hl_specify_devices_start(int *device, int number) {
if (hl_start_flag) return; if (hl_start_flag) return;
/* 1. get the number of devices */ /* 1. get the number of devices */
CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num)); CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device"; CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
if (device == NULL) { if (device == NULL) {
number = g_system_device_num; number = g_system_device_num;
...@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) { ...@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
<< ": the parameter stream is error."; << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream]; cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream)); CHECK_CUDA(cudaStreamSynchronize(cu_stream));
} }
void hl_create_event(hl_event_t *event) { void hl_create_event(hl_event_t *event) {
...@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) { ...@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) {
struct _hl_event_st *st_event = struct _hl_event_st *st_event =
(struct _hl_event_st *)malloc(sizeof(struct _hl_event_st)); (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event)); CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
*event = st_event; *event = st_event;
} }
...@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) { ...@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
CHECK_NOTNULL(start); CHECK_NOTNULL(start);
CHECK_NOTNULL(end); CHECK_NOTNULL(end);
CHECK_CUDA( CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
return time; return time;
} }
...@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { ...@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
<< ": the parameter stream is error."; << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream]; cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream)); CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
} }
void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
...@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { ...@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
<< ": the parameter stream is error."; << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream]; cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
} }
void hl_destroy_event(hl_event_t event) { void hl_destroy_event(hl_event_t event) {
CHECK_NOTNULL(event); CHECK_NOTNULL(event);
CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event)); CHECK_CUDA(cudaEventDestroy(event->cu_event));
free(event); free(event);
event = NULL; event = NULL;
...@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) { ...@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) {
void hl_event_synchronize(hl_event_t event) { void hl_event_synchronize(hl_event_t event) {
CHECK_NOTNULL(event); CHECK_NOTNULL(event);
CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event)); CHECK_CUDA(cudaEventSynchronize(event->cu_event));
} }
void hl_get_device_name(char *name, int len, int device) { void hl_get_device_name(char *name, int len, int device) {
...@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) { ...@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
*minor = g_device[device]->minor; *minor = g_device[device]->minor;
} }
int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); } int hl_get_device_last_error() { return (int)cudaGetLastError(); }
const char *hl_get_device_error_string() { const char *hl_get_device_error_string() {
cudaError_t err = dynload::cudaGetLastError(); cudaError_t err = cudaGetLastError();
return dynload::cudaGetErrorString(err); return cudaGetErrorString(err);
} }
const char *hl_get_device_error_string(size_t err) { const char *hl_get_device_error_string(size_t err) {
return dynload::cudaGetErrorString((cudaError_t)err); return cudaGetErrorString((cudaError_t)err);
} }
void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); } void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
void hl_set_device_flags_block() { void hl_set_device_flags_block() {
CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
} }
bool hl_cuda_event_is_ready(hl_event_t event) { bool hl_cuda_event_is_ready(hl_event_t event) {
cudaError_t err = dynload::cudaEventQuery(event->cu_event); cudaError_t err = cudaEventQuery(event->cu_event);
CHECK(cudaSuccess == err || cudaErrorNotReady == err); CHECK(cudaSuccess == err || cudaErrorNotReady == err);
if (cudaErrorNotReady == err) { if (cudaErrorNotReady == err) {
...@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) { ...@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
return true; return true;
} }
void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); } void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); } void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_USE_DSO
#include <cuda_runtime.h>
#include <mutex>
#include "hl_dso_loader.h"
/**
* cudart wrapper: for dynamically loading libcudart.so.
* When nvcc compiles cuda kernels, it inserts
* some built-in runtime routines, which must be
* provided by us if PADDLE_USE_DSO is true. If
* PADDLE_USE_DSO is false, all of them must be
* ignored to avoid multiple definitions.
*/
namespace dynload {

extern std::once_flag cudart_dso_flag;
extern void *cudart_dso_handle;

/**
 * The following macro generates, for each listed cudart routine, a functor
 * struct and a global instance named after the routine. Calling the instance
 * opens the cudart DSO once (guarded by cudart_dso_flag), looks the symbol up
 * with dlsym and forwards the arguments to it.
 *
 * Note: the result of dlsym is not checked here, so a missing symbol leads to
 * a call through a null pointer.
 **/
#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                               \
  struct DynLoad__##__name {                                                   \
    template <typename... Args>                                                \
    __type operator()(Args... args) {                                          \
      typedef __type (*cudartFunc)(Args...);                                   \
      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
      return reinterpret_cast<cudartFunc>(p_##__name)(args...);                \
    }                                                                          \
  } __name; /* struct DynLoad__##__name */

/* include all needed cuda functions in HPPL */
// clang-format off
#define CUDA_ROUTINE_EACH(__macro)          \
  __macro(cudaLaunch, cudaError_t)          \
  __macro(cudaSetupArgument, cudaError_t)   \
  __macro(cudaConfigureCall, cudaError_t)   \
  __macro(__cudaRegisterFatBinary, void**)  \
  __macro(__cudaUnregisterFatBinary, void)  \
  __macro(__cudaRegisterFunction, void)     \
  __macro(__cudaRegisterVar, void)          \
  __macro(__cudaRegisterManagedVar, void)   \
  __macro(__cudaInitModule, char)           \
  __macro(__cudaRegisterTexture, void)      \
  __macro(__cudaRegisterSurface, void)
// clang-format on

CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)

#if CUDART_VERSION >= 7000
DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
#endif

/* The list macro is only needed for the expansion above. The name here must
 * match the #define exactly, otherwise the macro is never undefined. */
#undef CUDA_ROUTINE_EACH

} /* namespace dynload */
#if CUDART_VERSION >= 7000
/**
 * Definition of cudaLaunchKernel provided by this file when PADDLE_USE_DSO is
 * defined. Only compiled for CUDART_VERSION >= 7000. All arguments are
 * forwarded unchanged to dynload::cudaLaunchKernel, which resolves the symbol
 * of the same name from the dynamically loaded cudart library.
 */
__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
dim3 gridDim,
dim3 blockDim,
void **args,
size_t sharedMem,
cudaStream_t stream) {
return dynload::cudaLaunchKernel(
func, gridDim, blockDim, args, sharedMem, stream);
}
#endif /* CUDART_VERSION >= 7000 */
/**
 * Definition of cudaLaunch that forwards func to dynload::cudaLaunch and
 * returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
return dynload::cudaLaunch(func);
}
/**
 * Definition of cudaSetupArgument that forwards (arg, size, offset) to
 * dynload::cudaSetupArgument and returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
size_t size,
size_t offset) {
return dynload::cudaSetupArgument(arg, size, offset);
}
/**
 * Definition of cudaConfigureCall that forwards the launch configuration
 * (gridDim, blockDim, sharedMem, stream) to dynload::cudaConfigureCall and
 * returns its result.
 */
__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
dim3 blockDim,
size_t sharedMem,
cudaStream_t stream) {
return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
}
/*
 * C-linkage definitions of the __cuda* registration routines. Each one
 * forwards its arguments unchanged to the dynload:: functor of the same name,
 * which looks the real symbol up in the dynamically loaded cudart library.
 */
extern "C" {
/* Forwards fatCubin to dynload::__cudaRegisterFatBinary; returns its handle. */
void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
return dynload::__cudaRegisterFatBinary(fatCubin);
}
/* Forwards fatCubinHandle to dynload::__cudaUnregisterFatBinary. */
void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
}
/* Forwards all ten arguments to dynload::__cudaRegisterFunction. */
void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
const char *hostFun,
char *deviceFun,
const char *deviceName,
int thread_limit,
uint3 *tid,
uint3 *bid,
dim3 *bDim,
dim3 *gDim,
int *wSize) {
return dynload::__cudaRegisterFunction(fatCubinHandle,
hostFun,
deviceFun,
deviceName,
thread_limit,
tid,
bid,
bDim,
gDim,
wSize);
}
/* Forwards all eight arguments to dynload::__cudaRegisterVar. */
void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
char *hostVar,
char *deviceAddress,
const char *deviceName,
int ext,
int size,
int constant,
int global) {
return dynload::__cudaRegisterVar(fatCubinHandle,
hostVar,
deviceAddress,
deviceName,
ext,
size,
constant,
global);
}
/* Forwards all eight arguments to dynload::__cudaRegisterManagedVar. */
extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
void **hostVarPtrAddress,
char *deviceAddress,
const char *deviceName,
int ext,
int size,
int constant,
int global) {
return dynload::__cudaRegisterManagedVar(fatCubinHandle,
hostVarPtrAddress,
deviceAddress,
deviceName,
ext,
size,
constant,
global);
}
/* Forwards fatCubinHandle to dynload::__cudaInitModule; returns its result. */
char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
return dynload::__cudaInitModule(fatCubinHandle);
}
/* Forwards all seven arguments to dynload::__cudaRegisterTexture. */
void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
const struct textureReference *hostVar,
const void **deviceAddress,
const char *deviceName,
int dim,
int norm,
int ext) {
return dynload::__cudaRegisterTexture(
fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
}
/* Forwards all six arguments to dynload::__cudaRegisterSurface. */
void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
const struct surfaceReference *hostVar,
const void **deviceAddress,
const char *deviceName,
int dim,
int ext) {
return dynload::__cudaRegisterSurface(
fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
}
} /* extern "C" */
#endif
...@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir, ...@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir,
DEFINE_string(cuda_dir, DEFINE_string(cuda_dir,
"", "",
"Specify path for loading cuda library, such as libcublas, " "Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. (Note: " "libcurand. For instance, /usr/local/cuda/lib64. If default, "
"libcudart can not be specified by cuda_dir, since some " "dlopen will search cuda from LD_LIBRARY_PATH");
"build-in function in cudart already ran before main entry). "
"If default, dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
...@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) { ...@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
#endif #endif
} }
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
#else
GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
#endif
}
void GetCurandDsoHandle(void** dso_handle) { void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
......
...@@ -24,13 +24,15 @@ set(PSERVER_SOURCES ...@@ -24,13 +24,15 @@ set(PSERVER_SOURCES
BaseClient.cpp BaseClient.cpp
ParameterClient2.cpp ParameterClient2.cpp
ParameterServer2.cpp ParameterServer2.cpp
SparseParameterDistribution.cpp) SparseParameterDistribution.cpp
ParameterServerController.cpp)
set(PSERVER_HEADERS set(PSERVER_HEADERS
BaseClient.h BaseClient.h
ParameterClient2.h ParameterClient2.h
ParameterServer2.h ParameterServer2.h
SparseParameterDistribution.h) SparseParameterDistribution.h
ParameterServerController.h)
add_library(paddle_pserver STATIC add_library(paddle_pserver STATIC
${PSERVER_SOURCES}) ${PSERVER_SOURCES})
......
...@@ -13,66 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,66 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <fstream> #include <fstream>
#include "paddle/utils/StringUtil.h" #include "ParameterServerController.h"
#include "paddle/utils/Util.h"
#include "ParameterServer2.h"
#include "RDMANetwork.h"
#include "paddle/utils/Flags.h"
using namespace paddle; // NOLINT using namespace paddle; // NOLINT
int main(int argc, char** argv) { int main(int argc, char** argv) {
initMain(argc, argv); initMain(argc, argv);
std::vector<std::string> devices; std::unique_ptr<ParameterServerController> parameterServerPtr(
std::vector<std::shared_ptr<ParameterServer2>> pservers; paddle::ParameterServerController::createFromGflags());
parameterServerPtr->start();
// round robin to loadbalance RDMA server ENGINE parameterServerPtr->wait();
int rdmaCpu = 0;
int onlineCpus = rdma::numCpus();
int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
if (FLAGS_nics.empty()) {
pservers.resize(numPorts);
for (int i = 0; i < numPorts; ++i) {
if (FLAGS_rdma_tcp == "rdma") {
pservers[i].reset(
new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
rdmaCpu = rdmaCpu % onlineCpus;
} else {
pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
}
CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
<< FLAGS_port + i;
LOG(INFO) << "pserver started : " << FLAGS_port + i;
pservers[i]->start();
}
} else {
str::split(FLAGS_nics, ',', &devices);
pservers.resize(devices.size() * numPorts);
for (int i = 0; i < numPorts; ++i) {
for (size_t j = 0; j < devices.size(); ++j) {
if (FLAGS_rdma_tcp == "rdma") {
pservers[i * devices.size() + j].reset(new ParameterServer2(
getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
rdmaCpu = rdmaCpu % onlineCpus;
} else {
pservers[i * devices.size() + j].reset(
new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
}
CHECK(pservers[i * devices.size() + j]->init())
<< "Fail to initialize parameter server" << devices[j]
<< FLAGS_port + i;
LOG(INFO) << "pserver started : " << devices[j] << ":"
<< FLAGS_port + i;
pservers[i * devices.size() + j]->start();
}
}
}
for (auto& pserver : pservers) {
pserver->join();
}
return 0; return 0;
} }
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ParameterServerController.h"
namespace paddle {
/**
 * Creates and initializes the ParameterServer2 instances described by config.
 *
 * The number of ports is ports_num + ports_num_for_sparse. When config.nics()
 * is empty, one server is created per port with an empty address. Otherwise
 * nics is split on ',' and one server is created per (port, device) pair,
 * stored at index port_index * devices.size() + device_index, using the
 * address returned by getIpAddr(device).
 *
 * When rdma_tcp is "rdma", each server is additionally given a CPU index that
 * is assigned round robin over rdma::numCpus().
 *
 * Every server is init()-ed here; a failed init aborts via CHECK. Servers are
 * not started by the constructor.
 */
ParameterServerController::ParameterServerController(
    const ParameterServerConfig& config) {
  // round robin to load balance RDMA server ENGINE
  int rdmaCpu = 0;
  const int onlineCpus = rdma::numCpus();
  const int numPorts = config.ports_num() + config.ports_num_for_sparse();
  const bool useRdma = config.rdma_tcp() == "rdma";

  // Builds one server for (addr, port); in RDMA mode it also consumes the
  // next CPU index and wraps the counter around onlineCpus.
  auto createServer = [&](const std::string& addr,
                          int port) -> std::unique_ptr<ParameterServer2> {
    if (!useRdma) {
      return std::unique_ptr<ParameterServer2>(
          new ParameterServer2(addr, port));
    }
    std::unique_ptr<ParameterServer2> server(
        new ParameterServer2(addr, port, rdmaCpu++));
    rdmaCpu = rdmaCpu % onlineCpus;
    return server;
  };

  if (config.nics().empty()) {
    parameterServers_.resize(numPorts);
    for (int i = 0; i < numPorts; ++i) {
      const int port = config.port() + i;
      parameterServers_[i] = createServer(std::string(), port);
      CHECK(parameterServers_[i]->init())
          << "Fail to initialize parameter server on port " << port;
    }
    return;
  }

  std::vector<std::string> devices;
  str::split(config.nics(), ',', &devices);
  parameterServers_.resize(devices.size() * numPorts);
  for (int i = 0; i < numPorts; ++i) {
    const int port = config.port() + i;
    for (size_t j = 0; j < devices.size(); ++j) {
      const size_t index = i * devices.size() + j;
      parameterServers_[index] = createServer(getIpAddr(devices[j]), port);
      // Separate device and port with ':' so the message is readable
      // (previously the two were printed back to back, e.g. "eth020134").
      CHECK(parameterServers_[index]->init())
          << "Fail to initialize parameter server with device " << devices[j]
          << ":" << port;
    }
  }
}
/** Joins every managed ParameterServer2 (via wait()) before destruction. */
ParameterServerController::~ParameterServerController() {
  wait();
}
/**
 * Copies the nics, rdma_tcp, port, ports_num and ports_num_for_sparse gflags
 * into a ParameterServerConfig and builds a controller from it via create().
 */
ParameterServerController* ParameterServerController::createFromGflags() {
  ParameterServerConfig gflagsConfig;
  gflagsConfig.set_port(FLAGS_port);
  gflagsConfig.set_ports_num(FLAGS_ports_num);
  gflagsConfig.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse);
  gflagsConfig.set_nics(FLAGS_nics);
  gflagsConfig.set_rdma_tcp(FLAGS_rdma_tcp);
  return ParameterServerController::create(gflagsConfig);
}
/**
 * Heap-allocates a controller for config and returns the raw pointer; the
 * object is created with new, so the caller is responsible for deleting it.
 */
ParameterServerController* ParameterServerController::create(
    const ParameterServerConfig& config) {
  ParameterServerController* controller = new ParameterServerController(config);
  return controller;
}
/** Logs the instance count, then calls start() on each server in order. */
void ParameterServerController::start() {
  const size_t serverCount = parameterServers_.size();
  LOG(INFO) << "number of parameterServer instances: " << serverCount;
  for (size_t idx = 0; idx < serverCount; ++idx) {
    LOG(INFO) << "Starting parameterServer[" << idx << "]";
    parameterServers_[idx]->start();
  }
}
void ParameterServerController::wait() {
int i = 0;
for (const auto& parameterServer : parameterServers_) {
LOG(INFO) << "Waiting parameterServer[" << i << "]";
parameterServer->join();
i++;
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "ParameterServer2.h"
#include "ParameterServerConfig.pb.h"
#include "RDMANetwork.h"
#include "paddle/utils/StringUtil.h"
namespace paddle {
/**
* @brief ParameterServerController creates, initializes and manages multiple
* parameter server (ParameterServer2) instances. The number of instances is
* determined by the number of ports (ports_num + ports_num_for_sparse) and,
* when network devices are configured, by the number of those devices. The
* configuration comes either from gflags or from a ParameterServerConfig
* proto.
*/
class ParameterServerController final {
public:
DISABLE_COPY(ParameterServerController);
/**
* @brief Ctor. Creates and init()s all ParameterServer2 instances described
* by the given ParameterServerConfig; it does not start them.
*/
explicit ParameterServerController(const ParameterServerConfig& config);
/**
* @brief Dtor. Calls wait(), i.e. joins all ParameterServer2 instances.
*/
~ParameterServerController();
/**
* @brief Create a ParameterServerController from gflags. This is kept for
* compatibility with the old gflags-based configuration. The returned
* object is allocated with new.
*/
static ParameterServerController* createFromGflags();
/**
* @brief Create a ParameterServerController from a ParameterServerConfig,
* so that the parameter server no longer depends on gflags. All
* ParameterServer2 instances are initialized according to the config. The
* returned object is allocated with new.
*/
static ParameterServerController* create(const ParameterServerConfig& config);
/**
* @brief Start all ParameterServer2 instances owned by this
* ParameterServerController.
*/
void start();
/**
* @brief Join all ParameterServer2 instances owned by this
* ParameterServerController, in order.
*/
void wait();
private:
// One entry per created ParameterServer2 instance.
std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
};
} // namespace paddle
...@@ -2,8 +2,16 @@ configure_file(submit_local.sh.in ...@@ -2,8 +2,16 @@ configure_file(submit_local.sh.in
submit_local.sh submit_local.sh
@ONLY) @ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
RENAME paddle) RENAME paddle)
# Copy the usage-statistics script into the build tree (substituting only
# @VAR@ references) and install it as an executable named paddle_usage.
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/tools/usage_stat/usage.sh"
  "${CMAKE_CURRENT_BINARY_DIR}/usage.sh"
  @ONLY)

install(
  FILES "${CMAKE_CURRENT_BINARY_DIR}/usage.sh"
  DESTINATION opt/paddle/bin
  PERMISSIONS
    OWNER_READ OWNER_WRITE OWNER_EXECUTE
    GROUP_READ GROUP_EXECUTE
    WORLD_READ WORLD_EXECUTE
  RENAME paddle_usage)
...@@ -122,6 +122,9 @@ case "$1" in ...@@ -122,6 +122,9 @@ case "$1" in
"make_diagram") "make_diagram")
python -m paddle.utils.make_model_diagram ${@:2} python -m paddle.utils.make_model_diagram ${@:2}
;; ;;
"usage")
$MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
;;
"version") "version")
version version
;; ;;
......
#!/bin/bash
# Normalize the command line with getopt(1); the result is re-read with
# `eval set --` before the option loop.
ARGPARSE=$(getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code: -- "$@")

# Sentinel stored as the user name when the user chooses to stay anonymous.
KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"

# paddle config home dir, same as paddle
PADDLE_CONF_HOME="$HOME/.config/paddle"

# api url, mirror url(s) will be append later
PD_URLS="http://api.paddlepaddle.org/version"
# Print the command-line help text to stdout.
usage()
{
    cat <<EOF
Usage: `basename $0` [options]
Options:
 -e, --exit-code=EXIT_CODE The train/predict process's exit code
 -l, --log-file=LOG_FILE_PATH Read which log file to get the duration of process
 -n, --task-name=TASK_NAME The name of demo or example
 -u, --git-user=GITHUB_USER provide contact info, like username or email
 -v, -i Verbose output and interact with user when necessary
 --help display this help message
EOF
}
# Replace the positional parameters with getopt's normalized form, then
# consume options one at a time until the "--" terminator.
eval set -- "${ARGPARSE}"
while true; do
    case "$1" in
        --)
            shift
            break
            ;;
        --help)
            usage
            exit 0
            ;;
        --dry-run)
            dry_run=1
            shift
            ;;
        -v|-i)
            v=1
            shift
            ;;
        -n|--task-name)
            task=$2
            shift 2
            ;;
        -u|--git-user)
            github_user=$2
            shift 2
            ;;
        -e|--exit-code)
            exit_code=$2
            shift 2
            ;;
        -l|--log-file)
            log_file=$2
            shift 2
            ;;
        *)
            echo "Invalid option $1"
            usage
            exit 1
            ;;
    esac
done
# Parse the log file to get the elapsed time in seconds.
# The awk program looks at the second field of every line; when it looks like
# HH:MM:SS (a ':' at position 3) it is converted to seconds. A timestamp more
# than 600 seconds earlier than the previous one is treated as a day rollover.
# The result is max - min of the seen timestamps, or -1 when the log file is
# missing or empty.
if [ -s "${log_file}" ]; then
    # The path is quoted so that log files whose names contain spaces or glob
    # characters are passed to awk as a single argument.
    duration=$(awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
    {if(index($2,":")==3){
        t=substr($2,1,8);
        sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
        if(sec<last_sec-600){day+=1;sec+=86400;}
        last_sec=sec;
        if(min_sec==0 || min_sec>sec){min_sec=sec;}
        if(max_sec==0 || max_sec<sec){max_sec=sec;}
    }}
    END{print max_sec-min_sec}' "${log_file}")
else
    duration=-1
fi
if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
# Try to find the user/email if it was not given on the command line.
# Lookup order: cached value, owner segment of a github https remote url
# (ignored when it is "PaddlePaddle"), then git's user.name.
# All paths under PADDLE_CONF_HOME are quoted so that a home directory
# containing spaces does not break the commands.
if [ -z "${github_user}" ]; then
    # search for cached username
    if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
        if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
        github_user=$(cat "${PADDLE_CONF_HOME}/github_user")
    else
        # search the github-user from git config
        if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
        git_username=$(git config --get user.name 2>/dev/null)
        git_url=$(git config --get remote.origin.url 2>/dev/null)
        if [ "$(printf '%s\n' "${git_url}" | cut -b 1-19)" = "https://github.com/" ]; then
            # under a git url, like https://github.com/user_xxx/proj_yyy.git
            if [ "${v}" = "1" ]; then echo " from github url..."; fi
            github_user=$(printf '%s\n' "${git_url}" | cut -d "/" -f 4)
            if [ "${github_user}" = "PaddlePaddle" ]; then
                github_user=
            fi
        fi
        if [ -n "${git_username}" ] && [ -z "${github_user}" ]; then
            if [ "${v}" = "1" ]; then echo " from global git username..."; fi
            github_user=${git_username}
        fi
    fi
fi
# allow user to set the user name, if it's not found
if [ -z "${github_user}" ] && [ "${v}" = "1" ]; then
    # -r keeps backslashes in the typed value instead of treating them as
    # escape characters.
    read -r -p "Please input your github username or email, or just return to keep this feedback anonymous:"
    github_user=${REPLY}
    if [ -z "${github_user}" ]; then
        # empty input, consider as one anonymous user
        github_user="${KEEP_ANONYMOUS}"
    fi
fi
if [ -n "${github_user}" ] && [ -z "${dry_run}" ]; then
    # valid user and not in dry-run mode, then save to cache
    mkdir -p "${PADDLE_CONF_HOME}"
    echo "${github_user}" >"${PADDLE_CONF_HOME}/github_user"
fi
if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
    # anonymous user should keep the var empty.
    github_user=
fi
# Local paddle version: second space-separated field, up to the first comma,
# of the first `paddle version` output line containing "PaddlePaddle".
paddle_version=$(paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1)
[ "${v}" = "1" ] && echo "version:${paddle_version}"

# Local system time, formatted as YYYYmmddHHMMSS.
system_time=$(date "+%Y%m%d%H%M%S")
[ "${v}" = "1" ] && echo "system time:${system_time}"

# Fall back to a placeholder job name when no task name was given.
task=${task:-"(unknown_task)"}
[ "${v}" = "1" ] && echo "task: ${task}"
# Build the POST body: a form field "content" holding a JSON object, followed
# by "type=1". It is assembled piecewise so that no whitespace from line
# continuations can leak into the payload.
params="content={\"data_type\":\"usage\""
params+=",\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\""
params+=",\"github_user\":\"${github_user}\",\"job_name\":\"${task}\""
params+=",\"duration\":${duration},\"exit_code\":\"${exit_code}\""
params+="}&type=1"

# The curl command is kept as an array, so each argument stays a single word
# even when the user name, task name or home directory contains spaces. (A
# flat string expanded unquoted would split the -d payload into several
# arguments.)
cookie_file="${PADDLE_CONF_HOME}/paddle.cookie"
curl_cmd_prefix=(curl -m 5 -X POST -d "${params}" -b "${cookie_file}" -c "${cookie_file}")

if [ "${dry_run}" = "1" ]; then
    # Dry run: only show the command for the first url, do not send anything.
    first_url=$(echo ${PD_URLS} | cut -d " " -f 1)
    echo "(dry-run mode)curl command: ${curl_cmd_prefix[*]} ${first_url}"
    exit 0
else
    # Try each url in turn (PD_URLS is a whitespace-separated list, so it is
    # deliberately left unquoted here); stop at the first successful upload.
    for u in ${PD_URLS}; do
        if [ "${v}" = "1" ]; then echo "run: ${curl_cmd_prefix[*]} ${u}"; fi
        if "${curl_cmd_prefix[@]}" "${u}" >/dev/null 2>&1; then
            if [ "${v}" = "1" ]; then echo "upload OK!"; fi
            exit 0
        else
            if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
        fi
    done
    if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
    exit 1
fi
...@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/pserver/ParameterServer2.h" #include <fenv.h>
#include "paddle/utils/Common.h" #include "paddle/pserver/ParameterServerController.h"
#include "paddle/utils/PythonUtil.h" #include "paddle/utils/PythonUtil.h"
#include "paddle/utils/StringUtil.h"
#include "ParamUtil.h" #include "ParamUtil.h"
#include "Trainer.h" #include "Trainer.h"
#include "paddle/pserver/RDMANetwork.h"
DEFINE_bool(start_pserver, false, "Whether to start pserver"); DEFINE_bool(start_pserver, false, "Whether to start pserver");
DECLARE_int32(gpu_id); DECLARE_int32(gpu_id);
...@@ -38,54 +36,11 @@ int main(int argc, char** argv) { ...@@ -38,54 +36,11 @@ int main(int argc, char** argv) {
initMain(argc, argv); initMain(argc, argv);
initPython(argc, argv); initPython(argc, argv);
std::vector<std::unique_ptr<ParameterServer2>> pservers; std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
std::vector<std::string> devices;
if (FLAGS_start_pserver) { if (FLAGS_start_pserver) {
// round robin to loadbalance RDMA server ENGINE parameterServerPtr.reset(
int rdmaCpu = 0; paddle::ParameterServerController::createFromGflags());
int onlineCpus = rdma::numCpus(); parameterServerPtr->start();
int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
if (FLAGS_nics.empty()) {
pservers.resize(numPorts);
for (int i = 0; i < numPorts; ++i) {
if (FLAGS_rdma_tcp == "rdma") {
pservers[i].reset(
new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
rdmaCpu = rdmaCpu % onlineCpus;
} else {
pservers[i].reset(
new ParameterServer2(std::string(), FLAGS_port + i));
}
CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
<< FLAGS_port + i;
LOG(INFO) << "pserver started : " << FLAGS_port + i;
pservers[i]->start();
}
} else {
str::split(FLAGS_nics, ',', &devices);
pservers.resize(devices.size() * numPorts);
for (int i = 0; i < numPorts; ++i) {
for (size_t j = 0; j < devices.size(); ++j) {
if (FLAGS_rdma_tcp == "rdma") {
pservers[i * devices.size() + j].reset(new ParameterServer2(
getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
rdmaCpu = rdmaCpu % onlineCpus;
} else {
pservers[i * devices.size() + j].reset(
new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
}
CHECK(pservers[i * devices.size() + j]->init())
<< "Fail to initialize parameter server" << devices[j]
<< FLAGS_port + i;
LOG(INFO) << "pserver started : " << devices[j] << ":"
<< FLAGS_port + i;
pservers[i * devices.size() + j]->start();
}
}
}
} }
Trainer trainer; Trainer trainer;
auto config = TrainerConfigHelper::createFromFlags(); auto config = TrainerConfigHelper::createFromFlags();
......
...@@ -4,7 +4,8 @@ set(proto_filenames ...@@ -4,7 +4,8 @@ set(proto_filenames
ModelConfig.proto ModelConfig.proto
ParameterConfig.proto ParameterConfig.proto
ParameterService.proto ParameterService.proto
TrainerConfig.proto) TrainerConfig.proto
ParameterServerConfig.proto)
set(PROTO_GEN) set(PROTO_GEN)
set(PROTO_GEN_PY) set(PROTO_GEN_PY)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle;
/**
* Configuration structure for ParameterClient2.
*
* Holds a single mandatory field, trainer_id.
*/
message ParameterClientConfig {
// Required, no default: must be set before the message is considered
// initialized. Presumably identifies the trainer that owns this client --
// TODO confirm against the ParameterClient2 implementation.
required int32 trainer_id = 1;
}
/**
* Configuration structure for ParameterServer2.
*
* Every field below is declared `required` and also carries a `default`.
* NOTE(review): a proto2 `required` field must be set explicitly for the
* message to be initialized, so confirm that every writer populates all
* nine fields rather than relying on the declared defaults.
*/
message ParameterServerConfig {
// Number of ports used for sending parameters; port numbers are
// assigned incrementally, starting from the default port number (`port`).
required int32 ports_num = 1 [default = 1];
// Number of ports used for sending parameters on the sparse path;
// presumably assigned incrementally starting from (port + ports_num) --
// TODO confirm against the server implementation.
required int32 ports_num_for_sparse = 2 [default = 0];
// Network device names for pservers, written as a comma-separated list
// (see the default value).
required string nics = 3 [default = "xgbe0,xgbe1"];
// Transport selector; defaults to "tcp".
// NOTE(review): "rdma" looks like the other accepted value -- confirm.
required string rdma_tcp = 4 [default = "tcp"];
// Listening port for pserver.
required int32 port = 5 [default = 20134];
// Number of gradient servers.
required int32 num_gradient_servers = 6 [default = 1];
// Number of threads used to execute sync ops.
required int32 pserver_num_threads = 7 [default = 1];
// Minimum value allowed for config_.async_lagged_grad_discard_ratio().
required double async_lagged_ratio_min = 8 [default = 1.0];
// Used as the default value when async_lagged_grad_discard_ratio is not
// set in trainer_config.conf.
required double async_lagged_ratio_default = 9 [default = 1.5];
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册