提交 f173341f 编写于 作者: Q qijun

Merge remote-tracking branch 'baidu/develop' into feature/sppnet

...@@ -109,11 +109,9 @@ else() ...@@ -109,11 +109,9 @@ else()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math") set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math")
if(WITH_AVX) if(WITH_AVX)
if(AVX_FOUND) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -mavx")
endif(AVX_FOUND)
else(WITH_AVX) else(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -msse3") set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
endif(WITH_AVX) endif(WITH_AVX)
if(WITH_DSO) if(WITH_DSO)
...@@ -138,11 +136,11 @@ if(NOT WITH_TIMER) ...@@ -138,11 +136,11 @@ if(NOT WITH_TIMER)
endif(NOT WITH_TIMER) endif(NOT WITH_TIMER)
if(WITH_AVX) if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
else(WITH_AVX) else(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
endif(WITH_AVX) endif(WITH_AVX)
if(WITH_PYTHON) if(WITH_PYTHON)
......
...@@ -3,36 +3,55 @@ ...@@ -3,36 +3,55 @@
INCLUDE(CheckCXXSourceRuns) INCLUDE(CheckCXXSourceRuns)
SET(FIND_AVX_10)
SET(FIND_AVX_20)
SET(AVX_FLAGS)
SET(AVX_FOUND)
# Check AVX 2
SET(CMAKE_REQUIRED_FLAGS)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(CMAKE_REQUIRED_FLAGS "-mavx2") set(MMX_FLAG "-mmmx")
ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS set(SSE2_FLAG "-msse2")
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2") set(SSE3_FLAG "-msse3")
SET(AVX_FLAG "-mavx")
SET(AVX2_FLAG "-mavx2")
ELSEIF(MSVC)
set(MMX_FLAG "/arch:MMX")
set(SSE2_FLAG "/arch:SSE2")
set(SSE3_FLAG "/arch:SSE3")
SET(AVX_FLAG "/arch:AVX")
SET(AVX2_FLAG "/arch:AVX2")
ENDIF() ENDIF()
# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
CHECK_CXX_SOURCE_RUNS(" CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h> #include <mmintrin.h>
int main() int main()
{ {
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); _mm_setzero_si64();
__m256i result = _mm256_abs_epi32 (a);
return 0; return 0;
}" FIND_AVX_20) }" MMX_FOUND)
# Check AVX # Check SSE2
SET(CMAKE_REQUIRED_FLAGS) set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") CHECK_CXX_SOURCE_RUNS("
SET(CMAKE_REQUIRED_FLAGS "-mavx") #include <emmintrin.h>
ELSEIF(MSVC AND NOT CMAKE_CL_64) int main()
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX") {
endif() _mm_setzero_si128();
return 0;
}" SSE2_FOUND)
# Check SSE3
set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
CHECK_CXX_SOURCE_RUNS("
#include <pmmintrin.h>
int main()
{
__m128d a = _mm_set1_pd(6.28);
__m128d b = _mm_set1_pd(3.14);
__m128d result = _mm_addsub_pd(a, b);
result = _mm_movedup_pd(result);
return 0;
}" SSE3_FOUND)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
CHECK_CXX_SOURCE_RUNS(" CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h> #include <immintrin.h>
int main() int main()
...@@ -41,25 +60,17 @@ int main() ...@@ -41,25 +60,17 @@ int main()
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b); __m256 result = _mm256_add_ps (a, b);
return 0; return 0;
}" FIND_AVX_10) }" AVX_FOUND)
IF(${FIND_AVX_20})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(AVX_FLAGS "${AVX_FLAGS} -mavx2")
ELSEIF(MSVC)
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2")
ENDIF()
ENDIF()
IF(${FIND_AVX_10}) # Check AVX 2
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
SET(AVX_FLAGS "${AVX_FLAGS} -mavx") CHECK_CXX_SOURCE_RUNS("
ELSEIF(MSVC) #include <immintrin.h>
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX") int main()
ENDIF() {
ENDIF() __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
IF(${FIND_AVX_10}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
SET(AVX_FOUND TRUE)
MESSAGE(STATUS "Find CPU supports ${AVX_FLAGS}.")
ENDIF()
...@@ -5,3 +5,5 @@ plot.png ...@@ -5,3 +5,5 @@ plot.png
train.log train.log
image_provider_copy_1.py image_provider_copy_1.py
*pyc *pyc
train.list
test.list
文件模式从 100644 更改为 100755
...@@ -58,24 +58,29 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg, ...@@ -58,24 +58,29 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
settings.logger.info('DataProvider Initialization finished') settings.logger.info('DataProvider Initialization finished')
@provider(init_hook=hook) @provider(init_hook=hook, min_pool_size=0)
def processData(settings, file_name): def processData(settings, file_list):
""" """
The main function for loading data. The main function for loading data.
Load the batch, iterate all the images and labels in this batch. Load the batch, iterate all the images and labels in this batch.
file_name: the batch file name. file_list: the batch file list.
""" """
data = cPickle.load(io.open(file_name, 'rb')) with open(file_list, 'r') as fdata:
indexes = list(range(len(data['images']))) lines = [line.strip() for line in fdata]
if settings.is_train: random.shuffle(lines)
random.shuffle(indexes) for file_name in lines:
for i in indexes: with io.open(file_name.strip(), 'rb') as file:
if settings.use_jpeg == 1: data = cPickle.load(file)
img = image_util.decode_jpeg(data['images'][i]) indexes = list(range(len(data['images'])))
else: if settings.is_train:
img = data['images'][i] random.shuffle(indexes)
img_feat = image_util.preprocess_img(img, settings.img_mean, for i in indexes:
settings.img_size, settings.is_train, if settings.use_jpeg == 1:
settings.color) img = image_util.decode_jpeg(data['images'][i])
label = data['labels'][i] else:
yield img_feat.tolist(), int(label) img = data['images'][i]
img_feat = image_util.preprocess_img(img, settings.img_mean,
settings.img_size, settings.is_train,
settings.color)
label = data['labels'][i]
yield img_feat.astype('float32'), int(label)
...@@ -35,6 +35,8 @@ if __name__ == '__main__': ...@@ -35,6 +35,8 @@ if __name__ == '__main__':
data_creator = ImageClassificationDatasetCreater(data_dir, data_creator = ImageClassificationDatasetCreater(data_dir,
processed_image_size, processed_image_size,
color) color)
data_creator.train_list_name = "train.txt"
data_creator.test_list_name = "test.txt"
data_creator.num_per_batch = 1000 data_creator.num_per_batch = 1000
data_creator.overwrite = True data_creator.overwrite = True
data_creator.create_batches() data_creator.create_batches()
...@@ -17,3 +17,6 @@ set -e ...@@ -17,3 +17,6 @@ set -e
data_dir=./data/cifar-out data_dir=./data/cifar-out
python preprocess.py -i $data_dir -s 32 -c 1 python preprocess.py -i $data_dir -s 32 -c 1
echo "data/cifar-out/batches/train.txt" > train.list
echo "data/cifar-out/batches/test.txt" > test.list
...@@ -25,8 +25,8 @@ if not is_predict: ...@@ -25,8 +25,8 @@ if not is_predict:
'img_size': 32,'num_classes': 10, 'img_size': 32,'num_classes': 10,
'use_jpeg': 1,'color': "color"} 'use_jpeg': 1,'color': "color"}
define_py_data_sources2(train_list=data_dir+"train.list", define_py_data_sources2(train_list="train.list",
test_list=data_dir+'test.list', test_list="train.list",
module='image_provider', module='image_provider',
obj='processData', obj='processData',
args=args) args=args)
......
# Cluster Training # Distributed Training
We provide some simple scripts ```paddle/scripts/cluster_train``` to help you to launch cluster training Job to harness PaddlePaddle's distributed trainning. For MPI and other cluster scheduler refer this naive script to implement more robust cluster training platform by yourself. In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
The following cluster demo is based on RECOMMENDATION local training demo in PaddlePaddle ```demo/recommendation``` directory. Assuming you enter the ```paddle/scripts/cluster_train/``` directory. [Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and Kubernetes.
## Pre-requirements ## Prerequisite
Firstly, 1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands. We can use `pip` to install fabric:
```bash ```bash
pip install fabric pip install fabric
``` ```
Secondly, go through installing scripts to install PaddlePaddle at all nodes to make sure demo can run as local mode. For CUDA enabled training, we assume that CUDA is installed in ```/usr/local/cuda```, otherwise missed cuda runtime libraries error could be reported at cluster runtime. In one word, the local training environment should be well prepared for the simple scripts.
Then you should prepare same ROOT_DIR directory in all nodes. ROOT_DIR is from in cluster_train/conf.py. Assuming that the ROOT_DIR = /home/paddle, you can create ```paddle``` user account as well, at last ```paddle.py``` can ssh connections to all nodes with ```paddle``` user automatically. 1. We need to install PaddlePaddle on all nodes in the cluster. To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
At last you can create ssh mutual trust relationship between all nodes for easy ssh login, otherwise ```password``` should be provided at runtime from ```paddle.py```. 1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes. For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`. In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password.
## Prepare Job Workspace ## Prepare Job Workspace
```Job workspace``` is defined as one package directory which contains dependency libraries, train data, test data, model config file and all other related file dependencies. We refer to the directory where we put dependent libraries, config files, etc., as *workspace*.
These ```train/test``` data should be prepared before launching cluster job. To satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as ```train.list/test.list``` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files, and all nodes in cluster job will handle files with same logical code in normal condition. These ```train/test``` data should be prepared before launching cluster job. To satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as ```train.list/test.list``` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files, and all nodes in cluster job will handle files with same logical code in normal condition.
......
...@@ -48,7 +48,7 @@ static inline std::string join(const std::string& part1, const std::string& part ...@@ -48,7 +48,7 @@ static inline std::string join(const std::string& part1, const std::string& part
static inline void GetDsoHandleFromDefaultPath( static inline void GetDsoHandleFromDefaultPath(
std::string& dso_path, void** dso_handle, int dynload_flags) { std::string& dso_path, void** dso_handle, int dynload_flags) {
LOG(INFO) << "Try to find cuda library: " << dso_path VLOG(3) << "Try to find cuda library: " << dso_path
<< " from default system path."; << " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags); *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册