- ResNet-50
@@ -47,9 +43,7 @@ TBD
| MKLML | 32.52 | 31.89 | 33.12 |
| MKL-DNN | 81.69 | 82.35 | 84.08 |
-
-chart on batch size 128
-TBD
+
- GoogLeNet
@@ -59,10 +53,35 @@ TBD
| MKLML | 128.46| 137.89| 158.63 |
| MKL-DNN | 250.46| 264.83| 269.50 |
-chart on batch size 128
-TBD
+
+
+#### Inference
+Tested on batch sizes 1, 2, 4, 8, and 16 on an Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz.
+- VGG-19
+
+| BatchSize | 1 | 2 | 4 | 8 | 16 |
+|-----------|-------|-------|-------|-------|-------|
+| OpenBLAS | 1.07 | 1.08 | 1.06 | 0.88 | 0.65 |
+| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 |
+| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
+
+- ResNet-50
+
+| BatchSize | 1 | 2 | 4 | 8 | 16 |
+|-----------|-------|--------|--------|--------|--------|
+| OpenBLAS | 3.35 | 3.19 | 3.09 | 2.55 | 1.96 |
+| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 |
+| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
+
+
+- GoogLeNet
+
+| BatchSize | 1 | 2 | 4 | 8 | 16 |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS | 12.04 | 11.31 | 10.00 | 9.07 | 4.34 |
+| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
+| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
+
### Laptop
TBD
-### Desktop
-TBD
diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png
new file mode 100644
index 0000000000000000000000000000000000000000..c3f67faf096fe9b45dd815f294b41679dc7c9e54
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ
diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png
new file mode 100644
index 0000000000000000000000000000000000000000..b96ecd5ff940c0d000613b1ed1f11fb16796cf47
Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ
diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png
new file mode 100644
index 0000000000000000000000000000000000000000..f830ca6a87d10b72a5113636dd5686ab25a2e864
Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
index 3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0..77d130ae34059d1e87040d00346ac1dadd86b0d8 100644
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -6,8 +6,18 @@ height = 227
width = 227
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)  # reused as the group count for the grouped conv layers below
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+ 'height': height,
+ 'width': width,
+ 'color': True,
+ 'num_class': num_class,
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
+}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv2
net = img_conv_layer(
- input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+ input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
@@ -40,11 +50,11 @@ net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1)
# conv4
net = img_conv_layer(
- input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+ input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
# conv5
net = img_conv_layer(
- input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+ input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
net = img_pool_layer(input=net, pool_size=3, stride=2)
net = fc_layer(
@@ -59,6 +69,9 @@ net = fc_layer(
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+ outputs(net)
+else:
+ lab = data_layer('label', num_class)
+ loss = cross_entropy(input=net, label=lab)
+ outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
index 7059c13bd2c2b98eb3fbcf633a6f7064e54d5402..2a850ccb7f2c75b467554181fc5f4aa8f2b97a09 100644
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
index 927b1759941f362ef4b5ffe84dd01332986d9306..1018ec9ce1e529f618ddd7b7afa72a84c5e876a1 100644
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
else:
settings.data_size = settings.height * settings.width
settings.is_infer = kwargs.get('is_infer', False)
+ settings.num_samples = kwargs.get('num_samples', 2560)
if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)]
else:
@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
- for i in xrange(2560 if settings.is_infer else 1024):
+ for i in xrange(settings.num_samples):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
if settings.is_infer:
yield img.astype('float32')
diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
index 4a14363ff1db48a5072cbb5f5eb3bc9241ffca8f..2846e4763f1cda4602f03af5ec649d57ee6cf0d8 100644
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
similarity index 87%
rename from benchmark/paddle/image/run_mkldnn_infer.sh
rename to benchmark/paddle/image/run_mkl_infer.sh
index 03a76c0540092501b33e1fdd430ae4e754744fd0..62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7 100755
--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@@ -4,7 +4,7 @@ function clock_to_seconds() {
hours=`echo $1 | awk -F ':' '{print $1}'`
mins=`echo $1 | awk -F ':' '{print $2}'`
secs=`echo $1 | awk -F ':' '{print $3}'`
- echo `bc -l <<< "$secs + $mins * 60 + $hours * 3600"`
+ echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
function infer() {
@@ -37,7 +37,7 @@ function infer() {
--trainer_count=1 \
--num_passes=1 \
--save_dir="models/${topology}-${layer_num}" \
- --config_args="batch_size=128,layer_num=${layer_num}" \
+ --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
> /dev/null 2>&1
echo "Done"
fi
@@ -58,9 +58,9 @@ function infer() {
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
- fps=`bc <<< "scale = 2; 1280 / ($end_sec - $start_sec)"`
+ fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
- echo "FPS: $fps images/sec" >> ${log}
+ echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
@@ -79,8 +79,9 @@ fi
# inference benchmark
for use_mkldnn in True False; do
for batchsize in 1 2 4 8 16; do
- infer googlenet v1 $batchsize $use_mkldnn
- infer resnet 50 $batchsize $use_mkldnn
infer vgg 19 $batchsize $use_mkldnn
+ infer resnet 50 $batchsize $use_mkldnn
+ infer googlenet v1 $batchsize $use_mkldnn
+ infer alexnet 2 $batchsize $use_mkldnn
done
done
diff --git a/benchmark/paddle/image/run_mkldnn_train.sh b/benchmark/paddle/image/run_mkl_train.sh
similarity index 83%
rename from benchmark/paddle/image/run_mkldnn_train.sh
rename to benchmark/paddle/image/run_mkl_train.sh
index 320206239ae960bd088b05d3b10934a98da741b1..03d2d378fb72e36f765d89af788f6ee96fe21d4e 100755
--- a/benchmark/paddle/image/run_mkldnn_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -28,6 +28,10 @@ function train() {
--test_period=100 \
--config_args=$args \
2>&1 | tee ${log}
+
+ avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+ fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+ echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
@@ -43,5 +47,6 @@ for use_mkldnn in True False; do
train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn
+ train alexnet 2 $batchsize $use_mkldnn
done
done
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
new file mode 100755
index 0000000000000000000000000000000000000000..da034f3b9dff794e22086a5295ad2b0c2361c356
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -0,0 +1,64 @@
+set -e
+
+function clock_to_seconds() {
+ hours=`echo $1 | awk -F ':' '{print $1}'`
+ mins=`echo $1 | awk -F ':' '{print $2}'`
+ secs=`echo $1 | awk -F ':' '{print $3}'`
+ echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+ unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+ topology=$1
+ layer_num=$2
+ bs=$3
+ thread=`nproc`
+ if [ $thread -gt $bs ]; then
+ thread=$bs
+ fi
+ log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+
+ models_in="models/${topology}-${layer_num}/pass-00000/"
+ if [ ! -d $models_in ]; then
+ echo "./run_mkl_infer.sh to save the model first"
+ exit 0
+ fi
+ log_period=$((32 / bs))
+ paddle train --job=test \
+ --config="${topology}.py" \
+ --use_mkldnn=False \
+ --use_gpu=False \
+ --trainer_count=$thread \
+ --log_period=$log_period \
+ --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
+ --init_model_path=$models_in \
+ 2>&1 | tee ${log}
+
+  # Calculate the time of the last 5 log periods, i.e. 160 (=32*5) samples;
+  # the time before that is treated as warm-up and skipped.
+ start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+ end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+ start_sec=`clock_to_seconds $start`
+ end_sec=`clock_to_seconds $end`
+ fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+ echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+ echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+ echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+ echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+# inference benchmark
+for batchsize in 1 2 4 8 16; do
+ infer vgg 19 $batchsize
+ infer resnet 50 $batchsize
+ infer googlenet v1 $batchsize
+ infer alexnet 2 $batchsize
+done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e9df83fee2a3f796b7234b39619364f6ee4d5dc9
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -0,0 +1,41 @@
+set -e
+
+function train() {
+ unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+ topology=$1
+ layer_num=$2
+ bs=$3
+ thread=`nproc`
+  # each trainer uses only 1 core to avoid conflicts
+ log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+ args="batch_size=${bs},layer_num=${layer_num}"
+ config="${topology}.py"
+ paddle train --job=time \
+ --config=$config \
+ --use_mkldnn=False \
+ --use_gpu=False \
+ --trainer_count=$thread \
+ --log_period=3 \
+ --test_period=30 \
+ --config_args=$args \
+ 2>&1 | tee ${log}
+
+ avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+ fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+ echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+ echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+# training benchmark
+for batchsize in 64 128 256; do
+ train vgg 19 $batchsize
+ train resnet 50 $batchsize
+ train googlenet v1 $batchsize
+ train alexnet 2 $batchsize
+done
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
index 8d0a1e97a451cd52ef17e4e326673cc90059ef3c..ca0a6798fb8c35b68cf84d263855955eb93ba0b0 100644
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index b21fc43904d9aafe9f7d019dfbe5b1c0d3f9e2d6..6320b17520a687f88993b6f464d9115838b0f96b 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -3,7 +3,7 @@
# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
#
# If any cblas implementation found, the following variable will be set.
-# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE
+# CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries should be linked by paddle.
# # Each library should be full path to object file.
@@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
set(CBLAS_INC_DIR ${MKLML_INC_DIR})
set(CBLAS_LIBRARIES ${MKLML_LIB})
- add_definitions(-DPADDLE_USE_MKLML)
+ add_definitions(-DPADDLE_WITH_MKLML)
add_definitions(-DLAPACK_FOUND)
message(STATUS "Found cblas and lapack in MKLML "
@@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
return()
endif()
-## Then find atlas.
-set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
-set(ATLAS_INCLUDE_SEARCH_PATHS
- ${ATLAS_ROOT}/include
- /usr/include
- /usr/include/atlas)
-set(ATLAS_LIB_SEARCH_PATHS
- ${ATLAS_ROOT}/lib
- /usr/lib
- /usr/lib/blas/atlas
- /usr/lib/atlas
- /usr/lib/atlas-base # special for ubuntu 14.04.
- )
-find_path(ATLAS_INC_DIR NAMES cblas.h
- PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
- PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
- PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
- PATHS ${ATLAS_LIB_SEARCH_PATHS})
-
-if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
- set(CBLAS_FOUND ON)
- set(CBLAS_PROVIDER ATLAS)
- set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
- set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
-
- add_definitions(-DPADDLE_USE_ATLAS)
- add_definitions(-DLAPACK_FOUND)
-
- message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
- message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
- return()
-endif()
-
## Then find openblas.
set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
set(OPENBLAS_INCLUDE_SEARCH_PATHS
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e550ec285668ea25757eeee9e7c5dc48fc9d339d..5c6bcfde76a1201f792d04766d698db8cd395a49 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
endif(WITH_DOUBLE)
+if(WITH_ARM_FP16)
+ add_definitions(-DPADDLE_ARM_FP16)
+ add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
index ac456933bd2260b2bbde2de78c486a5c0a1f5a96..aec51410b33669f8a549f2eca193cc6aa2d07a13 100644
--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
BUILD_IN_SOURCE 1
- BUILD_COMMAND make
+ BUILD_COMMAND make -j8
INSTALL_COMMAND make install
)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index fc52d339d7a336b44c97f2e0a9fc8d6604854365..5d24caebdcc5a28823164d718fb1628be5c4179d 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
-add_definitions(-DPADDLE_USE_MKLDNN)
+add_definitions(-DPADDLE_WITH_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 4c4f59656dae68739f2f07f3febd510e727fe2dd..97857a686b38d935b19f510ecdcb66bcca91fe03 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -114,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
- ADD_LIBRARY(cblas SHARED ${dummyfile})
-ELSE()
- ADD_LIBRARY(cblas STATIC ${dummyfile})
-ENDIF()
+ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
IF(NOT ${CBLAS_FOUND})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index fab2af362bb070a54987b6499748056f3d12a56b..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID)
- INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+ INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
ELSE()
- INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+ INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
ENDIF()
ENDIF()
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
index 9be0b370ee5e301aee4a6e31b1cfa905754968e8..84f9097a6cdc2da269bd6a0685796e14e26da37e 100644
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -7,3 +7,4 @@ API
    Model Configuration


+
+After compiling, the graph is as shown below:
+
+
+
+Operators are added to the sub-graphs. Every GPU is assigned a role such as `rank0`, `rank1`, etc.
+
+- **Broadcast**. The Broadcast operator distributes an initialized parameter from the GPU that owns it (e.g. the `rank0` GPU) to all the other GPUs.
+- **AllReduce**. The AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce is implemented with a ring-based communication method, which avoids the bottleneck of a single GPU.
+
+Note that the AllReduce operator forces the GPUs to synchronize at that point. Whether the whole training process runs in asynchronous or synchronous mode depends on where the AllReduce appears in the graph.
+
+As shown in the picture, each GPU computes the gradient of `W`; the following `AllReduce` operator accumulates `dW` over the full batch of data, and then each GPU runs the optimization process individually and applies the gradient to its own `W` (a minimal sketch of this step follows).
+
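+A minimal C++ sketch of this per-GPU step, simulated in a single process (illustrative only, not Fluid code; the tensor shapes and learning rate are assumptions):
+
+```cpp
+// Each rank computes its local dW; AllReduce sums dW across ranks so
+// every rank holds the full-batch gradient; each rank then applies
+// the update to its own copy of W.
+#include <cstddef>
+#include <vector>
+
+using Tensor = std::vector<float>;
+
+// Simulated AllReduce: sum gradients across all ranks, in place.
+void AllReduceSum(std::vector<Tensor>& per_rank_dW) {
+  Tensor sum(per_rank_dW[0].size(), 0.0f);
+  for (const auto& g : per_rank_dW)
+    for (size_t i = 0; i < sum.size(); ++i) sum[i] += g[i];
+  for (auto& g : per_rank_dW) g = sum;  // every rank ends with the same dW
+}
+
+int main() {
+  std::vector<Tensor> W(2, Tensor(2, 0.0f));              // one copy of W per rank
+  std::vector<Tensor> dW = {{1.0f, 2.0f}, {3.0f, 4.0f}};  // local gradients
+  AllReduceSum(dW);
+  const float lr = 0.1f;
+  for (size_t r = 0; r < W.size(); ++r)        // each rank optimizes
+    for (size_t i = 0; i < W[r].size(); ++i)   // its own W individually
+      W[r][i] -= lr * dW[r][i];
+  return 0;
+}
+```
+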
+- **AllReduce**
+  Note that our AllReduce operator is a ring-based AllReduce implementation. If we used the NCCL2 AllReduce primitive instead, every GPU would optimize the full batch of data, wasting the compute resources of (n-1) GPUs. In addition, the NCCL2 built-in AllReduce only uses the communication resource during synchronization, and updating the gradients would be a subsequent phase. In fact, we can amortize the gradient-update cost into the communication phase. The process is as follows (a cost sketch is given after the list):
+1. Every parameter has its root card. That card is responsible for aggregating the gradients from the other GPUs.
+2. The model's parameters are hashed to different root cards to ensure load balance between GPUs.
+3. Logically neighboring cards send their parameters to the next card. After one round, each parameter's root card has aggregated the full gradient.
+4. The root card then optimizes the parameter.
+5. The root card sends its optimized result to its neighbor, and each neighbor forwards it to the next card.
+6. This finishes the synchronization round.
+
+The total time cost is 2 * (n-1) * per-parameter-send-time, so we achieve the goal of amortizing the update time into the communication phase.
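+A back-of-the-envelope sketch of this cost model (plain C++, illustrative only; `n` and `send_time` are assumed values):
+
+```cpp
+// Aggregating a parameter to its root card takes (n-1) hops (step 3),
+// and propagating the optimized result back takes (n-1) more hops
+// (steps 5-6), so the total is 2*(n-1)*per-parameter-send-time.
+#include <iostream>
+
+int main() {
+  const int n = 8;               // number of GPUs in the ring (assumed)
+  const double send_time = 0.5;  // per-parameter send time in ms (assumed)
+  const double aggregate = (n - 1) * send_time;  // hops to the root card
+  const double propagate = (n - 1) * send_time;  // hops back around the ring
+  std::cout << "total sync cost: " << aggregate + propagate << " ms\n";
+  return 0;
+}
+```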
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
index 2b4f921ae93c3b443ed62a28b1fa9fbda14f73ab..d9fe7d6bbb0eeb73fcdca3ee749a4f10bcdda682 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -53,7 +53,7 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the
The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
-[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
+[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
## Distributed Training Architecture
diff --git a/doc/design/refactor/multi_cpu.md b/doc/design/refactor/multi_cpu.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8d8ee0422acc84835170a44eb83f9b5f0c6bb40
--- /dev/null
+++ b/doc/design/refactor/multi_cpu.md
@@ -0,0 +1,43 @@
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This design doc proposes an approach to run a user-defined Op graph
+on multiple CPUs: an auto transpiler converts the user-defined
+Op graph into a multi-CPU Op graph, which is then executed with the `ParallelDo` Op.
+
+## Transpiler
+
+<img src="src/multi-threads/single-thread@3x.png">
+
+After conversion:
+
+<img src="src/multi-threads/multi-threads@3x.png">
+
+## Implementation
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph,
+  which will be executed with multiple threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and `Wait`
+  blocks until the atomic counter becomes `0`:
+  ```cpp
+  // Launch thread_count tasks; each one decrements the counter when it runs.
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] { bc.DecrementCount(); });
+  }
+  bc.Wait();  // blocks until the counter reaches 0
+  ```
+- `ParallelDo` Operator
+  - Initializes a thread pool, which is a singleton.
+  - Takes a block id as input, and runs the specified Block on independent scopes
+    with multiple threads.
+  - Initializes a `BlockingCounter` instance and waits until all threads are done.
+- `Split` Operator will split the input Tensor into a TensorArray.
+- `Merge` merges all the gradients computed in different threads
+  with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W` (see the sketch below).
+
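+A simplified C++ sketch of the `Split`/`ParallelDo`/`Merge` flow above (illustrative only; the real implementation runs Fluid Blocks on independent scopes, not a lambda):
+
+```cpp
+// Split the input across threads, run the "block" on independent
+// chunks, wait for all workers, then merge the partial results.
+#include <numeric>
+#include <thread>
+#include <vector>
+
+float ParallelDo(const std::vector<float>& input, int thread_count) {
+  const size_t chunk = input.size() / thread_count;
+  std::vector<float> partials(thread_count, 0.0f);
+  std::vector<std::thread> workers;
+  for (int i = 0; i < thread_count; ++i) {
+    workers.emplace_back([&, i] {
+      // Stand-in for running the specified Block on an independent scope.
+      auto begin = input.begin() + i * chunk;
+      auto end = (i + 1 == thread_count) ? input.end() : begin + chunk;
+      partials[i] = std::accumulate(begin, end, 0.0f);
+    });
+  }
+  for (auto& w : workers) w.join();  // acts like BlockingCounter::Wait()
+  // Merge with `sum`; `mean/max/min` would work analogously.
+  return std::accumulate(partials.begin(), partials.end(), 0.0f);
+}
+```
+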
+## TODO
+
+- Improve the optimizer stage with multi-threading, since we could
+  assign the parameters to different threads and execute the
+  optimizer with multiple threads.
diff --git a/doc/design/refactor/src/multi-threads.graffle b/doc/design/refactor/src/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/design/refactor/src/multi-threads.graffle differ
diff --git a/doc/design/refactor/src/multi-threads/multi-threads@3x.png b/doc/design/refactor/src/multi-threads/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/design/refactor/src/multi-threads/multi-threads@3x.png differ
diff --git a/doc/design/refactor/src/multi-threads/single-thread@3x.png b/doc/design/refactor/src/multi-threads/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/design/refactor/src/multi-threads/single-thread@3x.png differ
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
index 62ff8f3229bbbb5bc82e4da29259baffc30c2c87..14c081ea84282e52a2e36475c3c0ea755122d154 100644
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -5,8 +5,9 @@ PaddlePaddle uses the git-flow branching model for branch management and [Semantic Vers
Each time PaddlePaddle releases a new version, the following process is followed:
1. Derive a new branch from the `develop` branch, named `release/<version>`. For example, `release/0.10.0`.
-2. Tag the version on the new branch as `<version>rc.<patch number>`. The first tag is `0.10.0rc1`, the second `0.10.0rc2`, and so on.
-3. For the commits of this version, perform the following operations:
+1. Tag the version on the new branch as `<version>rc.<patch number>`. The first tag is `0.10.0rc1`, the second `0.10.0rc2`, and so on.
+1. For the commits of this version, perform the following operations:
+	* Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
	* Build this version's Docker release image and publish it to DockerHub. If that fails, fix the Docker image build, increment the patch number, and go back to step two.
	* Build this version's Ubuntu deb package. If that fails, fix the deb package build, increment the patch number, and go back to step two.
	* Use the Regression Test List as a checklist to test the functional correctness of the Docker image and the Ubuntu package.
@@ -20,9 +21,9 @@ Each time PaddlePaddle releases a new version, the following process is followed:
pip install twine
twine upload dist/[package to upload]
```
-4. After step three is complete, merge the `release/<version>` branch into master and delete the `release/<version>` branch. Tag the merge commit on master with `<version>`. Also merge the `master` branch back into `develop`. Finally, delete the `release/<version>` branch.
-5. Build the Docker release image from the master branch and publish it to DockerHub. Build the Ubuntu deb package and publish it on the GitHub release page.
-6. Collaborate on writing the Release Notes.
+1. After step three is complete, merge the `release/<version>` branch into master and delete the `release/<version>` branch. Tag the merge commit on master with `<version>`. Also merge the `master` branch back into `develop`. Finally, delete the `release/<version>` branch.
+1. Build the Docker release image from the master branch and publish it to DockerHub. Build the Ubuntu deb package and publish it on the GitHub release page.
+1. Collaborate on writing the Release Notes.

Note that:
@@ -30,7 +31,7 @@ Each time PaddlePaddle releases a new version, the following process is followed:
* Once a `release/<version>` branch is created, merging from `develop` into `release/<version>` is generally no longer allowed. This keeps the `release/<version>` branch functionally closed and makes it easier for testers to verify PaddlePaddle's behavior.
* While a `release/<version>` branch exists, bugfix branches must be merged into all three branches: `master`, `develop`, and `release/<version>`.
-# PaddlePaddle Branching Conventions
+## PaddlePaddle Branching Conventions
PaddlePaddle development uses the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching conventions, with a few differences to fit GitHub's features.
@@ -47,11 +48,11 @@ PaddlePaddle development uses the [git-flow](http://nvie.com/posts/a-successful-git-
* BugFix branches are likewise maintained in the developer's own fork. Unlike feature branches, a BugFix branch must open `Pull Request`s simultaneously against the main repository's `master`, `develop`, and, if present, `release/<version>` branches.
-# PaddlePaddle Regression Test List
+## PaddlePaddle Regression Test List
This list describes the features that must be tested before each PaddlePaddle release.
-## All Chapters of the PaddlePaddle Book
+### All Chapters of the PaddlePaddle Book
Every PaddlePaddle release must first guarantee the functional correctness of all chapters of the PaddlePaddle Book. This includes verifying model training correctness both with the current `paddle_trainer` and purely with `Python`.
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
new file mode 100644
index 0000000000000000000000000000000000000000..fd23dc211a35fdc9d87bc9233fcf4e90254da748
--- /dev/null
+++ b/doc/design/support_new_device.md
@@ -0,0 +1,248 @@
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support the Eigen and MKL computing libraries, while Nvidia GPUs support the Eigen and cuDNN computing libraries. We have to implement operator-specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of Fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicate the device id and manage hardware resources
+
+- Memory and Tensor: allocate/free data on a certain device
+
+- Math Functor and OpKernel: implement the computing units on certain devices/libraries
+
+### Place and DeviceContext
+
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+
+```
+ | CPUPlace --> MKLDNNPlace
+Place --| CUDAPlace --> CUDNNPlace
+ | FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```cpp
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
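+
+A small, self-contained sketch of dispatching on such a variant-based `Place` (illustrative; `PlacePrinter` is a hypothetical visitor, not a real Fluid class):
+
+```cpp
+#include <boost/variant.hpp>
+#include <iostream>
+
+struct CPUPlace {};
+struct CUDAPlace { int device_id; };
+
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+
+// Visitor that dispatches on the concrete place type.
+struct PlacePrinter : public boost::static_visitor<void> {
+  void operator()(const CPUPlace&) const { std::cout << "CPUPlace\n"; }
+  void operator()(const CUDAPlace& p) const {
+    std::cout << "CUDAPlace(" << p.device_id << ")\n";
+  }
+};
+
+int main() {
+  Place place = CUDAPlace{0};
+  PlacePrinter printer;
+  boost::apply_visitor(printer, place);  // prints CUDAPlace(0)
+  return 0;
+}
+```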
@@ -32,10 +15,11 @@
When training a neural network with synchronous SGD, PaddlePaddle uses a synchronization barrier so that gradient submission and parameter updates execute in order. With asynchronous SGD, parameter updates do not wait for all trainers to submit their gradients, which greatly improves computational parallelism: parameter servers do not depend on each other, receiving gradients and updating parameters in parallel; a parameter server does not wait for all trainers to submit gradients before starting the next step; and trainers do not depend on each other either, running model training in parallel. Note that although asynchronous SGD improves the parallelism of parameter updates, it cannot guarantee that parameters are updated synchronously: at any given moment, the parameters stored on one parameter server may be newer than those on another, so compared with synchronous SGD, the gradients are noisy. A minimal sketch contrasting the two update modes follows.
+
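Below is a minimal C++ sketch contrasting the two update modes (illustrative only, not PaddlePaddle code; `SyncUpdate`/`AsyncUpdate` are hypothetical names):

```cpp
#include <cstddef>
#include <vector>

struct Gradient { std::vector<float> values; };

void ApplySGD(std::vector<float>& param, const std::vector<float>& g, float lr) {
  for (size_t i = 0; i < param.size(); ++i) param[i] -= lr * g[i];
}

// Synchronous SGD: a barrier waits for all trainers, then one averaged update.
void SyncUpdate(std::vector<float>& param,
                const std::vector<Gradient>& all_grads, float lr) {
  std::vector<float> avg(param.size(), 0.0f);
  for (const auto& g : all_grads)
    for (size_t i = 0; i < avg.size(); ++i)
      avg[i] += g.values[i] / all_grads.size();
  ApplySGD(param, avg, lr);
}

// Asynchronous SGD: apply each gradient as soon as it arrives; no barrier,
// so parameters on different parameter servers may diverge (noisy gradients).
void AsyncUpdate(std::vector<float>& param, const Gradient& g, float lr) {
  ApplySGD(param, g.values, lr);
}
```
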
## Environment Preparation
1. Prepare your compute cluster. A compute cluster usually consists of a group of Linux servers (from a few to several thousand machines). The servers are connected via a local area network (LAN), and each server has an IP address unique within the cluster (or a hostname resolvable by DNS). Each machine in the cluster is usually called a "node".
-1. We need to install PaddlePaddle on every node of the cluster. If GPUs are to be enabled, the corresponding GPU driver and CUDA also need to be installed on the nodes. The various ways of installing PaddlePaddle are described in [build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install). We recommend the [Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst) installation method to install PaddlePaddle quickly.
+1. We need to install PaddlePaddle on every node of the cluster. If GPUs are to be enabled, the corresponding GPU driver and CUDA also need to be installed on the nodes. The various ways of installing PaddlePaddle are described in [build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html). We recommend the [Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html) installation method to install PaddlePaddle quickly.
After installation, run the following command to check the installed version (with the Docker installation you can enter the container first: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
@@ -63,12 +47,12 @@ $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradie
$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
```
-| Parameter | Required | Default | Description |
-| ------------- | ------------- | ------------- | ------------- |
-| port | required | 7164 | the starting port the pserver listens on; the total number of ports is determined by ports_num |

+| IOS_PLATFORM | IOS_ARCH |
+|--------------|----------|
+| OS | armv7, armv7s, arm64 |
+| SIMULATOR | i386, x86_64 |