+- Alexnet
+
+| BatchSize | 64 | 128 | 256 |
+|--------------|--------| ------ | -------|
+| OpenBLAS | 2.13 | 2.45 | 2.68 |
+| MKLML | 66.37 | 105.60 | 144.04 |
+| MKL-DNN | 399.00 | 498.94 | 626.53 |
+
+chart TBD
+
#### Inference
Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
- VGG-19
@@ -82,6 +93,15 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
+- Alexnet
+
+| BatchSize | 1 | 2 | 4 | 8 | 16 |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS | | | | | |
+| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 |
+| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
+
+chart TBD
### Laptop
TBD
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
index 3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0..cad6051f1413a5bb95f87a940f3aa81e49e5d282 100644
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -6,10 +6,24 @@ height = 227
width = 227
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+ 'height': height,
+ 'width': width,
+ 'color': True,
+ 'num_class': num_class,
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
+}
define_py_data_sources2(
- "train.list", None, module="provider", obj="process", args=args)
+ "train.list" if not is_infer else None,
+ "test.list" if is_infer else None,
+ module="provider",
+ obj="process",
+ args=args)
settings(
batch_size=batch_size,
@@ -31,7 +45,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv2
net = img_conv_layer(
- input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+ input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
@@ -40,11 +54,11 @@ net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1)
# conv4
net = img_conv_layer(
- input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+ input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
# conv5
net = img_conv_layer(
- input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+ input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
net = img_pool_layer(input=net, pool_size=3, stride=2)
net = fc_layer(
@@ -59,6 +73,9 @@ net = fc_layer(
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+ outputs(net)
+else:
+ lab = data_layer('label', num_class)
+ loss = cross_entropy(input=net, label=lab)
+ outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
index 7059c13bd2c2b98eb3fbcf633a6f7064e54d5402..2a850ccb7f2c75b467554181fc5f4aa8f2b97a09 100644
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
index 927b1759941f362ef4b5ffe84dd01332986d9306..1018ec9ce1e529f618ddd7b7afa72a84c5e876a1 100644
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
else:
settings.data_size = settings.height * settings.width
settings.is_infer = kwargs.get('is_infer', False)
+ settings.num_samples = kwargs.get('num_samples', 2560)
if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)]
else:
@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
- for i in xrange(2560 if settings.is_infer else 1024):
+ for i in xrange(settings.num_samples):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
if settings.is_infer:
yield img.astype('float32')
diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
index 4a14363ff1db48a5072cbb5f5eb3bc9241ffca8f..2846e4763f1cda4602f03af5ec649d57ee6cf0d8 100644
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
index d795bcab1b7d098295066f79189d17e8299d28fb..62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7 100755
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@@ -37,7 +37,7 @@ function infer() {
--trainer_count=1 \
--num_passes=1 \
--save_dir="models/${topology}-${layer_num}" \
- --config_args="batch_size=128,layer_num=${layer_num}" \
+ --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
> /dev/null 2>&1
echo "Done"
fi
@@ -79,8 +79,9 @@ fi
# inference benchmark
for use_mkldnn in True False; do
for batchsize in 1 2 4 8 16; do
- infer googlenet v1 $batchsize $use_mkldnn
- infer resnet 50 $batchsize $use_mkldnn
infer vgg 19 $batchsize $use_mkldnn
+ infer resnet 50 $batchsize $use_mkldnn
+ infer googlenet v1 $batchsize $use_mkldnn
+ infer alexnet 2 $batchsize $use_mkldnn
done
done
diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh
index 5335af5ac1b9a4a48ec107b8b6386b50ead8284c..03d2d378fb72e36f765d89af788f6ee96fe21d4e 100755
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -47,5 +47,6 @@ for use_mkldnn in True False; do
train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn
+ train alexnet 2 $batchsize $use_mkldnn
done
done
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
index c1001d3a7c95a293d0b2b5b78fb7415e167b3e9f..71a49231a5527ebee9f45d5f4650ce2a4f6a1c31 100755
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -8,39 +8,44 @@ function clock_to_seconds() {
}
function infer() {
- unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
- thread=`nproc`
- if [ $thread -gt $bs ]; then
- thread=$bs
+ trainers=`nproc`
+ if [ $trainers -gt $bs ]; then
+ trainers=$bs
fi
- log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+ log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
+ threads=$((`nproc` / trainers))
+ if [ $threads -eq 0 ]; then
+ threads=1
+ fi
+ export OPENBLAS_NUM_THREADS=$threads
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
echo "./run_mkl_infer.sh to save the model first"
exit 0
fi
- log_period=$((256 / bs))
+ log_period=$((32 / bs))
paddle train --job=test \
--config="${topology}.py" \
+ --use_mkldnn=False \
--use_gpu=False \
- --trainer_count=$thread \
+ --trainer_count=$trainers \
--log_period=$log_period \
- --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+ --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
--init_model_path=$models_in \
2>&1 | tee ${log}
- # calculate the last 5 logs period time of 1280 samples,
+ # calculate the last 5 logs period time of 160(=32*5) samples,
# the time before are burning time.
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
- fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
- echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+ fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+ echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
@@ -56,7 +61,8 @@ fi
# inference benchmark
for batchsize in 1 2 4 8 16; do
- infer googlenet v1 $batchsize
- infer resnet 50 $batchsize
infer vgg 19 $batchsize
+ infer resnet 50 $batchsize
+ infer googlenet v1 $batchsize
+ infer alexnet 2 $batchsize
done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
index b9494ce119523953a3360b2b67e2cb6f3e0f1643..935cff6f2c97d25d6de556cfee25e27dbe49b5b6 100755
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -1,7 +1,7 @@
set -e
function train() {
- unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+ export OPENBLAS_NUM_THREADS=1
topology=$1
layer_num=$2
bs=$3
@@ -12,10 +12,11 @@ function train() {
config="${topology}.py"
paddle train --job=time \
--config=$config \
+ --use_mkldnn=False \
--use_gpu=False \
--trainer_count=$thread \
- --log_period=10 \
- --test_period=100 \
+ --log_period=3 \
+ --test_period=30 \
--config_args=$args \
2>&1 | tee ${log}
@@ -36,4 +37,5 @@ for batchsize in 64 128 256; do
train vgg 19 $batchsize
train resnet 50 $batchsize
train googlenet v1 $batchsize
+ train alexnet 2 $batchsize
done
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
index 8d0a1e97a451cd52ef17e4e326673cc90059ef3c..ca0a6798fb8c35b68cf84d263855955eb93ba0b0 100644
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
- 'is_infer': is_infer
+ 'is_infer': is_infer,
+ 'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index fab2af362bb070a54987b6499748056f3d12a56b..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID)
- INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+ INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
ELSE()
- INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+ INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
ENDIF()
ENDIF()
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index c3f9c18d0663a7a24880b441981875c1e4f015aa..ddf0b055a92d80295b24255a5462d477e0d9c796 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -252,6 +252,11 @@ first_seq
.. autoclass:: paddle.v2.layer.first_seq
:noindex:
+sub_seq
+---------
+.. autoclass:: paddle.v2.layer.sub_seq
+ :noindex:
+
concat
------
.. autoclass:: paddle.v2.layer.concat
@@ -467,7 +472,7 @@ lambda_cost
:noindex:
square_error_cost
---------
+-----------------
.. autoclass:: paddle.v2.layer.square_error_cost
:noindex:
@@ -533,7 +538,7 @@ Miscs
=====
dropout
---------------
+--------
.. autoclass:: paddle.v2.layer.dropout
:noindex:
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 9f3669e11583a4ed6467f1a1bb509481fdf0b9d1..004ee2d8c85ce7661886179570e693d7d61bc6d8 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -19,17 +19,17 @@ dynamic_lstm
:noindex:
data
----------
+----
.. autofunction:: paddle.v2.fluid.layers.data
:noindex:
mean
----------
+----
.. autofunction:: paddle.v2.fluid.layers.mean
:noindex:
mul
----------
+---
.. autofunction:: paddle.v2.fluid.layers.mul
:noindex:
@@ -45,13 +45,13 @@ elementwise_div
dropout
----------
+-------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex:
reshape
----------
+--------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
@@ -68,12 +68,6 @@ scale
:noindex:
-reshape
----------
-.. autofunction:: paddle.v2.fluid.layers.reshape
- :noindex:
-
-
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
@@ -81,67 +75,67 @@ transpose
sigmoid_cross_entropy_with_logits
----------
+---------------------------------
.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
:noindex:
cast
----------
+----
.. autofunction:: paddle.v2.fluid.layers.cast
:noindex:
concat
----------
+-------
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex:
sums
----------
+----
.. autofunction:: paddle.v2.fluid.layers.sums
:noindex:
linear_chain_crf
----------
+----------------
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
:noindex:
assign
----------
+-------
.. autofunction:: paddle.v2.fluid.layers.embedding
:noindex:
split_lod_tensor
----------
+----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex:
merge_lod_tensor
----------
+----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex:
cos_sim
----------
+--------
.. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex:
cross_entropy
----------
+-------------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex:
square_error_cost
----------
+-----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex:
@@ -153,74 +147,80 @@ accuracy
sequence_conv
----------
+-------------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex:
conv2d
----------
+------
.. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex:
sequence_pool
----------
+-------------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex:
+sequence_first_step
+-------------------
+.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
+ :noindex:
+
+
+sequence_last_step
+------------------
+.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
+ :noindex:
+
+
pool2d
----------
+------
.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex:
batch_norm
----------
+----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex:
beam_search_decode
----------
+------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex:
-lstm
----------
-.. autofunction:: paddle.v2.fluid.layers.lstm
- :noindex:
-
-
lod_rank_table
----------
+--------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex:
max_sequence_len
----------
+----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex:
topk
----------
+-----
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex:
lod_tensor_to_array
----------
+-------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex:
array_to_lod_tensor
----------
+-------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex:
@@ -228,26 +228,26 @@ array_to_lod_tensor
fill_constant
----------
+-------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex:
fill_constant_batch_size_like
----------
+-----------------------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex:
ones
----------
+----
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex:
zeros
----------
+-----
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex:
@@ -259,14 +259,14 @@ increment
array_write
----------
+-----------
.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex:
create_array
----------
+------------
.. autofunction:: paddle.v2.fluid.layers.create_array
:noindex:
@@ -278,29 +278,67 @@ less_than
array_read
----------
+----------
.. autofunction:: paddle.v2.fluid.layers.array_read
:noindex:
shrink_memory
----------
+--------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex:
array_length
----------
+-------------
.. autofunction:: paddle.v2.fluid.layers.array_length
:noindex:
conv2d_transpose
----------
+----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex:
+
sequence_expand
----------
+---------------
.. autofunction:: paddle.v2.fluid.layers.sequence_expand
:noindex:
+
+
+lstm_unit
+---------
+.. autofunction:: paddle.v2.fluid.layers.lstm_unit
+ :noindex:
+
+
+sequence_softmax
+----------------
+.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
+ :noindex:
+
+
+reduce_sum
+----------
+.. autofunction:: paddle.v2.fluid.layers.reduce_sum
+ :noindex:
+
+
+reduce_mean
+-----------
+.. autofunction:: paddle.v2.fluid.layers.reduce_mean
+ :noindex:
+
+
+reduce_max
+----------
+.. autofunction:: paddle.v2.fluid.layers.reduce_max
+ :noindex:
+
+
+reduce_min
+----------
+.. autofunction:: paddle.v2.fluid.layers.reduce_min
+ :noindex:
+
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index 2c3d075422de29c96e25458e831133a30270dd39..b792efb71f85ae643df655568da69c82414e9d5d 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -3,19 +3,19 @@ Nets
===========
simple_img_conv_pool
------------
+--------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
img_conv_group
------------
+---------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
:noindex:
sequence_conv_pool
------------
+------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex:
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
index 233762fcdfb39e592740adef6721a556fae3feef..19b4940f08de3e2f7dc177f2961e538946d10a78 100644
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -18,7 +18,7 @@ SGDOptimizer
MomentumOptimizer
------------
+-----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer
:noindex:
@@ -26,14 +26,14 @@ MomentumOptimizer
AdagradOptimizer
------------
+----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer
:noindex:
AdamOptimizer
------------
+-------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer
:noindex:
@@ -47,7 +47,7 @@ AdamaxOptimizer
DecayedAdagradOptimizer
------------
+-----------------------
.. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer
:noindex:
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
index 3af2b07d2ae55d99df705fbf1ad2402eee05c435..868e225ed3d59e79aeb217fb88081ea25f80fa2c 100644
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -3,14 +3,14 @@ Regularizer
===========
WeightDecayRegularizer
------------
+----------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer
:noindex:
L2DecayRegularizer
------------
+------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer
:noindex:
@@ -18,7 +18,7 @@ L2DecayRegularizer
L1DecayRegularizer
------------
+-------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer
diff --git a/doc/design/backward.md b/doc/design/backward.md
new file mode 100644
index 0000000000000000000000000000000000000000..20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a
--- /dev/null
+++ b/doc/design/backward.md
@@ -0,0 +1,158 @@
+# Backward Building
+
+## Motivation
+
+In neural networks, most models are currently trained with the backpropagation algorithm (known as **BP**). Technically, BP calculates the gradient of the loss function and then propagates it back through the network following the chain rule. However, when configuring the model structure, users do not need to define the backward part, so the framework needs a mechanism that completes the model's backward part automatically from the given forward part.
+
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. While building a model's backward part, the framework creates each forward `op`'s `grad_op` and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end of the model back to its beginning, in other words, from the loss to the parameters.
+
+## Challenges
+
+The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`, and a `Block` can itself be nested. This means that the `op`s and `variable`s are scattered across different blocks rather than gathered in a single graph. Our backward building algorithm therefore has to visit blocks recursively and insert `grad_op`s and newly created `variable`s into the right places.
+
+## Usage
+
+Although the whole algorithm is comprised of many functions, only one is exposed as API:
+
+```python
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+ """
+ Append backward part to main_program
+
+ Args:
+ loss(Variable): The variable generated by the cost function.
+ parameter_list(list): Parameters that need to be updated by optimizers.
+ If None, it means all parameters need to be updated.
+
+ no_grad_set(set): Variables that have no gradients in Block 0.
+ If None, the set will be generated inside the function and
+ contains all variables with `stop_gradient=True` from all blocks.
+
+ Return:
+ (list[Variable]): list of (parameter, gradient) pairs.
+ """
+```
+
+By invoking this API, the framework appends a backward part to the program that contains the `loss`. The API takes three arguments. `loss` is the final loss value; it must be a scalar, is usually the output of the loss layer, and is where the gradient is generated and backpropagation starts. `parameter_list` marks all parameters that need updating; if it is `None`, all parameters will be updated by the optimizers. `no_grad_set` marks variables without gradients; if all outputs of some `grad_op` are in `no_grad_set`, that `grad_op` will not be run.
+
+This API will be invoked automatically before optimizer building.
+As a result, in most cases, users do not need to invoke the API themselves to append the backward part.
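+
+A minimal usage sketch is shown below. It assumes a scalar loss `Variable` named `avg_loss` has already been built with the layers API, and that `append_backward` is importable from `paddle.v2.fluid.backward` (the exact module path may differ between versions):
+
+```python
+# Illustrative only: avg_loss is assumed to be a scalar loss Variable.
+from paddle.v2.fluid.backward import append_backward
+
+params_grads = append_backward(loss=avg_loss)
+# params_grads is a list of (parameter, gradient) pairs that an
+# optimizer can consume when building its update ops.
+```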
+
+## Implementation
+
+The implementation of the backward building algorithm is in the `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.
+
+### Creating `grad_op`s
+
+The creation of `grad_op`s is implemented by:
+
+```python
+def _append_backward_ops_(target,
+ block,
+ target_block,
+ no_grad_dict,
+ grad_to_var):
+ """
+ Create all grad ops, and insert them into given block
+
+ Args:
+ target(Variable): the target variable of forward pass
+ block(Block): the block where forward ops are
+ target_block(Block): the block which is going to hold new generated grad ops
+ no_grad_dict(dict):
+ key(int) block index
+ val(set) a set of variable names. These variables have no gradient
+ grad_to_var(dict)(output argument):
+ key(str): grad variable name
+ val(str): corresponding forward variable name
+ """
+```
+
+Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, and then appends it to `target_block`.
+
+However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, `grad_op` creation has to be recursive.
+
+During the reverse traversal, we check whether each `op` has an attribute named `sub_block`. If so, there is a sub-block and we need to deal with it first. After creating a new block whose parent is the one in the `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to the parameter `target_block` and the one in the `op`'s attribute to `block`. The *pseudo-code* below shows this process:
+
+```
+******* pseudo-code ********
+for op in reversed(block.ops):
+ if op has an attribute named 'sub_block':
+ Get the sub-block(`s_block`) from op's attribute.
+ Create a new block(`grad_s_block`), whose father is `s_block`.
+ Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
+
+ Invoke `core.get_grad_op_desc()` to get op's grad_op.
+ Insert the name correspondences between the grad_op's variables and their gradients into grad_to_var
+ Assign grad_s_block to grad_op as its 'sub_block' attribute.
+ Append grad_op to current target_block.
+```
+
+The first invocation of `_append_backward_ops_()` is initiated by `append_backward()`, in which the parameters `block` and `target_block` are both assigned the root block (the block with index 0).
+
+### Corner Cases of `grad_op` Creating
+
+The previous section shows the regular process of `grad_op` creation. However, in some corner cases the conventional algorithm is not enough to get the correct result, and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
+
+#### Shared Variables
+
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the backward pass. To make the gradient result the sum of all those `grad_op`s' outputs instead of just the last one that ran, we assign each output to a temporary variable and then add a `sum_op` to add them up.
+
+For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`, and so on.
+
+See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
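+
+The idea can be sketched roughly as follows (an illustration, not the actual `_addup_repetitive_outputs_` code; `rename_output()` and `sum_op()` are hypothetical helpers):
+
+```python
+from collections import defaultdict
+
+def addup_repetitive_outputs(grad_ops):
+    writers = defaultdict(list)  # gradient name -> grad_ops writing it
+    for op in grad_ops:
+        for name in op.output_arg_names():
+            writers[name].append(op)
+
+    extra_sum_ops = []
+    for name, ops in writers.items():
+        if len(ops) <= 1:
+            continue  # a single writer needs no renaming
+        temps = []
+        for i, op in enumerate(ops):
+            temp = "%s@RENAME@%d" % (name, i)  # e.g. w@GRAD@RENAME@0
+            rename_output(op, name, temp)      # redirect this op's output
+            temps.append(temp)
+        extra_sum_ops.append(sum_op(inputs=temps, output=name))
+    return grad_ops + extra_sum_ops
+```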
+
+#### No Gradient Variables
+
+In our framework, variables can be marked as *no_gradient*, which means that the gradient of the variable is unnecessary and can be considered zero in model training. Obviously, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
+
+Another situation is that all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are certainly zeros when all gradient inputs are zeros, so the `grad_op` can also be skipped.
+
+It should be noted that all these zero gradients still need to be created and initialized by something, otherwise the following `grad_op`s that take these gradients as inputs risk using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
+
+These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes those that can be skipped and inserts `fill_zeros_like_op` where necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
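+
+A deliberately simplified sketch of this pruning rule (not the real `_remove_no_grad_branch_` implementation; `fill_zeros_like_op()` here is a hypothetical helper that builds the zero-filling op):
+
+```python
+def prune_grad_ops(grad_op_descs, no_grad_set):
+    kept = []
+    for op in grad_op_descs:
+        outs = set(op.output_arg_names())
+        grad_ins = set(n for n in op.input_arg_names() if n.endswith("@GRAD"))
+        # Skip the grad_op if every output, or every gradient input, is unwanted.
+        if (outs and outs <= no_grad_set) or (grad_ins and grad_ins <= no_grad_set):
+            continue
+        # Gradient inputs in no_grad_set are still read by this op, so they
+        # must be materialized as zeros to avoid uninitialized memory.
+        for name in grad_ins & no_grad_set:
+            kept.append(fill_zeros_like_op(name))
+        kept.append(op)
+    return kept
+```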
+
+### Creating Backward Variables
+
+Up to now, we have completed all the creation and adjustment of `grad_op`s. However, the backward variables have not been created yet; they are only represented by the `grad_op`s' input and output arguments. The backward variable creation job is done by:
+
+```python
+def _append_backward_vars_(block,
+ start_op_idx,
+ grad_to_var,
+ grad_info_map):
+ """
+ Create new variables required by backward pass.
+
+ Args:
+ block(Block): the block where new variables will be created
+ start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+ grad_to_var(dict):
+ key(str): grad variable name
+ val(str): corresponding forward variable name
+ In most cases, this dict is generated by _append_backward_ops_()
+ grad_info_map(dict)(output argument):
+ key(str): forward variable name
+ val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+ """
+```
+
+Given a `block`, this function traverses all the `grad_op`s in it (the argument `start_op_idx` indicates where the grad_op sequence starts) and creates all the outputs that have not been created yet. The *pseudo-code* below shows this process:
+
+```
+for op in block.ops[start_op_idx : ]:
+
+ if op has an attribute named 'sub_block':
+ Get the sub-block(`s_block`) from op's attribute.
+ Invoke _append_backward_vars_(), with `block=s_block`
+
+ for var_name in op.all_output_names():
+ if block.has_var_recursive(var_name) or var_name is the name of empty variable:
+ continue
+ create a new variable named 'var_name' in block
+ if grad_to_var.has_key(var_name):
+ set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
+
+ do op's var type inference
+ do op's shape inference
+```
diff --git a/doc/design/block.md b/doc/design/block.md
index 4066122c0e8dfa33776796c3d205ba5aec9e0f52..fab7f2dc481ae51aa982164dc5048d90fcdc2b0b 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -291,10 +291,10 @@ public:
}
void Run(const framework::Scope& scope,
- const platform::DeviceContext& dev_ctx) const override {
+ const platform::Place& place) const override {
PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
for (auto& op : runtime_table_.ops()) {
- op->Run(scope, dev_ctx);
+ op->Run(scope, place);
}
}
diff --git a/paddle/framework/images/duplicate_op.graffle b/doc/design/images/duplicate_op.graffle
similarity index 100%
rename from paddle/framework/images/duplicate_op.graffle
rename to doc/design/images/duplicate_op.graffle
diff --git a/paddle/framework/images/duplicate_op.png b/doc/design/images/duplicate_op.png
similarity index 100%
rename from paddle/framework/images/duplicate_op.png
rename to doc/design/images/duplicate_op.png
diff --git a/paddle/framework/images/duplicate_op2.graffle b/doc/design/images/duplicate_op2.graffle
similarity index 100%
rename from paddle/framework/images/duplicate_op2.graffle
rename to doc/design/images/duplicate_op2.graffle
diff --git a/paddle/framework/images/duplicate_op2.png b/doc/design/images/duplicate_op2.png
similarity index 100%
rename from paddle/framework/images/duplicate_op2.png
rename to doc/design/images/duplicate_op2.png
diff --git a/doc/design/images/profiler.png b/doc/design/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/design/images/profiler.png differ
diff --git a/doc/design/kernel_hint_design.md b/doc/design/kernel_hint_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..a54b7da045e1a362626ef066f9ebb56af2c3181a
--- /dev/null
+++ b/doc/design/kernel_hint_design.md
@@ -0,0 +1,57 @@
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to express this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+ Place place_;
+ DataType data_type_;
+ LayoutType layout_;
+};
+```
+`place_`, `data_type_` and `layout_` can be obtained from the input tensors of the operator; `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot configure it directly.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that users can overload to choose the KernelType they want to use.
+
+So we should send the user-defined information in the proto to `GetExpectedKernelType` so it can choose a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing: let users add the information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This works, but there is a small problem: users may define many different hints for the same purpose, such as `force_cpu`, `use_cpu` and `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn` and `cudnn_kernel` to choose a CUDNN kernel.
+
+2. Pre-define all the needed options and use a single attribute key such as `kernel_hint` for the user. This is not very flexible if the user wants to define some new kind of hint.
+
+### Final choice
+To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn` and `use_mkldnn`, for users to choose from.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+ if (Attr
+
+After conversion:
+
+
+
+## Implement
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+ which will be executed with multiple threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and block in `Wait`
+ until the atomic counter becomes `0`:
+ ```cpp
+ BlockingCounter bc(thread_count);
+ for (int i = 0; i < thread_count; ++i) {
+ thread_pool->Start([&bc] {bc.DecrementCount(); })
+ }
+ bc.Wait();
+ ```
+- `ParallelDo` Operator
+ - Initialize a thread pool which is a Singleton.
+ - Use a block id as the input, and create and run the specified Block in independent scopes
+ with multiple threads.
+ - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the Input Tensor into a TensorArray.
+- `Merge` merges all the gradients calculated in different threads
+ with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W` (see the sketch below).
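+
+A sketch of the merge step only, assuming the per-thread gradients are plain NumPy arrays (the real `Merge` operator works on framework tensors):
+
+```python
+import numpy as np
+
+def merge_gradients(per_thread_grads, method="mean"):
+    stacked = np.stack(per_thread_grads)  # shape: [num_threads, ...]
+    if method == "mean":
+        return stacked.mean(axis=0)
+    if method == "sum":
+        return stacked.sum(axis=0)
+    raise ValueError("unsupported merge method: %s" % method)
+```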
+
+## TODO
+
+- Improve the optimizer stage with multiple threads, since we could
+ assign the parameters to different threads and execute the
+ optimizer with multiple threads.
diff --git a/doc/design/refactor/src/multi-threads.graffle b/doc/design/refactor/src/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/design/refactor/src/multi-threads.graffle differ
diff --git a/doc/design/refactor/src/multi-threads/multi-threads@3x.png b/doc/design/refactor/src/multi-threads/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/design/refactor/src/multi-threads/multi-threads@3x.png differ
diff --git a/doc/design/refactor/src/multi-threads/single-thread@3x.png b/doc/design/refactor/src/multi-threads/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/design/refactor/src/multi-threads/single-thread@3x.png differ
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
index fd23dc211a35fdc9d87bc9233fcf4e90254da748..f54b2b3694cc2a8f1d892792fd4d39a0484dc750 100644
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@@ -25,13 +25,14 @@ There are mainly three parts that we have to consider while integrating a new de
### Place and DeviceContext
+Please note that device and computing library do not correspond one-to-one. A device can have many computing libraries, and a computing library can also support several devices.
#### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
```
- | CPUPlace --> MKLDNNPlace
-Place --| CUDAPlace --> CUDNNPlace
+ | CPUPlace
+Place --| CUDAPlace
| FPGAPlace
```
@@ -43,7 +44,7 @@ typedef boost::variant
-
-    Figure 1. Sharing variables in operators.
-
-    Figure 2. Replace sharing variable's gradient with `Add` operator.
-