merge release/1.3

b71823df · sneaxiy · 0d168cf6 · 4550862e · b71823df · 0d168cf6
79 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 if(WIN32)
+    set(CMAKE_SUPPRESS_REGENERATION ON)
    set(CMAKE_STATIC_LIBRARY_PREFIX lib)
    add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
    set(CMAKE_C_FLAGS_DEBUG   "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
    set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
    set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
    set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
+    set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
+    set(CMAKE_STATIC_LINKER_FLAGS  "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
 endif(WIN32)

 find_package(CUDA QUIET)

--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
-# Benchmark
-
-Machine:
-
- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
- Laptop: TBD
-
-System: CentOS release 6.3 (Final), Docker 1.12.1.
-
-PaddlePaddle:
- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
-  - MKL-DNN tag v0.11
-  - MKLML 2018.0.1.20171007
- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
-  - OpenBLAS v0.2.20
-	 
-On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
-
-## Benchmark Model
-
-### Server
-
-#### Training
-Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
-Pay attention that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contains forward and backward. The updating time of parameter would become very heavy when the weight size is large, especially on alexnet.
-
-Input image size - 3 * 224 * 224, Time: images/second
-
- VGG-19
-
-| BatchSize    | 64    | 128  | 256     |
-|--------------|-------| -----| --------|
-| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
-| MKLML        | 12.12 | 13.70 | 16.18  |
-| MKL-DNN      | 28.46 | 29.83 | 30.44  |
-
-<img src="figs/vgg-cpu-train.png" width="500">
-
- - ResNet-50
-
-| BatchSize    | 64    | 128   | 256    |
-|--------------|-------| ------| -------|
-| OpenBLAS     | 25.22 | 25.68 | 27.12  | 
-| MKLML        | 32.52 | 31.89 | 33.12  |
-| MKL-DNN      | 81.69 | 82.35 | 84.08  |
-
-<img src="figs/resnet-cpu-train.png" width="500">
-
- - GoogLeNet
-
-| BatchSize    | 64    | 128   | 256    |
-|--------------|-------| ------| -------|
-| OpenBLAS     | 89.52 | 96.97 | 108.25 | 
-| MKLML        | 128.46| 137.89| 158.63 |
-| MKL-DNN      | 250.46| 264.83| 269.50 |
-
-<img src="figs/googlenet-cpu-train.png" width="500">
-
- AlexNet
-
-| BatchSize    | 64     | 128    | 256    |
-|--------------|--------| ------ | -------|
-| OpenBLAS     | 45.62  | 72.79  | 107.22 | 
-| MKLML        | 66.37  | 105.60 | 144.04 |
-| MKL-DNN      | 399.00 | 498.94 | 626.53 | 
-
-<img src="figs/alexnet-cpu-train.png" width="500">
-
-#### Inference
-Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
- VGG-19
-
-| BatchSize | 1     | 2     | 4     | 8     | 16    |
-|-----------|-------|-------|-------|-------|-------|
-| OpenBLAS  | 1.10  | 1.96  | 3.62  | 3.63  | 2.25  |
-| MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
-| MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
-
-<img src="figs/vgg-cpu-infer.png" width="500">
-
- ResNet-50
-
-| BatchSize | 1     | 2      | 4      | 8      | 16     |
-|-----------|-------|--------|--------|--------|--------|
-| OpenBLAS  | 3.31  | 6.72   | 11.59  | 13.17  | 9.27   |
-| MKLML     | 6.33  | 12.02  | 22.88  | 40.53  | 63.09  |
-| MKL-DNN   | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
-
-<img src="figs/resnet-cpu-infer.png" width="500">
-
- GoogLeNet
-
-| BatchSize | 1      | 2      | 4      | 8      | 16     |
-|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 12.06  | 23.56  | 34.48  | 36.45  | 23.12  |
-| MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
-| MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
-
-<img src="figs/googlenet-cpu-infer.png" width="500">
-
- AlexNet
-
-| BatchSize | 1      | 2      | 4      | 8      | 16     |
-|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 3.53   | 6.23   | 15.04  | 26.06  | 31.62  |
-| MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
-| MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
-
-<img src="figs/alexnet-cpu-infer.png" width="500">
-
-### Laptop
-TBD
--- a/benchmark/README.md
+++ b/benchmark/README.md
-# Benchmark
-
-Machine: 
-
- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
- GPU: Tesla K40m
- cuDNN: v5.1
- system: Docker 1.12.1, all platforms are tested in docker environment.
-
-Platforms: 
-
- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0 
- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu 
- Caffe: kaixhin/cuda-caffe
-
-Several convolutional neural networks and recurrent neural networks are used to test.
-
-## Image
-
-### Benchmark Model
-
-AlexNet, GoogleNet and a small network used in Caffe.
-
- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
-
- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
-
- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
-
-
-### Single-GPU
-
- AlexNet:  input - 3 * 227 * 227,  Time: ms/batch
-
-| BatchSize    | 64  | 128  | 256   | 512  |
-|--------------|-----| -----| ------| -----|
-| PaddlePaddle | 195 | 334  | 602   | 1629 |
-| TensorFlow   | 223 | 364  | 645   | 1235 |
-| Caffe        | 324 | 627  | 1232  | 2513 |
- 
-**Notation**
-
-All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller than in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
- 
- GoogleNet:  input - 3 * 224 * 224, Time: ms/batch
-
-
-| BatchSize    | 64    |   128  | 256     |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 613   | 1149   | 2348    |
-| TensorFlow   | 644   | 1176   | 2219    |
-| Caffe        | 694   | 1364   | out of memory   |
-
- SmallNet: input - 3 * 32 * 32, Time: ms/batch
-
-| BatchSize    | 64     |   128    | 256     | 512     |
-|--------------|--------| -------- | --------|---------|
-| PaddlePaddle | 10.463 | 18.184   | 33.113  |  63.039 |
-| TensorFlow   | 9     | 15       | 28      | 59       |
-| Caffe        | 9.373  | 16.6606  | 31.4797 | 59.719  |
-
-**Notation**
-
-All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
-
-In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
-
-### Multi-GPU: 4 GPUs
-
- AlexNet,  ms / batch
-
-| total-BatchSize | 128 * 4  | 256 * 4    |
-|------------------|----------| -----------|
-| PaddlePaddle     | 347      | 622        |
-| TensorFlow       | 377      | 675        |
-| Caffe            | 1229     | 2435       |
-
-For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by 
-
-```
-  time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512 
-= (334 * 4)/347 
-= 3.85
-``` 
-
-<img src="figs/alexnet-4gpu.png" width="420">
-
-
- GoogleNet, ms / batch
-
-| total-BatchSize  | 128 * 4      |  256 * 4    |
-|-------------------|--------------| ----------- |
-| PaddlePaddle      | 1178         | 2367        |
-| TensorFlow        | 1210         | 2292        |
-| Caffe             | 2007         | out of memory  |
-
-<img src="figs/googlenet-4gpu.png" width="420">
-
-
-## RNN
-We use lstm network for text classification to test benchmark.
-
-### Dataset
-  [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
- Dictionary size=30000 
- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
-
-### Single-GPU
-
-#### LSTM in Text Classification
-
-Testing `2 lstm layer + fc` network with different hidden size and batch size.
-  
- Batch size = 64, ms / batch
- 
-| hidden_size  | 256   | 512    |  1280   |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 83    | 184    | 641     |
-| TensorFlow   | 175   | 280    | 818     |
-
- Batch size = 128, ms / batch
- 
-| hidden_size  | 256    | 512    |  1280   |
-|--------------|------- | -------| --------|
-| PaddlePaddle | 110    | 261    | 1007    |
-| TensorFlow   | 181    | 361    | 1237    |
-
-
- Batch size = 256, ms / batch
- 
-| hidden_size  | 256   | 512    |  1280   |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 170   | 414    | 1655    |
-| TensorFlow   | 238   | 536    | 1905    |
-
-<img src="figs/rnn_lstm_cls.png" width="600">
-
-#### Seq2Seq
-
-The benchmark of sequence-to-sequence network will be added later.
- 
-
-### Multi GPU: 4 GPUs
-
-#### LSTM in Text Classification
-
- hidden_size = 256, ms / batch
- 
-| batch_size   | 256    |  512    |
-|--------------| -------| --------|
-| PaddlePaddle | 90     | 118     |
-| TensorFlow   | 226    | 118     |
-
-
- hidden_size = 512, ms / batch
- 
-| batch_size   | 256    |  512    |
-|--------------| -------| --------|
-| PaddlePaddle | 189    | 268     |
-| TensorFlow   | 297    | 383     |
-
-
-<img src="figs/rnn_lstm_4gpus.png" width="420">
-
-#### Seq2Seq
-
-The benchmark of sequence-to-sequence network will be added later.
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -15,9 +15,6 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle

-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
 RUN pip uninstall -y paddlepaddle && mkdir /workspace

 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin

--- a/benchmark/paddle/image/check_env.sh
+++ b/benchmark/paddle/image/check_env.sh
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-height = 227
-width = 227
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 128)
-gp = get_config_arg('layer_num', int, 1)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-# conv1
-net = data_layer('data', size=height * width * 3)
-net = img_conv_layer(
-    input=net,
-    filter_size=11,
-    num_channels=3,
-    num_filters=96,
-    stride=4,
-    padding=1)
-net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-# conv2
-net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
-net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-# conv3
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
-# conv4
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
-
-# conv5
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-net = fc_layer(
-    input=net,
-    size=4096,
-    act=ReluActivation(),
-    layer_attr=ExtraAttr(drop_rate=0.5))
-net = fc_layer(
-    input=net,
-    size=4096,
-    act=ReluActivation(),
-    layer_attr=ExtraAttr(drop_rate=0.5))
-net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-
-if is_infer:
-    outputs(net)
-else:
-    lab = data_layer('label', num_class)
-    loss = cross_entropy(input=net, label=lab)
-    outputs(loss)
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 128)
-use_gpu = get_config_arg('use_gpu', bool, True)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-conv_projection = conv_projection if use_gpu else img_conv_layer
-
-def inception2(name, input, channels, \
-    filter1,
-    filter3R, filter3,
-    filter5R, filter5,
-    proj):
-
-    conv1 = name + '_1'
-    conv3r = name + '_3r'
-    conv3 = name + '_3'
-    conv5r = name + '_5r'
-    conv5 = name + '_5'
-    maxpool = name + '_max'
-    convproj = name + '_proj'
-
-    cov1 = img_conv_layer(
-        name=conv1,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter1,
-        stride=1,
-        padding=0)
-
-    cov3r = img_conv_layer(
-        name=conv3r,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter3R,
-        stride=1,
-        padding=0)
-    cov3 = img_conv_layer(
-        name=conv3,
-        input=cov3r,
-        filter_size=3,
-        num_filters=filter3,
-        stride=1,
-        padding=1)
-
-    cov5r = img_conv_layer(
-        name=conv5r,
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter5R,
-        stride=1,
-        padding=0)
-    cov5 = img_conv_layer(
-        name=conv5,
-        input=cov5r,
-        filter_size=5,
-        num_filters=filter5,
-        stride=1,
-        padding=2)
-
-    pool1 = img_pool_layer(
-        name=maxpool,
-        input=input,
-        pool_size=3,
-        num_channels=channels,
-        stride=1,
-        padding=1)
-    covprj = img_conv_layer(
-        name=convproj,
-        input=pool1,
-        filter_size=1,
-        num_filters=proj,
-        stride=1,
-        padding=0)
-
-    cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
-    return cat
-
-def inception(name, input, channels, \
-    filter1,
-    filter3R, filter3,
-    filter5R, filter5,
-    proj):
-
-    cov1 = conv_projection(
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter1,
-        stride=1,
-        padding=0)
-
-    cov3r = img_conv_layer(
-        name=name + '_3r',
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter3R,
-        stride=1,
-        padding=0)
-    cov3 = conv_projection(
-        input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
-
-    cov5r = img_conv_layer(
-        name=name + '_5r',
-        input=input,
-        filter_size=1,
-        num_channels=channels,
-        num_filters=filter5R,
-        stride=1,
-        padding=0)
-    cov5 = conv_projection(
-        input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
-
-    pool1 = img_pool_layer(
-        name=name + '_max',
-        input=input,
-        pool_size=3,
-        num_channels=channels,
-        stride=1,
-        padding=1)
-    covprj = conv_projection(
-        input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
-
-    cat = concat_layer(
-        name=name,
-        input=[cov1, cov3, cov5, covprj],
-        bias_attr=True if use_gpu else False,
-        act=ReluActivation())
-    return cat
-
-
-data = data_layer(name="input", size=3 * height * width)
-
-# stage 1
-conv1 = img_conv_layer(
-    name="conv1",
-    input=data,
-    filter_size=7,
-    num_channels=3,
-    num_filters=64,
-    stride=2,
-    padding=3)
-pool1 = img_pool_layer(
-    name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
-
-# stage 2
-conv2_1 = img_conv_layer(
-    name="conv2_1",
-    input=pool1,
-    filter_size=1,
-    num_filters=64,
-    stride=1,
-    padding=0)
-conv2_2 = img_conv_layer(
-    name="conv2_2",
-    input=conv2_1,
-    filter_size=3,
-    num_filters=192,
-    stride=1,
-    padding=1)
-pool2 = img_pool_layer(
-    name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
-
-# stage 3
-ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
-ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
-pool3 = img_pool_layer(
-    name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
-
-# stage 4
-ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
-ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
-ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
-ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
-ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
-pool4 = img_pool_layer(
-    name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
-
-# stage 5
-ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
-ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
-pool5 = img_pool_layer(
-    name="pool5",
-    input=ince5b,
-    num_channels=1024,
-    pool_size=7,
-    stride=7,
-    pool_type=AvgPooling())
-
-# We remove loss1 and loss2 for all system when testing benchmark
-# output 1
-# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
-# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
-# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
-# out1 = fc_layer(name="output1", input=fc_o1,  size=1000, act=SoftmaxActivation())
-# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3) 
-
-# output 2
-#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
-#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
-#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
-#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
-#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3) 
-
-# output 3
-dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
-out3 = fc_layer(
-    name="output3", input=dropout, size=1000, act=SoftmaxActivation())
-
-if is_infer:
-    outputs(out3)
-else:
-    lab = data_layer(name="label", size=num_class)
-    loss3 = cross_entropy(name='loss3', input=out3, label=lab)
-    outputs(loss3)
--- a/benchmark/paddle/image/plotlog.py
+++ b/benchmark/paddle/image/plotlog.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import argparse
-import matplotlib.pyplot as plt
-
-
-def parse_args():
-    parser = argparse.ArgumentParser('Parse Log')
-    parser.add_argument(
-        '--file_path', '-f', type=str, help='the path of the log file')
-    parser.add_argument(
-        '--sample_rate',
-        '-s',
-        type=float,
-        default=1.0,
-        help='the rate to take samples from log')
-    parser.add_argument(
-        '--log_period', '-p', type=int, default=1, help='the period of log')
-
-    args = parser.parse_args()
-    return args
-
-
-def parse_file(file_name):
-    loss = []
-    error = []
-    with open(file_name) as f:
-        for i, line in enumerate(f):
-            line = line.strip()
-            if not line.startswith('pass'):
-                continue
-            line_split = line.split(' ')
-            if len(line_split) != 5:
-                continue
-
-            loss_str = line_split[2][:-1]
-            cur_loss = float(loss_str.split('=')[-1])
-            loss.append(cur_loss)
-
-            err_str = line_split[3][:-1]
-            cur_err = float(err_str.split('=')[-1])
-            error.append(cur_err)
-
-    accuracy = [1.0 - err for err in error]
-
-    return loss, accuracy
-
-
-def sample(metric, sample_rate):
-    interval = int(1.0 / sample_rate)
-    if interval > len(metric):
-        return metric[:1]
-
-    num = len(metric) / interval
-    idx = [interval * i for i in range(num)]
-    metric_sample = [metric[id] for id in idx]
-    return metric_sample
-
-
-def plot_metric(metric,
-                batch_id,
-                graph_title,
-                line_style='b-',
-                line_label='y',
-                line_num=1):
-    plt.figure()
-    plt.title(graph_title)
-    if line_num == 1:
-        plt.plot(batch_id, metric, line_style, label=line_label)
-    else:
-        for i in range(line_num):
-            plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
-    plt.xlabel('batch')
-    plt.ylabel(graph_title)
-    plt.legend()
-    plt.savefig(graph_title + '.jpg')
-    plt.close()
-
-
-def main():
-    args = parse_args()
-    assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should in the range (0, 1]."
-
-    loss, accuracy = parse_file(args.file_path)
-    batch = [args.log_period * i for i in range(len(loss))]
-
-    batch_sample = sample(batch, args.sample_rate)
-    loss_sample = sample(loss, args.sample_rate)
-    accuracy_sample = sample(accuracy, args.sample_rate)
-
-    plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
-    plot_metric(
-        accuracy_sample,
-        batch_sample,
-        'accuracy',
-        line_style='g-',
-        line_label='accuracy')
-
-
-if __name__ == '__main__':
-    main()
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io, os
-import random
-import numpy as np
-from paddle.trainer.PyDataProvider2 import *
-
-
-def initHook(settings, height, width, color, num_class, **kwargs):
-    settings.height = height
-    settings.width = width
-    settings.color = color
-    settings.num_class = num_class
-    if settings.color:
-        settings.data_size = settings.height * settings.width * 3
-    else:
-        settings.data_size = settings.height * settings.width
-    settings.is_infer = kwargs.get('is_infer', False)
-    settings.num_samples = kwargs.get('num_samples', 2560)
-    if settings.is_infer:
-        settings.slots = [dense_vector(settings.data_size)]
-    else:
-        settings.slots = [dense_vector(settings.data_size), integer_value(1)]
-
-
-@provider(
-    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_list):
-    for i in xrange(settings.num_samples):
-        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        if settings.is_infer:
-            yield img.astype('float32')
-        else:
-            lab = random.randint(0, settings.num_class - 1)
-            yield img.astype('float32'), int(lab)
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 64)
-layer_num = get_config_arg("layer_num", int, 50)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-
-#######################Network Configuration #############
-def conv_bn_layer(name,
-                  input,
-                  filter_size,
-                  num_filters,
-                  stride,
-                  padding,
-                  channels=None,
-                  active_type=ReluActivation()):
-    """
-    A wrapper for conv layer with batch normalization layers.
-    Note:
-    conv layer has no activation.
-    """
-
-    tmp = img_conv_layer(
-        name=name + "_conv",
-        input=input,
-        filter_size=filter_size,
-        num_channels=channels,
-        num_filters=num_filters,
-        stride=stride,
-        padding=padding,
-        act=LinearActivation(),
-        bias_attr=False)
-    return batch_norm_layer(
-        name=name + "_bn",
-        input=tmp,
-        act=active_type,
-        use_global_stats=is_infer)
-
-
-def bottleneck_block(name, input, num_filters1, num_filters2):
-    """
-    A wrapper for bottlenect building block in ResNet.
-    Last conv_bn_layer has no activation.
-    Addto layer has activation of relu.
-    """
-    last_name = conv_bn_layer(
-        name=name + '_branch2a',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters1,
-        stride=1,
-        padding=0)
-    last_name = conv_bn_layer(
-        name=name + '_branch2b',
-        input=last_name,
-        filter_size=3,
-        num_filters=num_filters1,
-        stride=1,
-        padding=1)
-    last_name = conv_bn_layer(
-        name=name + '_branch2c',
-        input=last_name,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=1,
-        padding=0,
-        active_type=LinearActivation())
-
-    return addto_layer(
-        name=name + "_addto", input=[input, last_name], act=ReluActivation())
-
-
-def mid_projection(name, input, num_filters1, num_filters2, stride=2):
-    """
-    A wrapper for middile projection in ResNet.
-    projection shortcuts are used for increasing dimensions,
-    and other shortcuts are identity
-    branch1: projection shortcuts are used for increasing
-    dimensions, has no activation.
-    branch2x: bottleneck building block, shortcuts are identity.
-    """
-    # stride = 2
-    branch1 = conv_bn_layer(
-        name=name + '_branch1',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=stride,
-        padding=0,
-        active_type=LinearActivation())
-
-    last_name = conv_bn_layer(
-        name=name + '_branch2a',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters1,
-        stride=stride,
-        padding=0)
-    last_name = conv_bn_layer(
-        name=name + '_branch2b',
-        input=last_name,
-        filter_size=3,
-        num_filters=num_filters1,
-        stride=1,
-        padding=1)
-
-    last_name = conv_bn_layer(
-        name=name + '_branch2c',
-        input=last_name,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=1,
-        padding=0,
-        active_type=LinearActivation())
-
-    return addto_layer(
-        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
-
-
-img = data_layer(name='image', size=height * width * 3)
-
-
-def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
-    """
-    A wrapper for 50,101,152 layers of ResNet.
-    res2_num: number of blocks stacked in conv2_x
-    res3_num: number of blocks stacked in conv3_x
-    res4_num: number of blocks stacked in conv4_x
-    res5_num: number of blocks stacked in conv5_x
-    """
-    # For ImageNet
-    # conv1: 112x112
-    tmp = conv_bn_layer(
-        "conv1",
-        input=img,
-        filter_size=7,
-        channels=3,
-        num_filters=64,
-        stride=2,
-        padding=3)
-    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
-
-    # conv2_x: 56x56
-    tmp = mid_projection(
-        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
-    for i in xrange(2, res2_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
-
-    # conv3_x: 28x28
-    tmp = mid_projection(
-        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
-    for i in xrange(2, res3_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res3_" + str(i),
-            input=tmp,
-            num_filters1=128,
-            num_filters2=512)
-
-    # conv4_x: 14x14
-    tmp = mid_projection(
-        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
-    for i in xrange(2, res4_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res4_" + str(i),
-            input=tmp,
-            num_filters1=256,
-            num_filters2=1024)
-
-    # conv5_x: 7x7
-    tmp = mid_projection(
-        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
-    for i in xrange(2, res5_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res5_" + str(i),
-            input=tmp,
-            num_filters1=512,
-            num_filters2=2048)
-
-    tmp = img_pool_layer(
-        name='avgpool',
-        input=tmp,
-        pool_size=7,
-        stride=1,
-        pool_type=AvgPooling())
-
-    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
-
-
-if layer_num == 50:
-    resnet = deep_res_net(3, 4, 6, 3)
-elif layer_num == 101:
-    resnet = deep_res_net(3, 4, 23, 3)
-elif layer_num == 152:
-    resnet = deep_res_net(3, 8, 36, 3)
-else:
-    print("Wrong layer number.")
-
-if is_infer:
-    outputs(resnet)
-else:
-    lbl = data_layer(name="label", size=num_class)
-    loss = cross_entropy(name='loss', input=resnet, label=lbl)
-    outputs(loss)
--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
-#!/bin/bash
-
-set -e
-
-function train() {
-  cfg=$1
-  thread=$2
-  bz=$3
-  args="batch_size=$3"
-  prefix=$4
-  paddle train --job=time \
-    --config=$cfg \
-    --use_gpu=True \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --config_args=$args \
-    > logs/$prefix-${thread}gpu-$bz.log 2>&1 
-}
-
-if [ ! -d "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-#========single-gpu=========#
-# alexnet
-train alexnet.py 1 64 alexnet
-train alexnet.py 1 128 alexnet
-train alexnet.py 1 256 alexnet
-train alexnet.py 1 512 alexnet
-
-# googlenet
-train googlenet.py 1 64 googlenet
-train googlenet.py 1 128 googlenet
-train googlenet.py 1 256 googlenet
-
-# smallnet
-train smallnet_mnist_cifar.py 1 64 smallnet
-train smallnet_mnist_cifar.py 1 128 smallnet
-train smallnet_mnist_cifar.py 1 256 smallnet
-train smallnet_mnist_cifar.py 1 512 smallnet
-
-
-############################
-#========multi-gpus=========#
-train alexnet.py 4 512 alexnet
-train alexnet.py 4 1024 alexnet
-
-train googlenet.py 4 512 googlenet 
-train googlenet.py 4 1024 googlenet
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
-#!/bin/bash
-
-set -e
-
-function clock_to_seconds() {
-  hours=`echo $1 | awk -F ':' '{print $1}'`
-  mins=`echo $1 | awk -F ':' '{print $2}'`
-  secs=`echo $1 | awk -F ':' '{print $3}'`
-  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
-}
-
-function infer() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
-  topology=$1
-  layer_num=$2
-  bs=$3
-  use_mkldnn=$4
-  if [ $4 == "True" ]; then
-    thread=1
-    log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
-  elif [ $4 == "False" ]; then
-    thread=`nproc`
-    if [ $thread -gt $bs ]; then
-      thread=$bs
-    fi
-    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
-  else
-    echo "Wrong input $4, use True or False."
-    exit 0
-  fi
-
-  models_in="models/${topology}-${layer_num}/pass-00000/"
-  if [ ! -d $models_in ]; then
-    echo "Training model ${topology}_${layer_num}"
-    paddle train --job=train \
-      --config="${topology}.py" \
-      --use_mkldnn=True \
-      --use_gpu=False \
-      --trainer_count=1 \
-      --num_passes=1 \
-      --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
-      > /dev/null 2>&1
-    echo "Done"
-  fi
-  log_period=$((256 / bs))
-  paddle train --job=test \
-    --config="${topology}.py" \
-    --use_mkldnn=$use_mkldnn \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
-    --init_model_path=$models_in \
-    2>&1 | tee ${log}
-
-  # calculate the last 5 logs period time of 1280 samples,
-  # the time before are burning time.
-  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  start_sec=`clock_to_seconds $start`
-  end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -f "test.list" ]; then
-  echo " " > test.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-if [ ! -d "models" ]; then
-  mkdir -p models
-fi
-
-# inference benchmark
-for use_mkldnn in True False; do
-  for batchsize in 1 2 4 8 16; do
-    infer vgg 19 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer alexnet 2 $batchsize $use_mkldnn
-  done
-done
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
-#!/bin/bash
-
-set -e
-
-function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
-  topology=$1
-  layer_num=$2
-  bs=$3
-  use_mkldnn=$4
-  if [ $4 == "True" ]; then
-    thread=1
-    log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
-  elif [ $4 == "False" ]; then
-    thread=`nproc`
-    # each trainer_count use only 1 core to avoid conflict
-    log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
-  else
-    echo "Wrong input $4, use True or False."
-    exit 0
-  fi
-  args="batch_size=${bs},layer_num=${layer_num}"
-  config="${topology}.py"
-  paddle train --job=time \
-    --config=$config \
-    --use_mkldnn=$use_mkldnn \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --config_args=$args \
-    2>&1 | tee ${log} 
-
-  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
-  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# training benchmark
-for use_mkldnn in True False; do
-  for batchsize in 64 128 256; do
-    train vgg 19 $batchsize $use_mkldnn
-    train resnet 50 $batchsize $use_mkldnn
-    train googlenet v1 $batchsize $use_mkldnn
-    train alexnet 2 $batchsize $use_mkldnn
-  done
-done
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
-#!/bin/bash
-
-set -e
-
-function clock_to_seconds() {
-  hours=`echo $1 | awk -F ':' '{print $1}'`
-  mins=`echo $1 | awk -F ':' '{print $2}'`
-  secs=`echo $1 | awk -F ':' '{print $3}'`
-  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
-}
-
-function infer() {
-  export OPENBLAS_MAIN_FREE=1
-  topology=$1
-  layer_num=$2
-  bs=$3
-  trainers=`nproc`
-  if [ $trainers -gt $bs ]; then
-    trainers=$bs
-  fi
-  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
-  threads=$((`nproc` / trainers))
-  if [ $threads -eq 0 ]; then
-    threads=1
-  fi
-  export OPENBLAS_NUM_THREADS=$threads
-
-  models_in="models/${topology}-${layer_num}/pass-00000/"
-  if [ ! -d $models_in ]; then
-    echo "./run_mkl_infer.sh to save the model first"
-    exit 0
-  fi
-  log_period=$((32 / bs))
-  paddle train --job=test \
-    --config="${topology}.py" \
-    --use_mkldnn=False \
-    --use_gpu=False \
-    --trainer_count=$trainers \
-    --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
-    --init_model_path=$models_in \
-    2>&1 | tee ${log}
-
-  # calculate the last 5 logs period time of 160(=32*5) samples,
-  # the time before are burning time.
-  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
-  start_sec=`clock_to_seconds $start`
-  end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -f "test.list" ]; then
-  echo " " > test.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# inference benchmark
-for batchsize in 1 2 4 8 16; do
-  infer vgg 19 $batchsize
-  infer resnet 50 $batchsize 
-  infer googlenet v1 $batchsize
-  infer alexnet 2 $batchsize
-done
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
-#!/bin/bash
-
-set -e
-
-function train() {
-  export OPENBLAS_NUM_THREADS=1
-  topology=$1
-  layer_num=$2
-  bs=$3
-  thread=`nproc`
-  # each trainer_count use only 1 core to avoid conflict
-  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
-  args="batch_size=${bs},layer_num=${layer_num}"
-  config="${topology}.py"
-  paddle train --job=time \
-    --config=$config \
-    --use_mkldnn=False \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=3 \
-    --test_period=30 \
-    --config_args=$args \
-    2>&1 | tee ${log} 
-
-  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
-  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
-  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-# training benchmark
-for batchsize in 64 128 256; do
-  train vgg 19 $batchsize
-  train resnet 50 $batchsize
-  train googlenet v1 $batchsize
-  train alexnet 2 $batchsize
-done
--- a/benchmark/paddle/image/smallnet_mnist_cifar.py
+++ b/benchmark/paddle/image/smallnet_mnist_cifar.py
-#!/usr/bin/env python
-
-from paddle.trainer_config_helpers import *
-
-height = 32
-width = 32
-num_class = 10
-
-batch_size = get_config_arg('batch_size', int, 128)
-
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
-define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-# conv1
-net = data_layer('data', size=height * width * 3)
-net = img_conv_layer(
-    input=net,
-    filter_size=5,
-    num_channels=3,
-    num_filters=32,
-    stride=1,
-    padding=2)
-net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
-
-# conv2
-net = img_conv_layer(
-    input=net, filter_size=5, num_filters=32, stride=1, padding=2)
-net = img_pool_layer(
-    input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
-
-# conv3
-net = img_conv_layer(
-    input=net, filter_size=3, num_filters=64, stride=1, padding=1)
-net = img_pool_layer(
-    input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
-
-net = fc_layer(input=net, size=64, act=ReluActivation())
-net = fc_layer(input=net, size=10, act=SoftmaxActivation())
-
-lab = data_layer('label', num_class)
-loss = classification_cost(input=net, label=lab)
-outputs(loss)
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 64)
-layer_num = get_config_arg('layer_num', int, 19)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
-    'height': height,
-    'width': width,
-    'color': True,
-    'num_class': num_class,
-    'is_infer': is_infer,
-    'num_samples': num_samples
-}
-define_py_data_sources2(
-    "train.list" if not is_infer else None,
-    "test.list" if is_infer else None,
-    module="provider",
-    obj="process",
-    args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=0.001 / batch_size,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * batch_size))
-
-img = data_layer(name='image', size=height * width * 3)
-
-
-def vgg_network(vgg_num=3):
-    tmp = img_conv_group(
-        input=img,
-        num_channels=3,
-        conv_padding=1,
-        conv_num_filter=[64, 64],
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_size=2,
-        pool_stride=2,
-        pool_type=MaxPooling())
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[128, 128],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    channels = []
-    for i in range(vgg_num):
-        channels.append(256)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-    channels = []
-    for i in range(vgg_num):
-        channels.append(512)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=channels,
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
-
-
-if layer_num == 16:
-    vgg = vgg_network(3)
-elif layer_num == 19:
-    vgg = vgg_network(4)
-else:
-    print("Wrong layer number.")
-
-if is_infer:
-    outputs(vgg)
-else:
-    lab = data_layer('label', num_class)
-    loss = cross_entropy(input=vgg, label=lab)
-    outputs(loss)
--- a/benchmark/paddle/rnn/imdb.py
+++ b/benchmark/paddle/rnn/imdb.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import six.moves.cPickle as pickle
-import gzip
-import os
-import numpy
-
-
-def get_dataset_file(dataset, default_dataset, origin):
-    data_dir, data_file = os.path.split(dataset)
-    if (not os.path.isfile(dataset)) and data_file == default_dataset:
-        from six.moves import urllib
-        print('Downloading data from %s' % origin)
-        urllib.request.urlretrieve(origin, dataset)
-
-    return dataset
-
-
-def create_data(path="imdb.pkl"):
-
-    if (not os.path.isfile('imdb.train.pkl')):
-        path = get_dataset_file(
-            path, "imdb.pkl",
-            "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
-
-        if path.endswith(".gz"):
-            f = gzip.open(path, 'rb')
-        else:
-            f = open(path, 'rb')
-
-        train_set = pickle.load(f)
-        test_set = pickle.load(f)
-        f.close()
-
-        pickle.dump(train_set, open('imdb.train.pkl', 'wb'))
-        pickle.dump(test_set, open('imdb.test.pkl', 'wb'))
-
-    if (not os.path.isfile('train.list')):
-        file('train.list', 'w').write('imdb.train.pkl\n')
-
-
-def main():
-    create_data('imdb.pkl')
-
-
-if __name__ == "__main__":
-    main()
--- a/benchmark/paddle/rnn/provider.py
+++ b/benchmark/paddle/rnn/provider.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io, os
-import random
-import numpy as np
-import six.moves.cPickle as pickle
-from paddle.trainer.PyDataProvider2 import *
-
-
-def remove_unk(x, n_words):
-    return [[1 if w >= n_words else w for w in sen] for sen in x]
-
-
-# ==============================================================
-#  tensorflow uses fixed length, but PaddlePaddle can process
-#  variable-length. Padding is used in benchmark in order to
-#  compare with other platform. 
-# ==============================================================
-def pad_sequences(sequences,
-                  maxlen=None,
-                  dtype='int32',
-                  padding='post',
-                  truncating='post',
-                  value=0.):
-    lengths = [len(s) for s in sequences]
-
-    nb_samples = len(sequences)
-    if maxlen is None:
-        maxlen = np.max(lengths)
-
-    x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
-    for idx, s in enumerate(sequences):
-        if len(s) == 0:
-            continue  # empty list was found
-        if truncating == 'pre':
-            trunc = s[-maxlen:]
-        elif truncating == 'post':
-            trunc = s[:maxlen]
-        else:
-            raise ValueError("Truncating type '%s' not understood" % padding)
-
-        if padding == 'post':
-            x[idx, :len(trunc)] = trunc
-        elif padding == 'pre':
-            x[idx, -len(trunc):] = trunc
-        else:
-            raise ValueError("Padding type '%s' not understood" % padding)
-    return x
-
-
-def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
-    settings.vocab_size = vocab_size
-    settings.pad_seq = pad_seq
-    settings.maxlen = maxlen
-    settings.input_types = [
-        integer_value_sequence(vocab_size), integer_value(2)
-    ]
-
-
-@provider(
-    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file):
-    f = open(file, 'rb')
-    train_set = pickle.load(f)
-    f.close()
-    x, y = train_set
-
-    # remove unk, namely remove the words out of dictionary
-    x = remove_unk(x, settings.vocab_size)
-    if settings.pad_seq:
-        x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
-
-    for i in range(len(y)):
-        yield map(int, x[i]), int(y[i])
--- a/benchmark/paddle/rnn/rnn.py
+++ b/benchmark/paddle/rnn/rnn.py
-#!/usr/bin/env python
-
-from paddle.trainer_config_helpers import *
-import imdb
-
-num_class = 2
-vocab_size = 30000
-fixedlen = 100
-batch_size = get_config_arg('batch_size', int, 128)
-lstm_num = get_config_arg('lstm_num', int, 1)
-hidden_size = get_config_arg('hidden_size', int, 128)
-# whether to pad sequence into fixed length
-pad_seq = get_config_arg('pad_seq', bool, True)
-imdb.create_data('imdb.pkl')
-
-args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
-define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
-
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-net = data_layer('data', size=vocab_size)
-net = embedding_layer(input=net, size=128)
-
-for i in xrange(lstm_num):
-    net = simple_lstm(input=net, size=hidden_size)
-
-net = last_seq(input=net)
-net = fc_layer(input=net, size=2, act=SoftmaxActivation())
-
-lab = data_layer('label', num_class)
-loss = classification_cost(input=net, label=lab)
-outputs(loss)
--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
-#!/bin/bash
-
-set -e
-
-function train() {
-  cfg=$1
-  thread=$2
-  args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
-  paddle train --job=time \
-    --config=$cfg \
-    --use_gpu=1 \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --num_passes=1 \
-    --feed_data=1 \
-    --config_args=$args \
-    >logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
-}
-
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-## padding, single gpu
-#-----config--gpu--lstm_num--padding--hidden_size--batch_size
-## lstm_num=2, batch_size=64
-train rnn.py 1 2 1 256 64 
-train rnn.py 1 2 1 512 64 
-train rnn.py 1 2 1 1280 64 
-
-## lstm_num=2, batch_size=128
-train rnn.py 1 2 1 256 128 
-train rnn.py 1 2 1 512 128 
-train rnn.py 1 2 1 1280 128 
-
-## lstm_num=4, batch_size=256
-train rnn.py 1 2 1 256 256 
-train rnn.py 1 2 1 512 256 
-train rnn.py 1 2 1 1280 256 
-
-
-#==================multi gpus=====================#
-# hidden_size=256, lstm_num=2, different batch size
-train rnn.py 4 2 1 256 128 
-train rnn.py 4 2 1 256 256 
-train rnn.py 4 2 1 256 512 
-
-# hidden_size=512, lstm_num=4, different batch size
-train rnn.py 4 2 1 512 128 
-train rnn.py 4 2 1 512 256 
-train rnn.py 4 2 1 512 512 
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
@@ -35,8 +35,6 @@ import os
 import argparse
 import time

-import paddle.v2 as paddle
-
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    "--embedding_dim",

--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
@@ -21,7 +21,6 @@ import time
 import numpy as np

 import tensorflow as tf
-import paddle.v2 as paddle

 DTYPE = tf.float32


--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
@@ -27,7 +27,6 @@ import argparse
 import time
 import numpy as np

-import paddle.v2 as paddle
 import tensorflow as tf

 DTYPE = tf.float32

--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@@ -21,8 +21,6 @@ import argparse
 import time
 import tensorflow as tf

-import paddle.v2 as paddle
-

 def parse_args():
    parser = argparse.ArgumentParser("LSTM model benchmark.")

--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """VGG16 benchmark in TensorFlow"""
 import tensorflow as tf
-import paddle.v2 as paddle
 import numpy as np
 import argparse
 import time

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -152,7 +152,12 @@ endif()

 if (WITH_MKLML AND MKLML_IOMP_LIB)
    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
-    set(OPENMP_FLAGS "-fopenmp")
+    if(WIN32)
+        # OpenMP is not well supported on Windows for now
+        set(OPENMP_FLAGS "")
+    else(WIN32)
+        set(OPENMP_FLAGS "-fopenmp")
+    endif(WIN32)
    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -203,25 +203,27 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

 if (NOT WIN32)
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    # nvcc 9 does not support -Os. Use Release flags instead
-    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-endif()
+  if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+  elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+  elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+  elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+      # nvcc 9 does not support -Os. Use Release flags instead
+      list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+  endif()
 else(NOT WIN32)
-list(APPEND CUDA_NVCC_FLAGS  "--compiler-options;/bigobj")
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS  "-g -G")
-  # match the cl's _ITERATOR_DEBUG_LEVEL
-  list(APPEND CUDA_NVCC_FLAGS  "-D_DEBUG")
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
-else()
-  message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
+  list(APPEND CUDA_NVCC_FLAGS  "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
+  list(APPEND CUDA_NVCC_FLAGS  "--compiler-options;/bigobj")
+  if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS  "-g -G")
+    # match the cl's _ITERATOR_DEBUG_LEVEL
+    list(APPEND CUDA_NVCC_FLAGS  "-D_DEBUG")
+  elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+  else()
+    # FATAL_ERROR is the message() mode that stops configuration; a bare FATAL is just printed as text.
+    message(FATAL_ERROR "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
 endif()
 endif(NOT WIN32)

--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire

 IF(WIN32)
  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
 ELSE(WIN32)
  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)

 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
@@ -39,7 +41,7 @@ ExternalProject_Add(
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -49,6 +49,8 @@ IF(NOT WIN32)
    SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
    SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
+ELSE()
+    SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
 ENDIF(NOT WIN32)

 ExternalProject_Add(
@@ -61,7 +63,6 @@ ExternalProject_Add(
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
    CMAKE_ARGS          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
    CMAKE_ARGS          -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
    CMAKE_ARGS          -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}

--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)

+if(WIN32)
+    SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
+else()
+    SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+endif()
+
 ExternalProject_Add(
    extern_snappy
    GIT_REPOSITORY "https://github.com/google/snappy"
@@ -31,7 +37,7 @@ ExternalProject_Add(
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                    -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -147,12 +147,6 @@ set(GPU_COMMON_FLAGS
    -Wno-error=unused-function  # Warnings in Numpy Header.
    -Wno-error=array-bounds # Warnings in Eigen::array
 )
-
-else(NOT WIN32)
-set(COMMON_FLAGS
-    "/w") #disable all warnings.
-set(GPU_COMMON_FLAGS
-    "/w") #disable all warnings
 endif(NOT WIN32)

 if (APPLE)
@@ -193,8 +187,8 @@ safe_set_static_flag()
        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/W3")
-        string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/W3")
+        string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
+        # flag_var holds the NAME of a flags variable; write through it so /w reaches the real flags.
+        set(${flag_var} "${${flag_var}} /w")
    endforeach(flag_var)
 endif(WIN32)
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -8,13 +8,13 @@ paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None
 paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
-paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.program_guard ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.name_scope ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
 paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.scope_guard ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
@@ -66,7 +66,7 @@ paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'unifo
 paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
 paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
@@ -229,7 +229,7 @@ paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes',
 paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.layers.create_py_reader_by_data ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.layers.Preprocessor.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
 paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
@@ -270,7 +270,7 @@ paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywo
 paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.layers.DynamicRNN.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32'))
 paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
@@ -346,12 +346,12 @@ paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'st
 paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
-paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
@@ -456,7 +456,7 @@ paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', '
 paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
-paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
@@ -491,14 +491,14 @@ paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'],
 paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',))
-paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.profiler.cuda_profiler ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
-paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.profiler.profiler ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
 paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
 paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.unique_name.guard ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
 paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -141,7 +141,8 @@ class Graph {
  ir::Node *CreateControlDepVar() {
    // TODO(panyx0718): control var name should be really unique.
    const std::string name = string::Sprintf(
-        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
+        "%s@%llu", static_cast<const char *>(ir::Node::kControlDepVarName),
+        num_node_created_);
    auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
    x->SetId(num_node_created_++);
    return x;

--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
 if(WITH_PYTHON)
-cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
-cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
+cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
+cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
 endif()
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -58,12 +58,13 @@ if(WIN32)
  sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
              DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
                   analysis_config paddle_pass_builder)
-  target_link_libraries(paddle_fluid_shared shlwapi)
 else(WIN32)
  cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
             DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
                  analysis_config paddle_pass_builder)
 endif()
+get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+target_link_libraries(paddle_fluid_shared ${os_dependency_modules})

 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE AND NOT WIN32)

--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
+if(WITH_TESTING)
+  add_dependencies(subgraph_detector gtest)
+endif()

 if (WITH_GPU AND TENSORRT_FOUND)
  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -37,7 +37,7 @@ using paddle::framework::Tensor;
          "(bool, default false) Set to true for inference only, false " \
          "for training. Some layers may run faster when this is true.") \
          .SetDefault(false);                                            \
-      AddComment(#OP_COMMENT);                                           \
+      AddComment(OP_COMMENT);                                            \
    }                                                                    \
  }

@@ -124,7 +124,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
 UNUSED constexpr char SigmoidDoc[] = R"DOC(
 Sigmoid Activation Operator

-$$out = \frac{1}{1 + e^{-x}}$$
+$$out = \\frac{1}{1 + e^{-x}}$$

 )DOC";

@@ -187,14 +187,14 @@ $out = |x|$
 UNUSED constexpr char CeilDoc[] = R"DOC(
 Ceil Activation Operator.

-$out = ceil(x)$
+$out = \left \lceil x \right \rceil$

 )DOC";

 UNUSED constexpr char FloorDoc[] = R"DOC(
 Floor Activation Operator.

-$out = floor(x)$
+$out = \left \lfloor x \right \rfloor$

 )DOC";

@@ -252,7 +252,7 @@ $out = \ln(1 + e^{x})$
 UNUSED constexpr char SoftsignDoc[] = R"DOC(
 Softsign Activation Operator.

-$$out = \frac{x}{1 + |x|}$$
+$$out = \\frac{x}{1 + \|x\|}$$

 )DOC";


--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -21,26 +21,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVectorArrayMap =
-    Eigen::TensorMap<Eigen::Tensor<T, 1, MajorType, IndexType>>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using ConstEigenVectorArrayMap =
-    Eigen::TensorMap<const Eigen::Tensor<T, 1, MajorType, IndexType>>;
+template <typename T>
+struct Compare {
+  // Orders by |value|, so max_element may return a negative element.
+  bool operator()(const T a, const T b) { return (std::abs(a) < std::abs(b)); }
+};

 template <typename T>
 struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
                  const int num, T* out) {
-    Eigen::DSizes<Eigen::DenseIndex, 1> idim(num);
-    Eigen::DSizes<Eigen::DenseIndex, 1> odim(1);
-    Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>> in_e(in, idim);
-    Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> out_e(out, odim);
-
-    out_e = in_e.abs().maximum();
+    *out = std::abs(*(std::max_element(in + 0, in + num, Compare<T>())));
  }
 };


--- a/paddle/fluid/operators/jit/gen/act.h
+++ b/paddle/fluid/operators/jit/gen/act.h
@@ -63,7 +63,6 @@ class VActFunc : public JitCode {
 public:
  explicit VActFunc(size_t code_size, void* code_ptr)
      : JitCode(code_size, code_ptr) {}
-  virtual const char* name() const = 0;
  virtual void genCode() = 0;

 protected:
@@ -269,7 +268,7 @@ class VActJitCode : public VActFunc {
    this->genCode();
  }

-  const char* name() const override {
+  std::string name() const override {
    std::string base = "VActJitCode";
    switch (type_) {
      case operand_type::RELU:
@@ -293,7 +292,7 @@ class VActJitCode : public VActFunc {
      default:
        break;
    }
-    return base.c_str();
+    return base;
  }
  void genCode() override;


--- a/paddle/fluid/operators/jit/gen/blas.h
+++ b/paddle/fluid/operators/jit/gen/blas.h
@@ -41,7 +41,7 @@ class VXXJitCode : public JitCode {
    this->genCode();
  }

-  virtual const char* name() const {
+  std::string name() const override {
    std::string base = "VXXJitCode";
    if (scalar_index_ == 1) {
      base += "_Scalar";
@@ -62,7 +62,7 @@ class VXXJitCode : public JitCode {
    }
    base += (with_relu_ ? "_Relu" : "");
    base += "_D" + std::to_string(num_);
-    return base.c_str();
+    return base;
  }
  void genCode() override;


--- a/paddle/fluid/operators/jit/gen/gru.h
+++ b/paddle/fluid/operators/jit/gen/gru.h
@@ -49,7 +49,7 @@ class GRUJitCode : public VActFunc {
    this->genCode();
  }

-  const char* name() const override {
+  std::string name() const override {
    std::string base = "GRUJitCode";
    if (id_ == 0) {
      base += "_H1";
@@ -81,7 +81,7 @@ class GRUJitCode : public VActFunc {
    };
    AddTypeStr(act_gate_);
    AddTypeStr(act_cand_);
-    return base.c_str();
+    return base;
  }
  void genCode() override;


--- a/paddle/fluid/operators/jit/gen/hopv.h
+++ b/paddle/fluid/operators/jit/gen/hopv.h
@@ -35,14 +35,14 @@ class HOPVJitCode : public JitCode {
    this->genCode();
  }

-  virtual const char* name() const {
+  std::string name() const override {
-    std::string base = "VXXJitCode";
+    std::string base = "HOPVJitCode";
    if (type_ == operand_type::MAX) {
      base += "_MAX";
    } else {
      base += "_SUM";
    }
-    return base.c_str();
+    return base;
  }
  void genCode() override;


--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <string>
 #include <type_traits>
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/platform/cpu_info.h"
@@ -59,7 +60,7 @@ typedef enum {
 } operand_type;

 #define DECLARE_JIT_CODE(codename) \
-  const char* name() const override { return #codename; }
+  std::string name() const override { return #codename; }

 class JitCode : public GenBase, public Xbyak::CodeGenerator {
 public:
@@ -68,7 +69,6 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
            (code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size),
            code_ptr) {}

-  virtual const char* name() const = 0;
  virtual void genCode() = 0;

  size_t getSize() const override { return CodeGenerator::getSize(); }

--- a/paddle/fluid/operators/jit/gen/lstm.h
+++ b/paddle/fluid/operators/jit/gen/lstm.h
@@ -53,7 +53,7 @@ class LSTMJitCode : public VActFunc {
    this->genCode();
  }

-  const char* name() const override {
+  std::string name() const override {
    std::string base = "LSTMJitCode";
    if (use_peephole_) {
      base += "_Peephole";
@@ -85,7 +85,7 @@ class LSTMJitCode : public VActFunc {
    AddTypeStr(act_gate_);
    AddTypeStr(act_cand_);
    AddTypeStr(act_cell_);
-    return base.c_str();
+    return base;
  }
  void genCode() override;


--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -36,11 +36,11 @@ class MatMulJitCode : public JitCode {
    this->genCode();
  }

-  virtual const char* name() const {
+  std::string name() const override {
    std::string base = "MatMulJitCode";
    base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
           std::to_string(k_);
-    return base.c_str();
+    return base;
  }
  void genCode() override;


--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
@@ -38,7 +38,7 @@ class SeqPoolJitCode : public JitCode {
    this->genCode();
  }

-  virtual const char* name() const {
+  std::string name() const override {
    std::string base = "SeqPoolJitCode";
    if (type_ == SeqPoolType::kSum) {
      base += "_Sum";
@@ -48,7 +48,7 @@ class SeqPoolJitCode : public JitCode {
      base += "_Sqrt";
    }
    base += ("_W" + std::to_string(w_));
-    return base.c_str();
+    return base;
  }
  void genCode() override;


--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -16,6 +16,7 @@

 #include <gflags/gflags.h>
 #include <memory>  // for unique_ptr
+#include <string>
 #include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"

@@ -28,7 +29,7 @@ namespace jit {
 class GenBase : public Kernel {
 public:
  virtual ~GenBase() = default;
-  virtual const char* name() const = 0;
+  virtual std::string name() const = 0;
  virtual size_t getSize() const = 0;
  virtual const unsigned char* getCodeInternal() = 0;
  template <typename Func>

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -37,7 +37,7 @@ math_library(concat_and_split)
 math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
-math_library(depthwise_conv)
+math_library(depthwise_conv DEPS cub)
 math_library(im2col)
 math_library(sampler)


--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -282,7 +282,7 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
                               ? mkldnn::inner_product_backward_weights::desc(
                                     src, diff_weights, bias, diff_dst)
                               : mkldnn::inner_product_backward_weights::desc(
-                                     src, diff_weights, bias, diff_dst);
+                                     src, diff_weights, diff_dst);

    return mkldnn::inner_product_backward_weights::primitive_desc(
        bwd_weight_desc, engine, pd);

--- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
 include(operators)
-register_operators()
+if(WITH_GPU)
+    register_operators(DEPS cub)
+else()
+    register_operators()
+endif()

 if(WITH_GPU)
    file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.part.cu")

--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -109,23 +109,23 @@ from future subsequences in a computationally efficient manner to improve
 unidirectional recurrent neural networks. The row convolution operator is 
 different from the 1D sequence convolution, and is computed as follows:

-Given an input sequence $in$ of length $t$ and input dimension $d$, 
-and a filter ($W$) of size $context \times d$, 
+Given an input sequence $X$ of length $t$ and input dimension $D$, 
+and a filter ($W$) of size $context \times D$,
 the output sequence is convolved as:

 $$
-out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
+out_{i} = \\sum_{j=i}^{i + context - 1} X_{j} \\cdot W_{j-i}
 $$

 In the above equation:

 * $Out_{i}$: The i-th row of output variable with shape [1, D].

-* $\\tau$: Future context size.
+* $context$: Future context size.

 * $X_{j}$: The j-th row of input variable with shape [1, D].

-* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
+* $W_{j-i}$: The (j-i)-th row of parameters with shape [1, D].

 More details about row_conv please refer to
 the design document

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
-proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
+proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)

 add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)

-cc_library(place SRCS place.cc DEPS enforce boost)
+cc_library(place SRCS place.cc DEPS enforce boost lib_any)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

 add_subdirectory(dynload)

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -26,5 +26,5 @@ if(WITH_PYTHON)
  get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
  target_link_libraries(paddle_pybind ${os_dependency_modules})

-  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
+  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python pybind)
 endif(WITH_PYTHON)
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -54,7 +54,7 @@ ELSE(WIN32)
 		DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 ENDIF()

-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies})
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})

 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)

--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -22,7 +22,7 @@ This API is still under active development and may change drastically.

 from __future__ import print_function

-import contextlib
+from ...wrapped_decorator import signature_safe_contextmanager
 import numpy as np
 import six

@@ -419,7 +419,7 @@ class TrainingDecoder(object):
        self._state_cell = state_cell
        self._state_cell._enter_decoder(self)

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
    def block(self):
        """
        Define the behavior of the decoder for each RNN time step.
@@ -613,7 +613,7 @@ class BeamSearchDecoder(object):
        self._word_dim = word_dim
        self._input_var_dict = input_var_dict

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
    def block(self):
        """
        Define the behavior of the decoder for each RNN time step.

--- a/python/paddle/fluid/contrib/inferencer.py
+++ b/python/paddle/fluid/contrib/inferencer.py
@@ -14,7 +14,7 @@

 from __future__ import print_function

-import contextlib
+from ..wrapped_decorator import signature_safe_contextmanager

 from .. import core

@@ -105,7 +105,7 @@ class Inferencer(object):

        return results

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
    def _prog_and_scope_guard(self):
        with framework.program_guard(main_program=self.inference_program):
            with executor.scope_guard(self.scope):

--- a/python/paddle/fluid/contrib/trainer.py
+++ b/python/paddle/fluid/contrib/trainer.py
@@ -14,7 +14,7 @@

 from __future__ import print_function

-import contextlib
+from ..wrapped_decorator import signature_safe_contextmanager
 import os
 import errno
 import shutil
@@ -453,7 +453,7 @@ class Trainer(object):
            io.save_inference_model(param_path, feeded_var_names, target_vars,
                                    exe)

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
    def _prog_and_scope_guard(self):
        with framework.program_guard(
                main_program=self.train_program,

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import os
 import multiprocessing
 import numpy as np
-import contextlib
+from .wrapped_decorator import signature_safe_contextmanager
 import six
 from .framework import Program, default_main_program, Variable
 from . import core
@@ -49,7 +49,7 @@ def _switch_scope(scope):
    return ex


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def scope_guard(scope):
    """
    Change the global/default scope instance by Python `with` statement. All

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,7 +16,7 @@ from __future__ import print_function

 import collections
 from collections import defaultdict
-import contextlib
+from .wrapped_decorator import signature_safe_contextmanager
 import os
 import re
 import traceback
@@ -111,7 +111,7 @@ class NameScope(object):
 _name_scope = NameScope()


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def name_scope(prefix=None):
    """
    Generate hierarchical name prefix for the operators.
@@ -1775,7 +1775,7 @@ class Program(object):
    def set_op_role_var(self, var_name):
        self._op_role_var = [var_name]

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
    def _optimized_guard(self, param_and_grads):
        """
        A with guard to set :code:`Optimization` :code:`OpRole` and
@@ -1805,7 +1805,7 @@ class Program(object):
        self._op_role_var = tmp_var
        self._current_role = tmp_role

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
    def _lr_schedule_guard(self, is_with_opt=False):
        """
        A with guard to set :code:`LRSched` :code:`OpRole` and
@@ -2459,7 +2459,7 @@ def switch_startup_program(program):
    return prev_program


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def program_guard(main_program, startup_program=None):
    """
    Change the global main program and startup program with `with` statement.
@@ -2524,7 +2524,7 @@ def _get_var(name, program=None):
    return program.global_block().var(name)


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def _imperative_guard(tracer):
    global _imperative_tracer_
    tmp_trace = _imperative_tracer_
@@ -2535,7 +2535,7 @@ def _imperative_guard(tracer):
    _imperative_tracer_ = tmp_trace


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def _imperative_place_guard(place):
    global _imperative_current_expected_place_
    tmp_place = _imperative_current_expected_place_

--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import contextlib
+from ..wrapped_decorator import signature_safe_contextmanager
 import numpy as np

 from paddle.fluid import core
@@ -24,7 +24,7 @@ def enabled():
    return framework._in_imperative_mode()


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def guard(place=None):
    train = framework.Program()
    startup = framework.Program()

--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -16,7 +16,7 @@ from __future__ import print_function

 from . import framework
 import numpy as np
-import contextlib
+from .wrapped_decorator import signature_safe_contextmanager
 from .core import VarDesc
 from . import unique_name

@@ -49,7 +49,7 @@ def force_init_on_cpu():
    return _force_init_on_cpu_


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def init_on_cpu():
    """
    Force the variable to be inited on CPU.

--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -302,7 +302,8 @@ class LayerHelper(object):
        if default_initializer is None and attr.initializer is None:
            if isinstance(dtype, core.VarDesc.VarType):
                if dtype != core.VarDesc.VarType.FP32 and \
-                    dtype != core.VarDesc.VarType.FP64:
+                    dtype != core.VarDesc.VarType.FP64 and \
+                    dtype != core.VarDesc.VarType.FP16:
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 from __future__ import print_function
-import contextlib
+from ..wrapped_decorator import signature_safe_contextmanager

 from .layer_function_generator import autodoc, templatedoc
 from .tensor import assign, fill_constant
@@ -1532,7 +1532,7 @@ class DynamicRNN(object):
            outputs={'Out': [x_reordered]})
        return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table)

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
    def block(self):
        """
        The block for user to define operators in RNN. See the class docstring

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 from __future__ import print_function
-import contextlib
+from ..wrapped_decorator import signature_safe_contextmanager
 import multiprocessing
 import os
 import six
@@ -1116,7 +1116,7 @@ class Preprocessor(object):
    def _is_completed(self):
        return self.sub_block and self.source_var_names and self.sink_var_names

-    @contextlib.contextmanager
+    @signature_safe_contextmanager
    def block(self):
        self.status = Preprocessor.IN_SUB_BLOCK
        self.sub_block = self.main_prog._create_block()

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -15,7 +15,7 @@
 from __future__ import print_function

 from collections import defaultdict
-from contextlib import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager

 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
@@ -1610,7 +1610,7 @@ class ModelAverage(Optimizer):
            },
            stop_gradient=True)

-    @contextmanager
+    @signature_safe_contextmanager
    def apply(self, executor, need_restore=True):
        """Apply average values to parameters of current model.
        """

--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -15,7 +15,7 @@
 from __future__ import print_function

 from . import core
-from contextlib import contextmanager
+from .wrapped_decorator import signature_safe_contextmanager
 import os
 import six

@@ -35,7 +35,7 @@ NVPROF_CONFIG = [
 ]


-@contextmanager
+@signature_safe_contextmanager
 def cuda_profiler(output_file, output_mode=None, config=None):
    """The CUDA profiler.
    This fuctions is used to profile CUDA program by CUDA runtime application
@@ -217,7 +217,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
    core.disable_profiler(key_map[sorted_key], profile_path)


-@contextmanager
+@signature_safe_contextmanager
 def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
    """The profiler interface.
    Different from cuda_profiler, this profiler can be used to profile both CPU

--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -15,14 +15,14 @@
 from __future__ import print_function

 import os
-import contextlib
+from .wrapped_decorator import signature_safe_contextmanager
 from . import core
 __all__ = [
    'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
 ]


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def create_recordio_writer(filename,
                           compressor=core.RecordIOWriter.Compressor.Snappy,
                           max_num_records=1000):

--- a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
@@ -16,7 +16,6 @@ from __future__ import print_function

 import sys
 import paddle.fluid as fluid
-import paddle.v2 as paddle


 def load_vocab(filename):

--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ b/python/paddle/fluid/tests/demo/pyreader.py
@@ -20,7 +20,6 @@ import six
 import paddle
 import paddle.dataset.mnist as mnist
 import paddle.fluid as fluid
-import paddle.v2


 def network(is_train):
@@ -72,7 +71,7 @@ def main():
        use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog)

    train_reader.decorate_paddle_reader(
-        paddle.v2.reader.shuffle(
+        paddle.reader.shuffle(
            paddle.batch(mnist.train(), 512), buf_size=8192))

    test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))

--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -355,6 +355,10 @@ class ControlFlowGraph(object):
                                                 is_forward).dtype()
                        cache_dtype = self._find_var(block_desc, cache_var,
                                                     is_forward).dtype()
+                        if x_dtype != cache_dtype:
+                            if PRINT_LOG:
+                                print("x_dtype and cache_dtype are different!")
+                            continue

                        if not compare_shape(x_shape, cache_shape, level):
                            continue

--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -15,7 +15,7 @@
 from __future__ import print_function

 import collections
-import contextlib
+from .wrapped_decorator import signature_safe_contextmanager
 import six
 import sys

@@ -68,7 +68,7 @@ def switch(new_generator=None):
    return old


-@contextlib.contextmanager
+@signature_safe_contextmanager
 def guard(new_generator=None):
    if isinstance(new_generator, six.string_types):
        new_generator = UniqueNameGenerator(new_generator)

--- a/python/paddle/utils/dump_config.py
+++ b/python/paddle/utils/dump_config.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,34 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from paddle.trainer.config_parser import parse_config
-from paddle.proto import TrainerConfig_pb2
-import sys
+import decorator
+import contextlib

-__all__ = []
+__all__ = ['wrap_decorator', 'signature_safe_contextmanager']

-if __name__ == '__main__':
-    whole_conf = False
-    binary = False
-    if len(sys.argv) == 2:
-        conf = parse_config(sys.argv[1], '')
-    elif len(sys.argv) == 3:
-        conf = parse_config(sys.argv[1], sys.argv[2])
-    elif len(sys.argv) == 4:
-        conf = parse_config(sys.argv[1], sys.argv[2])
-        if sys.argv[3] == '--whole':
-            whole_conf = True
-        elif sys.argv[3] == '--binary':
-            binary = True
-    else:
-        raise RuntimeError()

-    assert isinstance(conf, TrainerConfig_pb2.TrainerConfig)
+def wrap_decorator(decorator_func):
+    @decorator.decorator
+    def __impl__(func, *args, **kwargs):
+        wrapped_func = decorator_func(func)
+        return wrapped_func(*args, **kwargs)

-    if whole_conf:
-        print(conf)
-    else:
-        if binary:
-            sys.stdout.write(conf.model_config.SerializeToString())
-        else:
-            print(conf.model_config)
+    return __impl__
+
+
+signature_safe_contextmanager = wrap_decorator(contextlib.contextmanager)
--- a/python/paddle/utils/dump_v2_config.py
+++ b/python/paddle/utils/dump_v2_config.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-
-from paddle.trainer_config_helpers.layers import LayerOutput
-from paddle.v2.layer import parse_network
-from paddle.proto import TrainerConfig_pb2
-
-__all__ = ["dump_v2_config"]
-
-
-def dump_v2_config(topology, save_path, binary=False):
-    """ Dump the network topology to a specified file.
-
-    This function is only used to dump network defined by using PaddlePaddle V2
-    APIs. This function will NOT dump configurations related to PaddlePaddle
-    optimizer.
-
-    :param topology: The output layers (can be more than one layers given in a
-                     Python List or Tuple) of the entire network. Using the
-                     specified layers (if more than one layer is given) as root,
-                     traversing back to the data layer(s), all the layers
-                     connected to the specified output layers will be dumped.
-                     Layers not connceted to the specified will not be dumped.
-    :type topology: LayerOutput|List|Tuple
-    :param save_path: The path to save the dumped network topology.
-    :type save_path: str
-    :param binary: Whether to dump the serialized network topology or not.
-                   The default value is false. NOTE that, if you call this
-                   function to generate network topology for PaddlePaddle C-API,
-                   a serialized version of network topology is required. When
-                   using PaddlePaddle C-API, this flag MUST be set to True.
-    :type binary: bool
-    """
-
-    if isinstance(topology, LayerOutput):
-        topology = [topology]
-    elif isinstance(topology, collections.Sequence):
-        for out_layer in topology:
-            assert isinstance(out_layer, LayerOutput), (
-                "The type of each element in the parameter topology "
-                "should be LayerOutput.")
-    else:
-        raise RuntimeError("Error input type for parameter topology.")
-
-    model_str = parse_network(topology)
-    with open(save_path, "w") as fout:
-        if binary:
-            fout.write(model_str.SerializeToString())
-        else:
-            fout.write(str(model_str))
--- a/python/paddle/utils/image_multiproc.py
+++ b/python/paddle/utils/image_multiproc.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys
-import numpy as np
-from PIL import Image
-import six
-from six.moves import cStringIO as StringIO
-import multiprocessing
-import functools
-import itertools
-
-from paddle.utils.image_util import *
-from paddle.trainer.config_parser import logger
-
-try:
-    import cv2
-except ImportError:
-    logger.warning("OpenCV2 is not installed, using PIL to process")
-    cv2 = None
-
-__all__ = ["CvTransformer", "PILTransformer", "MultiProcessImageTransformer"]
-
-
-class CvTransformer(ImageTransformer):
-    """
-    CvTransformer used python-opencv to process image.
-    """
-
-    def __init__(
-            self,
-            min_size=None,
-            crop_size=None,
-            transpose=(2, 0, 1),  # transpose to C * H * W
-            channel_swap=None,
-            mean=None,
-            is_train=True,
-            is_color=True):
-        ImageTransformer.__init__(self, transpose, channel_swap, mean, is_color)
-        self.min_size = min_size
-        self.crop_size = crop_size
-        self.is_train = is_train
-
-    def resize(self, im, min_size):
-        row, col = im.shape[:2]
-        new_row, new_col = min_size, min_size
-        if row > col:
-            new_row = min_size * row / col
-        else:
-            new_col = min_size * col / row
-        im = cv2.resize(im, (new_row, new_col), interpolation=cv2.INTER_CUBIC)
-        return im
-
-    def crop_and_flip(self, im):
-        """
-        Return cropped image.
-        The size of the cropped image is inner_size * inner_size.
-        im: (H x W x K) ndarrays
-        """
-        row, col = im.shape[:2]
-        start_h, start_w = 0, 0
-        if self.is_train:
-            start_h = np.random.randint(0, row - self.crop_size + 1)
-            start_w = np.random.randint(0, col - self.crop_size + 1)
-        else:
-            start_h = (row - self.crop_size) / 2
-            start_w = (col - self.crop_size) / 2
-        end_h, end_w = start_h + self.crop_size, start_w + self.crop_size
-        if self.is_color:
-            im = im[start_h:end_h, start_w:end_w, :]
-        else:
-            im = im[start_h:end_h, start_w:end_w]
-        if (self.is_train) and (np.random.randint(2) == 0):
-            if self.is_color:
-                im = im[:, ::-1, :]
-            else:
-                im = im[:, ::-1]
-        return im
-
-    def transform(self, im):
-        im = self.resize(im, self.min_size)
-        im = self.crop_and_flip(im)
-        # transpose, swap channel, sub mean
-        im = im.astype('float32')
-        ImageTransformer.transformer(self, im)
-        return im
-
-    def load_image_from_string(self, data):
-        flag = cv2.CV_LOAD_IMAGE_COLOR if self.is_color else cv2.CV_LOAD_IMAGE_GRAYSCALE
-        im = cv2.imdecode(np.fromstring(data, np.uint8), flag)
-        return im
-
-    def transform_from_string(self, data):
-        im = self.load_image_from_string(data)
-        return self.transform(im)
-
-    def load_image_from_file(self, file):
-        flag = cv2.CV_LOAD_IMAGE_COLOR if self.is_color else cv2.CV_LOAD_IMAGE_GRAYSCALE
-        im = cv2.imread(file, flag)
-        return im
-
-    def transform_from_file(self, file):
-        im = self.load_image_from_file(file)
-        return self.transform(im)
-
-
-class PILTransformer(ImageTransformer):
-    """
-    PILTransformer used PIL to process image.
-    """
-
-    def __init__(
-            self,
-            min_size=None,
-            crop_size=None,
-            transpose=(2, 0, 1),  # transpose to C * H * W
-            channel_swap=None,
-            mean=None,
-            is_train=True,
-            is_color=True):
-        ImageTransformer.__init__(self, transpose, channel_swap, mean, is_color)
-        self.min_size = min_size
-        self.crop_size = crop_size
-        self.is_train = is_train
-
-    def resize(self, im, min_size):
-        row, col = im.size[:2]
-        new_row, new_col = min_size, min_size
-        if row > col:
-            new_row = min_size * row / col
-        else:
-            new_col = min_size * col / row
-        im = im.resize((new_row, new_col), Image.ANTIALIAS)
-        return im
-
-    def crop_and_flip(self, im):
-        """
-        Return cropped image.
-        The size of the cropped image is inner_size * inner_size.
-        """
-        row, col = im.size[:2]
-        start_h, start_w = 0, 0
-        if self.is_train:
-            start_h = np.random.randint(0, row - self.crop_size + 1)
-            start_w = np.random.randint(0, col - self.crop_size + 1)
-        else:
-            start_h = (row - self.crop_size) / 2
-            start_w = (col - self.crop_size) / 2
-        end_h, end_w = start_h + self.crop_size, start_w + self.crop_size
-        im = im.crop((start_h, start_w, end_h, end_w))
-        if (self.is_train) and (np.random.randint(2) == 0):
-            im = im.transpose(Image.FLIP_LEFT_RIGHT)
-        return im
-
-    def transform(self, im):
-        im = self.resize(im, self.min_size)
-        im = self.crop_and_flip(im)
-        im = np.array(im, dtype=np.float32)  # convert to numpy.array
-        # transpose, swap channel, sub mean
-        ImageTransformer.transformer(self, im)
-        return im
-
-    def load_image_from_string(self, data):
-        im = Image.open(StringIO(data))
-        return im
-
-    def transform_from_string(self, data):
-        im = self.load_image_from_string(data)
-        return self.transform(im)
-
-    def load_image_from_file(self, file):
-        im = Image.open(file)
-        return im
-
-    def transform_from_file(self, file):
-        im = self.load_image_from_file(file)
-        return self.transform(im)
-
-
-def job(is_img_string, transformer, data_label_pack):
-    (data, label) = data_label_pack
-    if is_img_string:
-        return transformer.transform_from_string(data), label
-    else:
-        return transformer.transform_from_file(data), label
-
-
-class MultiProcessImageTransformer(object):
-    def __init__(self,
-                 procnum=10,
-                 resize_size=None,
-                 crop_size=None,
-                 transpose=(2, 0, 1),
-                 channel_swap=None,
-                 mean=None,
-                 is_train=True,
-                 is_color=True,
-                 is_img_string=True):
-        """
-        Processing image with multi-process. If it is used in PyDataProvider,
-        the simple usage for CNN is as follows:
-
-        .. code-block:: python
-
-            def hool(settings, is_train,  **kwargs):
-                settings.is_train = is_train
-                settings.mean_value = np.array([103.939,116.779,123.68], dtype=np.float32)
-                settings.input_types = [
-                    dense_vector(3 * 224 * 224),
-                    integer_value(1)]
-                settings.transformer = MultiProcessImageTransformer(
-                    procnum=10,
-                    resize_size=256,
-                    crop_size=224,
-                    transpose=(2, 0, 1),
-                    mean=settings.mean_values,
-                    is_train=settings.is_train)
-
-
-            @provider(init_hook=hook, pool_size=20480)
-            def process(settings, file_list):
-                with open(file_list, 'r') as fdata:
-                    for line in fdata:
-                        data_dic = np.load(line.strip()) # load the data batch pickled by Pickle.
-                        data = data_dic['data']
-                        labels = data_dic['label']
-                        labels = np.array(labels, dtype=np.float32)
-                        for im, lab in settings.dp.run(data, labels):
-                            yield [im.astype('float32'), int(lab)]
-
-        :param procnum: processor number.
-        :type procnum: int
-        :param resize_size: the shorter edge size of image after resizing.
-        :type resize_size: int
-        :param crop_size: the croping size.
-        :type crop_size: int
-        :param transpose: the transpose order, Paddle only allow C * H * W order.
-        :type transpose: tuple or list
-        :param channel_swap: the channel swap order, RGB or BRG.
-        :type channel_swap: tuple or list
-        :param mean: the mean values of image, per-channel mean or element-wise mean.
-        :type mean: array, The dimension is 1 for per-channel mean.
-                    The dimension is 3 for element-wise mean.
-        :param is_train: training peroid or testing peroid.
-        :type is_train: bool.
-        :param is_color: the image is color or gray.
-        :type is_color: bool.
-        :param is_img_string: The input can be the file name of image or image string.
-        :type is_img_string: bool.
-        """
-
-        self.procnum = procnum
-        self.pool = multiprocessing.Pool(procnum)
-        self.is_img_string = is_img_string
-        if cv2 is not None:
-            self.transformer = CvTransformer(resize_size, crop_size, transpose,
-                                             channel_swap, mean, is_train,
-                                             is_color)
-        else:
-            self.transformer = PILTransformer(resize_size, crop_size, transpose,
-                                              channel_swap, mean, is_train,
-                                              is_color)
-
-    def run(self, data, label):
-        fun = functools.partial(job, self.is_img_string, self.transformer)
-        return self.pool.imap_unordered(
-            fun, six.moves.zip(data, label), chunksize=100 * self.procnum)
--- a/python/paddle/utils/make_model_diagram.py
+++ b/python/paddle/utils/make_model_diagram.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Generate dot diagram file for the given paddle model config
-# The generated file can be viewed using Graphviz (http://graphviz.org)
-
-from __future__ import print_function
-
-import six
-import sys
-import traceback
-
-from paddle.trainer.config_parser import parse_config
-
-
-def make_layer_label(layer_config):
-    label = '%s type=%s' % (layer_config.name, layer_config.type)
-    if layer_config.reversed:
-        label += ' <=='
-
-    label2 = ''
-    if layer_config.active_type:
-        label2 += 'act=%s ' % layer_config.active_type
-    if layer_config.bias_parameter_name:
-        label2 += 'bias=%s ' % layer_config.bias_parameter_name
-
-    if label2:
-        label += '\l' + label2
-    return label
-
-
-def make_diagram(config_file, dot_file, config_arg_str):
-    config = parse_config(config_file, config_arg_str)
-    make_diagram_from_proto(config.model_config, dot_file)
-
-
-def make_diagram_from_proto(model_config, dot_file):
-    # print >> sys.stderr, config
-    name2id = {}
-    f = open(dot_file, 'w')
-    submodel_layers = set()
-
-    def make_link(link):
-        return 'l%s -> l%s;' % (name2id[link.layer_name],
-                                name2id[link.link_name])
-
-    def make_mem(mem):
-        s = ''
-        if mem.boot_layer_name:
-            s += 'l%s -> l%s;\n' % (name2id[mem.boot_layer_name],
-                                    name2id[mem.layer_name])
-        s += 'l%s -> l%s [style=dashed];' % (name2id[mem.layer_name],
-                                             name2id[mem.link_name])
-        return s
-
-    print('digraph graphname {', file=f)
-    print('node [width=0.375,height=0.25];', file=f)
-    for i in six.moves.xrange(len(model_config.layers)):
-        l = model_config.layers[i]
-        name2id[l.name] = i
-
-    i = 0
-    for sub_model in model_config.sub_models:
-        if sub_model.name == 'root':
-            continue
-        print('subgraph cluster_%s {' % i, file=f)
-        print('style=dashed;', file=f)
-        label = '%s ' % sub_model.name
-        if sub_model.reversed:
-            label += '<=='
-        print('label = "%s";' % label, file=f)
-        i += 1
-        submodel_layers.add(sub_model.name)
-        for layer_name in sub_model.layer_names:
-            submodel_layers.add(layer_name)
-            lid = name2id[layer_name]
-            layer_config = model_config.layers[lid]
-            label = make_layer_label(layer_config)
-            print('l%s [label="%s", shape=box];' % (lid, label), file=f)
-        print('}', file=f)
-
-    for i in six.moves.xrange(len(model_config.layers)):
-        l = model_config.layers[i]
-        if l.name not in submodel_layers:
-            label = make_layer_label(l)
-            print('l%s [label="%s", shape=box];' % (i, label), file=f)
-
-    for sub_model in model_config.sub_models:
-        if sub_model.name == 'root':
-            continue
-        for link in sub_model.in_links:
-            print(make_link(link), file=f)
-        for link in sub_model.out_links:
-            print(make_link(link), file=f)
-        for mem in sub_model.memories:
-            print(make_mem(mem), file=f)
-
-    for i in six.moves.xrange(len(model_config.layers)):
-        for l in model_config.layers[i].inputs:
-            print(
-                'l%s -> l%s [label="%s"];' % (name2id[l.input_layer_name], i,
-                                              l.input_parameter_name),
-                file=f)
-
-    print('}', file=f)
-    f.close()
-
-
-def usage():
-    print(
-        ("Usage: python show_model_diagram.py" +
-         " CONFIG_FILE DOT_FILE [config_str]"),
-        file=sys.stderr)
-    exit(1)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) < 3 or len(sys.argv) > 4:
-        usage()
-
-    config_file = sys.argv[1]
-    dot_file = sys.argv[2]
-    config_arg_str = sys.argv[3] if len(sys.argv) == 4 else ''
-
-    try:
-        make_diagram(config_file, dot_file, config_arg_str)
-    except:
-        traceback.print_exc()
-        raise
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gzip
-import struct
-import os
-
-from paddle.trainer_config_helpers.layers import LayerOutput
-from paddle.v2.parameters import Parameters
-from paddle.proto import ModelConfig_pb2
-from paddle.v2.topology import Topology
-
-
-def merge_v2_model(net, param_file, output_file):
-    '''Merge the model config and parameters into one file.
-
-    The model configuration file describes the model structure which
-    ends with .py. The parameters file stores the parameters of the model
-    which ends with .tar.gz.
-
-    @param  net            The output layer of the network for inference.
-    @param  param_file     Path of the parameters (.tar.gz) which is stored by
-                           v2 api.
-    @param  output_file    Path of the merged file which will be generated.
-
-    Usage:
-
-        from paddle.utils.merge_model import merge_v2_model
-        # import your network configuration
-        from example_net import net_conf
-
-        net = net_conf(is_predict=True)
-        param_file = './param_pass_00000.tar.gz'
-        output_file = './output.paddle'
-
-        merge_v2_model(net, param_file, output_file)
-
-    '''
-
-    assert isinstance(net, LayerOutput), \
-            "The net should be the output of the network for inference"
-    assert os.path.exists(param_file), \
-            "The model parameters file %s does not exists " % (param_file)
-
-    model_proto = Topology(net).proto()
-    assert isinstance(model_proto, ModelConfig_pb2.ModelConfig)
-
-    with gzip.open(param_file) as f:
-        params = Parameters.from_tar(f)
-
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    with open(output_file, 'w') as f:
-        param_names = [param.name for param in model_proto.parameters]
-        conf_str = model_proto.SerializeToString()
-        f.write(struct.pack('q', len(conf_str)))
-        f.write(conf_str)
-        for pname in param_names:
-            params.serialize(pname, f)
-
-    print('Generate  %s  success!' % (output_file))
--- a/python/paddle/utils/predefined_net.py
+++ b/python/paddle/utils/predefined_net.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import six
-import os
-from paddle.trainer.config_parser import *
-from paddle.utils.preprocess_img import \
-    ImageClassificationDatasetCreater
-from paddle.trainer_config_helpers import *
-
-
-def image_data(data_dir,
-               processed_image_size,
-               overwrite=False,
-               color=True,
-               train_list="batches/train.list",
-               test_list="batches/test.list",
-               meta_file="batches/batches.meta",
-               use_jpeg=1):
-    """
-    Predefined image data provider for image classification.
-    train_list: a text file containing a list of training batches.
-    test_list: a text file containing a list of test batches.
-    processed_image_size: all the input images will be resized into this size.
-       If the image is not square. Then the shorter edge will be resized into
-       this size, and the aspect ratio is kept the same.
-    color: whether the images are color or gray.
-    meta_path: the path of the meta file that stores the mean image file and
-               other dataset information, such as the size of images,
-               the size of the mean image, the number of classes.
-    async_load_data: whether to load image data asynchronuously.
-    """
-    data_creator = ImageClassificationDatasetCreater(
-        data_dir, processed_image_size, color)
-    batch_data_dir = data_dir
-    train_list = os.path.join(batch_data_dir, train_list)
-    test_list = os.path.join(batch_data_dir, test_list)
-    meta_path = os.path.join(batch_data_dir, meta_file)
-    image_size = processed_image_size
-    conf = np.load(meta_path)
-    mean_image_size = conf["mean_image_size"]
-    is_color = conf["color"]
-    num_classes = conf["num_classes"]
-    color_string = "color" if is_color else "gray"
-
-    args = {
-        'meta': meta_path,
-        'mean_img_size': mean_image_size,
-        'img_size': image_size,
-        'num_classes': num_classes,
-        'use_jpeg': use_jpeg != 0,
-        'color': color_string
-    }
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module='image_provider',
-        obj='processData',
-        args=args)
-    return {
-        "image_size": image_size,
-        "num_classes": num_classes,
-        "is_color": is_color
-    }
-
-
-def get_extra_layer_attr(drop_rate):
-    if drop_rate == 0:
-        return None
-    else:
-        return ExtraLayerAttribute(drop_rate=drop_rate)
-
-
-def image_data_layers(image_size, num_classes, is_color=False,
-                      is_predict=False):
-    """
-    Data layers for image classification.
-    image_size: image size.
-    num_classes: num of classes.
-    is_color: whether the input images are color.
-    is_predict: whether the network is used for prediction.
-    """
-    num_image_channels = 3 if is_color else 1
-    data_input = data_layer("input",
-                            image_size * image_size * num_image_channels)
-    if is_predict:
-        return data_input, None, num_image_channels
-    else:
-        label_input = data_layer("label", 1)
-        return data_input, label_input, num_image_channels
-
-
-def simple_conv_net(data_conf, is_color=False):
-    """
-    A Wrapper for a simple network for MNIST digit recognition.
-    It contains two convolutional layers, one fully conencted layer, and
-    one softmax layer.
-    data_conf is a dictionary with the following keys:
-        image_size: image size.
-        num_classes: num of classes.
-        is_color: whether the input images are color.
-    """
-    for k, v in six.iteritems(data_conf):
-        globals()[k] = v
-    data_input, label_input, num_image_channels = \
-        image_data_layers(image_size, num_classes, is_color, is_predict)
-    filter_sizes = [5, 5]
-    num_channels = [32, 64]
-    strides = [1, 1]
-    fc_dims = [500]
-    conv_bn_pool1 = img_conv_bn_pool(
-        name="g1",
-        input=data_input,
-        filter_size=filter_sizes[0],
-        num_channel=num_image_channels,
-        num_filters=num_channels[0],
-        conv_stride=1,
-        conv_padding=0,
-        pool_size=3,
-        pool_stride=2,
-        act=ReluActivation())
-    conv_bn_pool2 = img_conv_bn_pool(
-        name="g2",
-        input=conv_bn_pool1,
-        filter_size=filter_sizes[1],
-        num_channel=num_channels[0],
-        num_filters=num_channels[1],
-        conv_stride=1,
-        conv_padding=0,
-        pool_size=3,
-        pool_stride=2,
-        act=ReluActivation())
-    fc3 = fc_layer(
-        name="fc3", input=conv_bn_pool2, dim=fc_dims[0], act=ReluActivation())
-    fc3_dropped = dropout_layer(name="fc3_dropped", input=fc3, dropout_rate=0.5)
-    output = fc_layer(
-        name="output",
-        input=fc3_dropped,
-        dim=fc_dims[0],
-        act=SoftmaxActivation())
-    if is_predict:
-        end_of_network(output)
-    else:
-        cost = classify(name="cost", input=output, label=label_input)
-        end_of_network(cost)
-
-
-def conv_layer_group(prefix_num,
-                     num_layers,
-                     input,
-                     input_channels,
-                     output_channels,
-                     drop_rates=[],
-                     strides=[],
-                     with_bn=[]):
-    """
-    A set of convolution layers, and batch normalization layers,
-    followed by one pooling layer.
-    It is utilized in VGG network for image classifcation.
-    prefix_num: the prefix number of the layer names.
-                For example, if prefix_num = 1, the first convolutioal layer's
-                name will be conv_1_1.
-    num_layers: number of the convolutional layers.
-    input: the name of the input layer.
-    input_channels: the number of channels of the input feature map.
-    output_channels: the number of channels of the output feature map.
-    drop_rates: the drop rates of the BN layers. It will be all zero by default.
-    strides: the stride of the convolution for the layers.
-             It will be all 1 by  default.
-    with_bn: whether to use Batch Normalization for Conv layers.
-             By default,  it is all false.
-    """
-    if len(drop_rates) == 0: drop_rates = [0] * num_layers
-    if len(strides) == 0: strides = [1] * num_layers
-    if len(with_bn) == 0: with_bn = [False] * num_layers
-    assert (len(drop_rates) == num_layers)
-    assert (len(strides) == num_layers)
-
-    for i in range(1, num_layers + 1):
-        if i == 1:
-            i_conv_in = input
-        else:
-            i_conv_in = group_output
-        i_channels_conv = input_channels if i == 1 else output_channels
-        conv_act = LinearActivation() if with_bn[i - 1] else ReluActivation()
-        conv_output = img_conv_layer(
-            name="conv%d_%d" % (prefix_num, i),
-            input=i_conv_in,
-            filter_size=3,
-            num_channels=i_channels_conv,
-            num_filters=output_channels,
-            stride=strides[i - 1],
-            padding=1,
-            act=conv_act)
-        if with_bn[i - 1]:
-            bn = batch_norm_layer(
-                name="conv%d_%d_bn" % (prefix_num, i),
-                input=conv_output,
-                num_channels=output_channels,
-                act=ReluActivation(),
-                layer_attr=get_extra_layer_attr(drop_rate=drop_rates[i - 1]))
-            group_output = bn
-        else:
-            group_output = conv_output
-    pool = img_pool_layer(
-        name="pool%d" % prefix_num,
-        input=group_output,
-        pool_size=2,
-        num_channels=output_channels,
-        stride=2)
-    return pool
-
-
-def vgg_conv_net(image_size,
-                 num_classes,
-                 num_layers,
-                 channels,
-                 strides,
-                 with_bn,
-                 fc_dims,
-                 drop_rates,
-                 drop_rates_fc=[],
-                 is_color=True,
-                 is_predict=False):
-    """
-    A Wrapper for a VGG network for image classification.
-    It is a set of convolutional groups followed by several fully
-    connected layers, and a cross-entropy classifiation loss.
-    The detailed architecture of the paper can be found here:
-      Very Deep Convolutional Networks for Large-Scale Visual Recognition
-      http://www.robots.ox.ac.uk/~vgg/research/very_deep/
-    image_size: image size.
-    num_classes: num of classes.
-    num_layers: the number of layers for all the convolution groups.
-    channels: the number of output filters for all the convolution groups.
-    with_bn: whether each layer of a convolution group is followed by a
-    batch normalization.
-    drop_rates: the dropout rates for all the convolutional layers.
-    fc_dims: the dimension for all the fully connected layers.
-    is_color: whether the input images are color.
-    """
-    data_input, label_input, num_image_channels = \
-        image_data_layers(image_size, num_classes, is_color, is_predict)
-    assert (len(num_layers) == len(channels))
-    assert (len(num_layers) == len(strides))
-    assert (len(num_layers) == len(with_bn))
-    num_fc_layers = len(fc_dims)
-    assert (num_fc_layers + 1 == len(drop_rates_fc))
-
-    for i in range(len(num_layers)):
-        input_layer = data_input if i == 0 else group_output
-        input_channels = 3 if i == 0 else channels[i - 1]
-        group_output = conv_layer_group(
-            prefix_num=i + 1,
-            num_layers=num_layers[i],
-            input=input_layer,
-            input_channels=input_channels,
-            output_channels=channels[i],
-            drop_rates=drop_rates[i],
-            strides=strides[i],
-            with_bn=with_bn[i])
-    conv_output_name = group_output
-    if drop_rates_fc[0] != 0.0:
-        dropped_pool_name = "pool_dropped"
-        conv_output_name = dropout_layer(
-            name=dropped_pool_name,
-            input=conv_output_name,
-            dropout_rate=drop_rates_fc[0])
-    for i in range(len(fc_dims)):
-        input_layer_name = conv_output_name if i == 0 else fc_output
-        active_type = LinearActivation() if i == len(
-            fc_dims) - 1 else ReluActivation()
-        drop_rate = 0.0 if i == len(fc_dims) - 1 else drop_rates_fc[i + 1]
-        fc_output = fc_layer(
-            name="fc%d" % (i + 1),
-            input=input_layer_name,
-            size=fc_dims[i],
-            act=active_type,
-            layer_attr=get_extra_layer_attr(drop_rate))
-    bn = batch_norm_layer(
-        name="fc_bn",
-        input=fc_output,
-        num_channels=fc_dims[len(fc_dims) - 1],
-        act=ReluActivation(),
-        layer_attr=get_extra_layer_attr(drop_rate=drop_rates_fc[-1]))
-    output = fc_layer(
-        name="output", input=bn, size=num_classes, act=SoftmaxActivation())
-    if is_predict:
-        outputs(output)
-    else:
-        cost = classification_cost(name="cost", input=output, label=label_input)
-        outputs(cost)
-
-
-def vgg16_conv_net(image_size, num_classes, is_color=True, is_predict=False):
-    """
-    A Wrapper for a 16 layers VGG network for image classification.
-    The detailed architecture of the paper can be found here:
-      Very Deep Convolutional Networks for Large-Scale Visual Recognition
-      http://www.robots.ox.ac.uk/~vgg/research/very_deep/
-    image_size: image size.
-    num_classes: num of classes.
-    is_color: whether the input images are color.
-    """
-    vgg_conv_net(image_size, num_classes,
-                 num_layers=[2, 2, 3, 3, 3],
-                 channels=[64, 128, 256, 512, 512],
-                 strides=[[], [], [], [], []],
-                 with_bn=[[False, True], [False, True], [False, False, True], \
-                          [False, False, True], [False, False, True]],
-                 drop_rates=[[]] * 5,
-                 drop_rates_fc=[0.0, 0.5, 0.5],
-                 fc_dims=[4096, 4096],
-                 is_predict=is_predict)
-
-
-def small_vgg(data_conf, is_predict=False):
-    """
-    A Wrapper for a small VGG network for CIFAR-10 image classification.
-    The detailed architecture of the paper can be found here:
-      92.45% on CIFAR-10 in Torch
-      http://torch.ch/blog/2015/07/30/cifar.html
-    Due to the constraints of CuDNN, it only has four convolutional groups
-    rather than five.
-    Thus, it only achieves 91.2% test accuracy and 98.1% training accuracy.
-    data_conf is a dictionary with the following keys:
-        image_size: image size.
-        num_classes: num of classes.
-        is_color: whether the input images are color.
-    """
-    for k, v in six.iteritems(data_conf):
-        globals()[k] = v
-    vgg_conv_net(image_size, num_classes,
-                 num_layers=[2, 2, 3, 3],
-                 channels=[64, 128, 256, 512],
-                 strides=[[], [], [], []],
-                 with_bn=[[True, True], [True, True], [True, True, True], \
-                          [True, True, True]],
-                 drop_rates=[[0.3, 0.0], [0.4, 0.0],
-                             [0.4, 0.4, 0.0], [0.4, 0.4, 0.0]],
-                 drop_rates_fc=[0.5, 0.5],
-                 fc_dims=[512],
-                 is_predict=is_predict)
-
-
-def training_settings(learning_rate=0.1,
-                      batch_size=128,
-                      algorithm="sgd",
-                      momentum=0.9,
-                      decay_rate=0.001):
-    """
-    Training settings.
-    learning_rate: learning rate of the training.
-    batch_size: the size of each training batch.
-    algorithm: training algorithm, can be
-       - sgd
-       - adagrad
-       - adadelta
-       - rmsprop
-    momentum: momentum of the training algorithm.
-    decay_rate: weight decay rate.
-    """
-    Settings(
-        algorithm=algorithm,
-        batch_size=batch_size,
-        learning_rate=learning_rate / float(batch_size))
-    default_momentum(momentum)
-    default_decay_rate(decay_rate * batch_size)
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -11,3 +11,4 @@ graphviz
 six
 funcsigs
 pyyaml
+decorator